diff options
Diffstat (limited to 'prism')
62 files changed, 48733 insertions, 0 deletions
diff --git a/prism/api_pack.c b/prism/api_pack.c new file mode 100644 index 0000000000..98509ae65c --- /dev/null +++ b/prism/api_pack.c @@ -0,0 +1,276 @@ +#include "prism/extension.h" + +#ifdef PRISM_EXCLUDE_PACK + +void +Init_prism_pack(void) {} + +#else + +static VALUE rb_cPrism; +static VALUE rb_cPrismPack; +static VALUE rb_cPrismPackDirective; +static VALUE rb_cPrismPackFormat; + +static VALUE v3_2_0_symbol; +static VALUE pack_symbol; +static VALUE unpack_symbol; + +#if SIZEOF_UINT64_T == SIZEOF_LONG_LONG +# define UINT64T2NUM(x) ULL2NUM(x) +# define NUM2UINT64T(x) (uint64_t)NUM2ULL(x) +#elif SIZEOF_UINT64_T == SIZEOF_LONG +# define UINT64T2NUM(x) ULONG2NUM(x) +# define NUM2UINT64T(x) (uint64_t)NUM2ULONG(x) +#else +// error No uint64_t conversion +#endif + +static VALUE +pack_type_to_symbol(pm_pack_type type) { + switch (type) { + case PM_PACK_SPACE: + return ID2SYM(rb_intern("SPACE")); + case PM_PACK_COMMENT: + return ID2SYM(rb_intern("COMMENT")); + case PM_PACK_INTEGER: + return ID2SYM(rb_intern("INTEGER")); + case PM_PACK_UTF8: + return ID2SYM(rb_intern("UTF8")); + case PM_PACK_BER: + return ID2SYM(rb_intern("BER")); + case PM_PACK_FLOAT: + return ID2SYM(rb_intern("FLOAT")); + case PM_PACK_STRING_SPACE_PADDED: + return ID2SYM(rb_intern("STRING_SPACE_PADDED")); + case PM_PACK_STRING_NULL_PADDED: + return ID2SYM(rb_intern("STRING_NULL_PADDED")); + case PM_PACK_STRING_NULL_TERMINATED: + return ID2SYM(rb_intern("STRING_NULL_TERMINATED")); + case PM_PACK_STRING_MSB: + return ID2SYM(rb_intern("STRING_MSB")); + case PM_PACK_STRING_LSB: + return ID2SYM(rb_intern("STRING_LSB")); + case PM_PACK_STRING_HEX_HIGH: + return ID2SYM(rb_intern("STRING_HEX_HIGH")); + case PM_PACK_STRING_HEX_LOW: + return ID2SYM(rb_intern("STRING_HEX_LOW")); + case PM_PACK_STRING_UU: + return ID2SYM(rb_intern("STRING_UU")); + case PM_PACK_STRING_MIME: + return ID2SYM(rb_intern("STRING_MIME")); + case PM_PACK_STRING_BASE64: + return ID2SYM(rb_intern("STRING_BASE64")); + case 
PM_PACK_STRING_FIXED: + return ID2SYM(rb_intern("STRING_FIXED")); + case PM_PACK_STRING_POINTER: + return ID2SYM(rb_intern("STRING_POINTER")); + case PM_PACK_MOVE: + return ID2SYM(rb_intern("MOVE")); + case PM_PACK_BACK: + return ID2SYM(rb_intern("BACK")); + case PM_PACK_NULL: + return ID2SYM(rb_intern("NULL")); + default: + return Qnil; + } +} + +static VALUE +pack_signed_to_symbol(pm_pack_signed signed_type) { + switch (signed_type) { + case PM_PACK_UNSIGNED: + return ID2SYM(rb_intern("UNSIGNED")); + case PM_PACK_SIGNED: + return ID2SYM(rb_intern("SIGNED")); + case PM_PACK_SIGNED_NA: + return ID2SYM(rb_intern("SIGNED_NA")); + default: + return Qnil; + } +} + +static VALUE +pack_endian_to_symbol(pm_pack_endian endian) { + switch (endian) { + case PM_PACK_AGNOSTIC_ENDIAN: + return ID2SYM(rb_intern("AGNOSTIC_ENDIAN")); + case PM_PACK_LITTLE_ENDIAN: + return ID2SYM(rb_intern("LITTLE_ENDIAN")); + case PM_PACK_BIG_ENDIAN: + return ID2SYM(rb_intern("BIG_ENDIAN")); + case PM_PACK_NATIVE_ENDIAN: + return ID2SYM(rb_intern("NATIVE_ENDIAN")); + case PM_PACK_ENDIAN_NA: + return ID2SYM(rb_intern("ENDIAN_NA")); + default: + return Qnil; + } +} + +static VALUE +pack_size_to_symbol(pm_pack_size size) { + switch (size) { + case PM_PACK_SIZE_SHORT: + return ID2SYM(rb_intern("SIZE_SHORT")); + case PM_PACK_SIZE_INT: + return ID2SYM(rb_intern("SIZE_INT")); + case PM_PACK_SIZE_LONG: + return ID2SYM(rb_intern("SIZE_LONG")); + case PM_PACK_SIZE_LONG_LONG: + return ID2SYM(rb_intern("SIZE_LONG_LONG")); + case PM_PACK_SIZE_8: + return ID2SYM(rb_intern("SIZE_8")); + case PM_PACK_SIZE_16: + return ID2SYM(rb_intern("SIZE_16")); + case PM_PACK_SIZE_32: + return ID2SYM(rb_intern("SIZE_32")); + case PM_PACK_SIZE_64: + return ID2SYM(rb_intern("SIZE_64")); + case PM_PACK_SIZE_P: + return ID2SYM(rb_intern("SIZE_P")); + case PM_PACK_SIZE_NA: + return ID2SYM(rb_intern("SIZE_NA")); + default: + return Qnil; + } +} + +static VALUE +pack_length_type_to_symbol(pm_pack_length_type length_type) { + switch 
(length_type) { + case PM_PACK_LENGTH_FIXED: + return ID2SYM(rb_intern("LENGTH_FIXED")); + case PM_PACK_LENGTH_MAX: + return ID2SYM(rb_intern("LENGTH_MAX")); + case PM_PACK_LENGTH_RELATIVE: + return ID2SYM(rb_intern("LENGTH_RELATIVE")); + case PM_PACK_LENGTH_NA: + return ID2SYM(rb_intern("LENGTH_NA")); + default: + return Qnil; + } +} + +static VALUE +pack_encoding_to_ruby(pm_pack_encoding encoding) { + int index; + switch (encoding) { + case PM_PACK_ENCODING_ASCII_8BIT: + index = rb_ascii8bit_encindex(); + break; + case PM_PACK_ENCODING_US_ASCII: + index = rb_usascii_encindex(); + break; + case PM_PACK_ENCODING_UTF_8: + index = rb_utf8_encindex(); + break; + default: + return Qnil; + } + return rb_enc_from_encoding(rb_enc_from_index(index)); +} + +/** + * call-seq: + * Pack::parse(version, variant, source) -> Format + * + * Parse the given source and return a format object. + */ +static VALUE +pack_parse(VALUE self, VALUE version_symbol, VALUE variant_symbol, VALUE format_string) { + if (version_symbol != v3_2_0_symbol) { + rb_raise(rb_eArgError, "invalid version"); + } + + pm_pack_variant variant; + if (variant_symbol == pack_symbol) { + variant = PM_PACK_VARIANT_PACK; + } else if (variant_symbol == unpack_symbol) { + variant = PM_PACK_VARIANT_UNPACK; + } else { + rb_raise(rb_eArgError, "invalid variant"); + } + + StringValue(format_string); + + const char *format = RSTRING_PTR(format_string); + const char *format_end = format + RSTRING_LEN(format_string); + pm_pack_encoding encoding = PM_PACK_ENCODING_START; + + VALUE directives_array = rb_ary_new(); + + while (format < format_end) { + pm_pack_type type; + pm_pack_signed signed_type; + pm_pack_endian endian; + pm_pack_size size; + pm_pack_length_type length_type; + uint64_t length; + + const char *directive_start = format; + + pm_pack_result parse_result = pm_pack_parse(variant, &format, format_end, &type, &signed_type, &endian, + &size, &length_type, &length, &encoding); + + const char *directive_end = format; 
+ + switch (parse_result) { + case PM_PACK_OK: + break; + case PM_PACK_ERROR_UNSUPPORTED_DIRECTIVE: + rb_raise(rb_eArgError, "unsupported directive"); + case PM_PACK_ERROR_UNKNOWN_DIRECTIVE: + rb_raise(rb_eArgError, "unsupported directive"); + case PM_PACK_ERROR_LENGTH_TOO_BIG: + rb_raise(rb_eRangeError, "pack length too big"); + case PM_PACK_ERROR_BANG_NOT_ALLOWED: + rb_raise(rb_eRangeError, "bang not allowed"); + case PM_PACK_ERROR_DOUBLE_ENDIAN: + rb_raise(rb_eRangeError, "double endian"); + default: + rb_bug("parse result"); + } + + if (type == PM_PACK_END) { + break; + } + + VALUE directive_args[9] = { + version_symbol, + variant_symbol, + rb_usascii_str_new(directive_start, directive_end - directive_start), + pack_type_to_symbol(type), + pack_signed_to_symbol(signed_type), + pack_endian_to_symbol(endian), + pack_size_to_symbol(size), + pack_length_type_to_symbol(length_type), + UINT64T2NUM(length) + }; + + rb_ary_push(directives_array, rb_class_new_instance(9, directive_args, rb_cPrismPackDirective)); + } + + VALUE format_args[2]; + format_args[0] = directives_array; + format_args[1] = pack_encoding_to_ruby(encoding); + return rb_class_new_instance(2, format_args, rb_cPrismPackFormat); +} + +/** + * The function that gets called when Ruby initializes the prism extension. 
+ */ +void +Init_prism_pack(void) { + rb_cPrism = rb_define_module("Prism"); + rb_cPrismPack = rb_define_module_under(rb_cPrism, "Pack"); + rb_cPrismPackDirective = rb_define_class_under(rb_cPrismPack, "Directive", rb_cObject); + rb_cPrismPackFormat = rb_define_class_under(rb_cPrismPack, "Format", rb_cObject); + rb_define_singleton_method(rb_cPrismPack, "parse", pack_parse, 3); + + v3_2_0_symbol = ID2SYM(rb_intern("v3_2_0")); + pack_symbol = ID2SYM(rb_intern("pack")); + unpack_symbol = ID2SYM(rb_intern("unpack")); +} + +#endif diff --git a/prism/config.yml b/prism/config.yml new file mode 100644 index 0000000000..4e5b077a35 --- /dev/null +++ b/prism/config.yml @@ -0,0 +1,4739 @@ +errors: + - ALIAS_ARGUMENT + - ALIAS_ARGUMENT_NUMBERED_REFERENCE + - AMPAMPEQ_MULTI_ASSIGN + - ARGUMENT_AFTER_BLOCK + - ARGUMENT_AFTER_FORWARDING_ELLIPSES + - ARGUMENT_BARE_HASH + - ARGUMENT_BLOCK_FORWARDING + - ARGUMENT_BLOCK_MULTI + - ARGUMENT_CONFLICT_AMPERSAND + - ARGUMENT_CONFLICT_STAR + - ARGUMENT_CONFLICT_STAR_STAR + - ARGUMENT_FORMAL_CLASS + - ARGUMENT_FORMAL_CONSTANT + - ARGUMENT_FORMAL_GLOBAL + - ARGUMENT_FORMAL_IVAR + - ARGUMENT_FORWARDING_UNBOUND + - ARGUMENT_NO_FORWARDING_AMPERSAND + - ARGUMENT_NO_FORWARDING_ELLIPSES + - ARGUMENT_NO_FORWARDING_STAR + - ARGUMENT_NO_FORWARDING_STAR_STAR + - ARGUMENT_SPLAT_AFTER_ASSOC_SPLAT + - ARGUMENT_SPLAT_AFTER_SPLAT + - ARGUMENT_TERM_PAREN + - ARGUMENT_UNEXPECTED_BLOCK + - ARRAY_ELEMENT + - ARRAY_EXPRESSION + - ARRAY_EXPRESSION_AFTER_STAR + - ARRAY_SEPARATOR + - ARRAY_TERM + - BEGIN_LONELY_ELSE + - BEGIN_TERM + - BEGIN_UPCASE_BRACE + - BEGIN_UPCASE_TERM + - BEGIN_UPCASE_TOPLEVEL + - BLOCK_PARAM_LOCAL_VARIABLE + - BLOCK_PARAM_PIPE_TERM + - BLOCK_TERM_BRACE + - BLOCK_TERM_END + - CANNOT_PARSE_EXPRESSION + - CANNOT_PARSE_STRING_PART + - CASE_EXPRESSION_AFTER_CASE + - CASE_EXPRESSION_AFTER_WHEN + - CASE_MATCH_MISSING_PREDICATE + - CASE_MISSING_CONDITIONS + - CASE_TERM + - CLASS_IN_METHOD + - CLASS_NAME + - CLASS_SUPERCLASS + - CLASS_TERM + - 
CLASS_UNEXPECTED_END + - CLASS_VARIABLE_BARE + - CONDITIONAL_ELSIF_PREDICATE + - CONDITIONAL_IF_PREDICATE + - CONDITIONAL_PREDICATE_TERM + - CONDITIONAL_TERM + - CONDITIONAL_TERM_ELSE + - CONDITIONAL_UNLESS_PREDICATE + - CONDITIONAL_UNTIL_PREDICATE + - CONDITIONAL_WHILE_PREDICATE + - CONSTANT_PATH_COLON_COLON_CONSTANT + - DEF_ENDLESS + - DEF_ENDLESS_PARAMETERS + - DEF_ENDLESS_SETTER + - DEF_NAME + - DEF_PARAMS_TERM + - DEF_PARAMS_TERM_PAREN + - DEF_RECEIVER + - DEF_RECEIVER_TERM + - DEF_TERM + - DEFINED_EXPRESSION + - EMBDOC_TERM + - EMBEXPR_END + - EMBVAR_INVALID + - END_UPCASE_BRACE + - END_UPCASE_TERM + - ESCAPE_INVALID_CONTROL + - ESCAPE_INVALID_CONTROL_REPEAT + - ESCAPE_INVALID_HEXADECIMAL + - ESCAPE_INVALID_META + - ESCAPE_INVALID_META_REPEAT + - ESCAPE_INVALID_UNICODE + - ESCAPE_INVALID_UNICODE_CM_FLAGS + - ESCAPE_INVALID_UNICODE_LIST + - ESCAPE_INVALID_UNICODE_LITERAL + - ESCAPE_INVALID_UNICODE_LONG + - ESCAPE_INVALID_UNICODE_SHORT + - ESCAPE_INVALID_UNICODE_TERM + - EXPECT_ARGUMENT + - EXPECT_EOL_AFTER_STATEMENT + - EXPECT_EXPRESSION_AFTER_AMPAMPEQ + - EXPECT_EXPRESSION_AFTER_COMMA + - EXPECT_EXPRESSION_AFTER_EQUAL + - EXPECT_EXPRESSION_AFTER_LESS_LESS + - EXPECT_EXPRESSION_AFTER_LPAREN + - EXPECT_EXPRESSION_AFTER_OPERATOR + - EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ + - EXPECT_EXPRESSION_AFTER_QUESTION + - EXPECT_EXPRESSION_AFTER_SPLAT + - EXPECT_EXPRESSION_AFTER_SPLAT_HASH + - EXPECT_EXPRESSION_AFTER_STAR + - EXPECT_FOR_DELIMITER + - EXPECT_IDENT_REQ_PARAMETER + - EXPECT_IN_DELIMITER + - EXPECT_LPAREN_AFTER_NOT_LPAREN + - EXPECT_LPAREN_AFTER_NOT_OTHER + - EXPECT_LPAREN_REQ_PARAMETER + - EXPECT_MESSAGE + - EXPECT_RBRACKET + - EXPECT_RPAREN + - EXPECT_RPAREN_AFTER_MULTI + - EXPECT_RPAREN_REQ_PARAMETER + - EXPECT_SINGLETON_CLASS_DELIMITER + - EXPECT_STRING_CONTENT + - EXPECT_WHEN_DELIMITER + - EXPRESSION_BARE_HASH + - EXPRESSION_NOT_WRITABLE + - EXPRESSION_NOT_WRITABLE_ENCODING + - EXPRESSION_NOT_WRITABLE_FALSE + - EXPRESSION_NOT_WRITABLE_FILE + - 
EXPRESSION_NOT_WRITABLE_LINE + - EXPRESSION_NOT_WRITABLE_NIL + - EXPRESSION_NOT_WRITABLE_NUMBERED + - EXPRESSION_NOT_WRITABLE_SELF + - EXPRESSION_NOT_WRITABLE_TRUE + - FLOAT_PARSE + - FOR_COLLECTION + - FOR_IN + - FOR_INDEX + - FOR_TERM + - GLOBAL_VARIABLE_BARE + - HASH_EXPRESSION_AFTER_LABEL + - HASH_KEY + - HASH_ROCKET + - HASH_TERM + - HASH_VALUE + - HEREDOC_IDENTIFIER + - HEREDOC_TERM + - INCOMPLETE_QUESTION_MARK + - INCOMPLETE_VARIABLE_CLASS + - INCOMPLETE_VARIABLE_CLASS_3_3 + - INCOMPLETE_VARIABLE_INSTANCE + - INCOMPLETE_VARIABLE_INSTANCE_3_3 + - INSTANCE_VARIABLE_BARE + - INVALID_BLOCK_EXIT + - INVALID_CHARACTER + - INVALID_COMMA + - INVALID_ENCODING_MAGIC_COMMENT + - INVALID_ESCAPE_CHARACTER + - INVALID_FLOAT_EXPONENT + - INVALID_LOCAL_VARIABLE_READ + - INVALID_LOCAL_VARIABLE_WRITE + - INVALID_MULTIBYTE_CHAR + - INVALID_MULTIBYTE_CHARACTER + - INVALID_MULTIBYTE_ESCAPE + - INVALID_NUMBER_BINARY + - INVALID_NUMBER_DECIMAL + - INVALID_NUMBER_FRACTION + - INVALID_NUMBER_HEXADECIMAL + - INVALID_NUMBER_OCTAL + - INVALID_NUMBER_UNDERSCORE_INNER + - INVALID_NUMBER_UNDERSCORE_TRAILING + - INVALID_PERCENT + - INVALID_PERCENT_EOF + - INVALID_PRINTABLE_CHARACTER + - INVALID_RETRY_AFTER_ELSE + - INVALID_RETRY_AFTER_ENSURE + - INVALID_RETRY_WITHOUT_RESCUE + - INVALID_SYMBOL + - INVALID_VARIABLE_GLOBAL + - INVALID_VARIABLE_GLOBAL_3_3 + - INVALID_YIELD + - IT_NOT_ALLOWED_NUMBERED + - IT_NOT_ALLOWED_ORDINARY + - LAMBDA_OPEN + - LAMBDA_TERM_BRACE + - LAMBDA_TERM_END + - LIST_I_LOWER_ELEMENT + - LIST_I_LOWER_TERM + - LIST_I_UPPER_ELEMENT + - LIST_I_UPPER_TERM + - LIST_W_LOWER_ELEMENT + - LIST_W_LOWER_TERM + - LIST_W_UPPER_ELEMENT + - LIST_W_UPPER_TERM + - MALLOC_FAILED + - MIXED_ENCODING + - MODULE_IN_METHOD + - MODULE_NAME + - MODULE_TERM + - MULTI_ASSIGN_MULTI_SPLATS + - MULTI_ASSIGN_UNEXPECTED_REST + - NESTING_TOO_DEEP + - NO_LOCAL_VARIABLE + - NON_ASSOCIATIVE_OPERATOR + - NOT_EXPRESSION + - NUMBER_LITERAL_UNDERSCORE + - NUMBERED_PARAMETER_INNER_BLOCK + - 
NUMBERED_PARAMETER_IT + - NUMBERED_PARAMETER_ORDINARY + - NUMBERED_PARAMETER_OUTER_BLOCK + - OPERATOR_MULTI_ASSIGN + - OPERATOR_WRITE_ARGUMENTS + - OPERATOR_WRITE_BLOCK + - PARAMETER_ASSOC_SPLAT_MULTI + - PARAMETER_BLOCK_MULTI + - PARAMETER_CIRCULAR + - PARAMETER_FORWARDING_AFTER_REST + - PARAMETER_METHOD_NAME + - PARAMETER_NAME_DUPLICATED + - PARAMETER_NO_DEFAULT + - PARAMETER_NO_DEFAULT_KW + - PARAMETER_NUMBERED_RESERVED + - PARAMETER_ORDER + - PARAMETER_SPLAT_MULTI + - PARAMETER_STAR + - PARAMETER_UNEXPECTED_FWD + - PARAMETER_UNEXPECTED_NO_KW + - PARAMETER_WILD_LOOSE_COMMA + - PATTERN_ARRAY_MULTIPLE_RESTS + - PATTERN_CAPTURE_DUPLICATE + - PATTERN_CAPTURE_IN_ALTERNATIVE + - PATTERN_EXPRESSION_AFTER_BRACKET + - PATTERN_EXPRESSION_AFTER_COMMA + - PATTERN_EXPRESSION_AFTER_HROCKET + - PATTERN_EXPRESSION_AFTER_IN + - PATTERN_EXPRESSION_AFTER_KEY + - PATTERN_EXPRESSION_AFTER_PAREN + - PATTERN_EXPRESSION_AFTER_PIN + - PATTERN_EXPRESSION_AFTER_PIPE + - PATTERN_EXPRESSION_AFTER_RANGE + - PATTERN_EXPRESSION_AFTER_REST + - PATTERN_FIND_MISSING_INNER + - PATTERN_HASH_IMPLICIT + - PATTERN_HASH_KEY + - PATTERN_HASH_KEY_DUPLICATE + - PATTERN_HASH_KEY_INTERPOLATED + - PATTERN_HASH_KEY_LABEL + - PATTERN_HASH_KEY_LOCALS + - PATTERN_IDENT_AFTER_HROCKET + - PATTERN_LABEL_AFTER_COMMA + - PATTERN_REST + - PATTERN_TERM_BRACE + - PATTERN_TERM_BRACKET + - PATTERN_TERM_PAREN + - PIPEPIPEEQ_MULTI_ASSIGN + - REGEXP_ENCODING_OPTION_MISMATCH + - REGEXP_INCOMPAT_CHAR_ENCODING + - REGEXP_INVALID_UNICODE_RANGE + - REGEXP_NON_ESCAPED_MBC + - REGEXP_PARSE_ERROR + - REGEXP_TERM + - REGEXP_UNKNOWN_OPTIONS + - REGEXP_UTF8_CHAR_NON_UTF8_REGEXP + - RESCUE_EXPRESSION + - RESCUE_MODIFIER_VALUE + - RESCUE_TERM + - RESCUE_VARIABLE + - RETURN_INVALID + - SCRIPT_NOT_FOUND + - SINGLETON_FOR_LITERALS + - STATEMENT_ALIAS + - STATEMENT_POSTEXE_END + - STATEMENT_PREEXE_BEGIN + - STATEMENT_UNDEF + - STRING_CONCATENATION + - STRING_INTERPOLATED_TERM + - STRING_LITERAL_EOF + - STRING_LITERAL_TERM + - SYMBOL_INVALID 
+ - SYMBOL_TERM_DYNAMIC + - SYMBOL_TERM_INTERPOLATED + - TERNARY_COLON + - TERNARY_EXPRESSION_FALSE + - TERNARY_EXPRESSION_TRUE + - UNARY_DISALLOWED + - UNARY_RECEIVER + - UNDEF_ARGUMENT + - UNEXPECTED_BLOCK_ARGUMENT + - UNEXPECTED_INDEX_BLOCK + - UNEXPECTED_INDEX_KEYWORDS + - UNEXPECTED_LABEL + - UNEXPECTED_MULTI_WRITE + - UNEXPECTED_PARAMETER_DEFAULT_VALUE + - UNEXPECTED_RANGE_OPERATOR + - UNEXPECTED_SAFE_NAVIGATION + - UNEXPECTED_TOKEN_CLOSE_CONTEXT + - UNEXPECTED_TOKEN_IGNORE + - UNTIL_TERM + - VOID_EXPRESSION + - WHILE_TERM + - WRITE_TARGET_IN_METHOD + - WRITE_TARGET_READONLY + - WRITE_TARGET_UNEXPECTED + - XSTRING_TERM +warnings: + - AMBIGUOUS_BINARY_OPERATOR + - AMBIGUOUS_FIRST_ARGUMENT_MINUS + - AMBIGUOUS_FIRST_ARGUMENT_PLUS + - AMBIGUOUS_PREFIX_AMPERSAND + - AMBIGUOUS_PREFIX_STAR + - AMBIGUOUS_PREFIX_STAR_STAR + - AMBIGUOUS_SLASH + - COMPARISON_AFTER_COMPARISON + - DOT_DOT_DOT_EOL + - EQUAL_IN_CONDITIONAL + - EQUAL_IN_CONDITIONAL_3_3 + - END_IN_METHOD + - DUPLICATED_HASH_KEY + - DUPLICATED_WHEN_CLAUSE + - FLOAT_OUT_OF_RANGE + - IGNORED_FROZEN_STRING_LITERAL + - INDENTATION_MISMATCH + - INTEGER_IN_FLIP_FLOP + - INVALID_CHARACTER + - INVALID_MAGIC_COMMENT_VALUE + - INVALID_NUMBERED_REFERENCE + - KEYWORD_EOL + - LITERAL_IN_CONDITION_DEFAULT + - LITERAL_IN_CONDITION_VERBOSE + - SHAREABLE_CONSTANT_VALUE_LINE + - SHEBANG_CARRIAGE_RETURN + - UNEXPECTED_CARRIAGE_RETURN + - UNREACHABLE_STATEMENT + - UNUSED_LOCAL_VARIABLE + - VOID_STATEMENT +tokens: + # The order of the tokens at the beginning is important, because we use them + # for a lookup table. 
+ - name: EOF + value: 1 + comment: final token in the file + - name: BRACE_RIGHT + comment: "}" + - name: COMMA + comment: "," + - name: EMBEXPR_END + comment: "}" + - name: KEYWORD_DO + comment: "do" + - name: KEYWORD_ELSE + comment: "else" + - name: KEYWORD_ELSIF + comment: "elsif" + - name: KEYWORD_END + comment: "end" + - name: KEYWORD_ENSURE + comment: "ensure" + - name: KEYWORD_IN + comment: "in" + - name: KEYWORD_RESCUE + comment: "rescue" + - name: KEYWORD_THEN + comment: "then" + - name: KEYWORD_WHEN + comment: "when" + - name: NEWLINE + comment: "a newline character outside of other tokens" + - name: PARENTHESIS_RIGHT + comment: ")" + - name: PIPE + comment: "|" + - name: SEMICOLON + comment: ";" + # Tokens from here on are not used for lookup, and can be in any order. + - name: AMPERSAND + comment: "&" + - name: AMPERSAND_AMPERSAND + comment: "&&" + - name: AMPERSAND_AMPERSAND_EQUAL + comment: "&&=" + - name: AMPERSAND_DOT + comment: "&." + - name: AMPERSAND_EQUAL + comment: "&=" + - name: BACKTICK + comment: "`" + - name: BACK_REFERENCE + comment: "a back reference" + - name: BANG + comment: "! or !@" + - name: BANG_EQUAL + comment: "!=" + - name: BANG_TILDE + comment: "!~" + - name: BRACE_LEFT + comment: "{" + - name: BRACKET_LEFT + comment: "[" + - name: BRACKET_LEFT_ARRAY + comment: "[ for the beginning of an array" + - name: BRACKET_LEFT_RIGHT + comment: "[]" + - name: BRACKET_LEFT_RIGHT_EQUAL + comment: "[]=" + - name: BRACKET_RIGHT + comment: "]" + - name: CARET + comment: "^" + - name: CARET_EQUAL + comment: "^=" + - name: CHARACTER_LITERAL + comment: "a character literal" + - name: CLASS_VARIABLE + comment: "a class variable" + - name: COLON + comment: ":" + - name: COLON_COLON + comment: "::" + - name: COMMENT + comment: "a comment" + - name: CONSTANT + comment: "a constant" + - name: DOT + comment: "the . call operator" + - name: DOT_DOT + comment: "the .. range operator" + - name: DOT_DOT_DOT + comment: "the ... 
range operator or forwarding parameter" + - name: EMBDOC_BEGIN + comment: "=begin" + - name: EMBDOC_END + comment: "=end" + - name: EMBDOC_LINE + comment: "a line inside of embedded documentation" + - name: EMBEXPR_BEGIN + comment: "#{" + - name: EMBVAR + comment: "#" + - name: EQUAL + comment: "=" + - name: EQUAL_EQUAL + comment: "==" + - name: EQUAL_EQUAL_EQUAL + comment: "===" + - name: EQUAL_GREATER + comment: "=>" + - name: EQUAL_TILDE + comment: "=~" + - name: FLOAT + comment: "a floating point number" + - name: FLOAT_IMAGINARY + comment: "a floating point number with an imaginary suffix" + - name: FLOAT_RATIONAL + comment: "a floating point number with a rational suffix" + - name: FLOAT_RATIONAL_IMAGINARY + comment: "a floating point number with a rational and imaginary suffix" + - name: GLOBAL_VARIABLE + comment: "a global variable" + - name: GREATER + comment: ">" + - name: GREATER_EQUAL + comment: ">=" + - name: GREATER_GREATER + comment: ">>" + - name: GREATER_GREATER_EQUAL + comment: ">>=" + - name: HEREDOC_END + comment: "the end of a heredoc" + - name: HEREDOC_START + comment: "the start of a heredoc" + - name: IDENTIFIER + comment: "an identifier" + - name: IGNORED_NEWLINE + comment: "an ignored newline" + - name: INSTANCE_VARIABLE + comment: "an instance variable" + - name: INTEGER + comment: "an integer (any base)" + - name: INTEGER_IMAGINARY + comment: "an integer with an imaginary suffix" + - name: INTEGER_RATIONAL + comment: "an integer with a rational suffix" + - name: INTEGER_RATIONAL_IMAGINARY + comment: "an integer with a rational and imaginary suffix" + - name: KEYWORD_ALIAS + comment: "alias" + - name: KEYWORD_AND + comment: "and" + - name: KEYWORD_BEGIN + comment: "begin" + - name: KEYWORD_BEGIN_UPCASE + comment: "BEGIN" + - name: KEYWORD_BREAK + comment: "break" + - name: KEYWORD_CASE + comment: "case" + - name: KEYWORD_CLASS + comment: "class" + - name: KEYWORD_DEF + comment: "def" + - name: KEYWORD_DEFINED + comment: "defined?" 
+ - name: KEYWORD_DO_LOOP + comment: "do keyword for a predicate in a while, until, or for loop" + - name: KEYWORD_END_UPCASE + comment: "END" + - name: KEYWORD_FALSE + comment: "false" + - name: KEYWORD_FOR + comment: "for" + - name: KEYWORD_IF + comment: "if" + - name: KEYWORD_IF_MODIFIER + comment: "if in the modifier form" + - name: KEYWORD_MODULE + comment: "module" + - name: KEYWORD_NEXT + comment: "next" + - name: KEYWORD_NIL + comment: "nil" + - name: KEYWORD_NOT + comment: "not" + - name: KEYWORD_OR + comment: "or" + - name: KEYWORD_REDO + comment: "redo" + - name: KEYWORD_RESCUE_MODIFIER + comment: "rescue in the modifier form" + - name: KEYWORD_RETRY + comment: "retry" + - name: KEYWORD_RETURN + comment: "return" + - name: KEYWORD_SELF + comment: "self" + - name: KEYWORD_SUPER + comment: "super" + - name: KEYWORD_TRUE + comment: "true" + - name: KEYWORD_UNDEF + comment: "undef" + - name: KEYWORD_UNLESS + comment: "unless" + - name: KEYWORD_UNLESS_MODIFIER + comment: "unless in the modifier form" + - name: KEYWORD_UNTIL + comment: "until" + - name: KEYWORD_UNTIL_MODIFIER + comment: "until in the modifier form" + - name: KEYWORD_WHILE + comment: "while" + - name: KEYWORD_WHILE_MODIFIER + comment: "while in the modifier form" + - name: KEYWORD_YIELD + comment: "yield" + - name: KEYWORD___ENCODING__ + comment: "__ENCODING__" + - name: KEYWORD___FILE__ + comment: "__FILE__" + - name: KEYWORD___LINE__ + comment: "__LINE__" + - name: LABEL + comment: "a label" + - name: LABEL_END + comment: "the end of a label" + - name: LAMBDA_BEGIN + comment: "{" + - name: LESS + comment: "<" + - name: LESS_EQUAL + comment: "<=" + - name: LESS_EQUAL_GREATER + comment: "<=>" + - name: LESS_LESS + comment: "<<" + - name: LESS_LESS_EQUAL + comment: "<<=" + - name: METHOD_NAME + comment: "a method name" + - name: MINUS + comment: "-" + - name: MINUS_EQUAL + comment: "-=" + - name: MINUS_GREATER + comment: "->" + - name: NUMBERED_REFERENCE + comment: "a numbered reference to a 
capture group in the previous regular expression match" + - name: PARENTHESIS_LEFT + comment: "(" + - name: PARENTHESIS_LEFT_PARENTHESES + comment: "( for a parentheses node" + - name: PERCENT + comment: "%" + - name: PERCENT_EQUAL + comment: "%=" + - name: PERCENT_LOWER_I + comment: "%i" + - name: PERCENT_LOWER_W + comment: "%w" + - name: PERCENT_LOWER_X + comment: "%x" + - name: PERCENT_UPPER_I + comment: "%I" + - name: PERCENT_UPPER_W + comment: "%W" + - name: PIPE_EQUAL + comment: "|=" + - name: PIPE_PIPE + comment: "||" + - name: PIPE_PIPE_EQUAL + comment: "||=" + - name: PLUS + comment: "+" + - name: PLUS_EQUAL + comment: "+=" + - name: QUESTION_MARK + comment: "?" + - name: REGEXP_BEGIN + comment: "the beginning of a regular expression" + - name: REGEXP_END + comment: "the end of a regular expression" + - name: SLASH + comment: "/" + - name: SLASH_EQUAL + comment: "/=" + - name: STAR + comment: "*" + - name: STAR_EQUAL + comment: "*=" + - name: STAR_STAR + comment: "**" + - name: STAR_STAR_EQUAL + comment: "**=" + - name: STRING_BEGIN + comment: "the beginning of a string" + - name: STRING_CONTENT + comment: "the contents of a string" + - name: STRING_END + comment: "the end of a string" + - name: SYMBOL_BEGIN + comment: "the beginning of a symbol" + - name: TILDE + comment: "~ or ~@" + - name: UAMPERSAND + comment: "unary &" + - name: UCOLON_COLON + comment: "unary ::" + - name: UDOT_DOT + comment: "unary .. operator" + - name: UDOT_DOT_DOT + comment: "unary ... 
operator" + - name: UMINUS + comment: "-@" + - name: UMINUS_NUM + comment: "-@ for a number" + - name: UPLUS + comment: "+@" + - name: USTAR + comment: "unary *" + - name: USTAR_STAR + comment: "unary **" + - name: WORDS_SEP + comment: "a separator between words in a list" + - name: __END__ + comment: "marker for the point in the file at which the parser should stop" + - name: MISSING + comment: "a token that was expected but not found" + - name: NOT_PROVIDED + comment: "a token that was not present but it is okay" +flags: + - name: ArgumentsNodeFlags + values: + - name: CONTAINS_FORWARDING + comment: "if the arguments contain forwarding" + - name: CONTAINS_KEYWORDS + comment: "if the arguments contain keywords" + - name: CONTAINS_KEYWORD_SPLAT + comment: "if the arguments contain a keyword splat" + - name: CONTAINS_SPLAT + comment: "if the arguments contain a splat" + - name: CONTAINS_MULTIPLE_SPLATS + comment: "if the arguments contain multiple splats" + comment: Flags for arguments nodes. + - name: ArrayNodeFlags + values: + - name: CONTAINS_SPLAT + comment: "if array contains splat nodes" + comment: Flags for array nodes. + - name: CallNodeFlags + values: + - name: SAFE_NAVIGATION + comment: "&. operator" + - name: VARIABLE_CALL + comment: "a call that could have been a local variable" + - name: ATTRIBUTE_WRITE + comment: "a call that is an attribute write, so the value being written should be returned" + - name: IGNORE_VISIBILITY + comment: "a call that ignores method visibility" + comment: Flags for call nodes. + - name: EncodingFlags + values: + - name: FORCED_UTF8_ENCODING + comment: "internal bytes forced the encoding to UTF-8" + - name: FORCED_BINARY_ENCODING + comment: "internal bytes forced the encoding to binary" + comment: Flags for nodes that have unescaped content. 
+ - name: IntegerBaseFlags + values: + - name: BINARY + comment: "0b prefix" + - name: DECIMAL + comment: "0d or no prefix" + - name: OCTAL + comment: "0o or 0 prefix" + - name: HEXADECIMAL + comment: "0x prefix" + comment: Flags for integer nodes that correspond to the base of the integer. + - name: InterpolatedStringNodeFlags + values: + - name: FROZEN + comment: "frozen by virtue of a `frozen_string_literal: true` comment or `--enable-frozen-string-literal`; only for adjacent string literals like `'a' 'b'`" + - name: MUTABLE + comment: "mutable by virtue of a `frozen_string_literal: false` comment or `--disable-frozen-string-literal`; only for adjacent string literals like `'a' 'b'`" + comment: Flags for interpolated string nodes that indicate mutability if they are also marked as literals. + - name: KeywordHashNodeFlags + values: + - name: SYMBOL_KEYS + comment: "a keyword hash which only has `AssocNode` elements all with symbol keys, which means the elements can be treated as keyword arguments" + comment: Flags for keyword hash nodes. + - name: LoopFlags + values: + - name: BEGIN_MODIFIER + comment: "a loop after a begin statement, so the body is executed first before the condition" + comment: Flags for while and until loop nodes. + - name: ParameterFlags + values: + - name: REPEATED_PARAMETER + comment: "a parameter name that has been repeated in the method signature" + comment: Flags for parameter nodes. + - name: ParenthesesNodeFlags + values: + - name: MULTIPLE_STATEMENTS + comment: "parentheses that contain multiple potentially void statements" + comment: Flags for parentheses nodes. + - name: RangeFlags + values: + - name: EXCLUDE_END + comment: "... operator" + comment: Flags for range and flip-flop nodes. 
+ - name: RegularExpressionFlags + values: + - name: IGNORE_CASE + comment: "i - ignores the case of characters when matching" + - name: EXTENDED + comment: "x - ignores whitespace and allows comments in regular expressions" + - name: MULTI_LINE + comment: "m - allows $ to match the end of lines within strings" + - name: ONCE + comment: "o - only interpolates values into the regular expression once" + - name: EUC_JP + comment: "e - forces the EUC-JP encoding" + - name: ASCII_8BIT + comment: "n - forces the ASCII-8BIT encoding" + - name: WINDOWS_31J + comment: "s - forces the Windows-31J encoding" + - name: UTF_8 + comment: "u - forces the UTF-8 encoding" + - name: FORCED_UTF8_ENCODING + comment: "internal bytes forced the encoding to UTF-8" + - name: FORCED_BINARY_ENCODING + comment: "internal bytes forced the encoding to binary" + - name: FORCED_US_ASCII_ENCODING + comment: "internal bytes forced the encoding to US-ASCII" + comment: Flags for regular expression and match last line nodes. + - name: ShareableConstantNodeFlags + values: + - name: LITERAL + comment: "constant writes that should be modified with shareable constant value literal" + - name: EXPERIMENTAL_EVERYTHING + comment: "constant writes that should be modified with shareable constant value experimental everything" + - name: EXPERIMENTAL_COPY + comment: "constant writes that should be modified with shareable constant value experimental copy" + comment: Flags for shareable constant nodes. + - name: StringFlags + values: + - name: FORCED_UTF8_ENCODING + comment: "internal bytes forced the encoding to UTF-8" + - name: FORCED_BINARY_ENCODING + comment: "internal bytes forced the encoding to binary" + - name: FROZEN + comment: "frozen by virtue of a `frozen_string_literal: true` comment or `--enable-frozen-string-literal`" + - name: MUTABLE + comment: "mutable by virtue of a `frozen_string_literal: false` comment or `--disable-frozen-string-literal`" + comment: Flags for string nodes. 
+ - name: SymbolFlags + values: + - name: FORCED_UTF8_ENCODING + comment: "internal bytes forced the encoding to UTF-8" + - name: FORCED_BINARY_ENCODING + comment: "internal bytes forced the encoding to binary" + - name: FORCED_US_ASCII_ENCODING + comment: "internal bytes forced the encoding to US-ASCII" + comment: Flags for symbol nodes. +nodes: + - name: AliasGlobalVariableNode + fields: + - name: new_name + type: node + kind: + - GlobalVariableReadNode + - BackReferenceReadNode + - NumberedReferenceReadNode + comment: | + Represents the new name of the global variable that can be used after aliasing. + + alias $foo $bar + ^^^^ + - name: old_name + type: node + kind: + - GlobalVariableReadNode + - BackReferenceReadNode + - NumberedReferenceReadNode + - on error: SymbolNode # alias $a b + - on error: MissingNode # alias $a 42 + comment: | + Represents the old name of the global variable that can be used before aliasing. + + alias $foo $bar + ^^^^ + - name: keyword_loc + type: location + comment: | + The location of the `alias` keyword. + + alias $foo $bar + ^^^^^ + comment: | + Represents the use of the `alias` keyword to alias a global variable. + + alias $foo $bar + ^^^^^^^^^^^^^^^ + - name: AliasMethodNode + fields: + - name: new_name + type: node + kind: + - SymbolNode + - InterpolatedSymbolNode + comment: | + Represents the new name of the method that will be aliased. + + alias foo bar + ^^^ + + alias :foo :bar + ^^^^ + + alias :"#{foo}" :"#{bar}" + ^^^^^^^^^ + - name: old_name + type: node + kind: + - SymbolNode + - InterpolatedSymbolNode + - on error: GlobalVariableReadNode # alias a $b + - on error: MissingNode # alias a 42 + comment: | + Represents the old name of the method that will be aliased. + + alias foo bar + ^^^ + + alias :foo :bar + ^^^^ + + alias :"#{foo}" :"#{bar}" + ^^^^^^^^^ + - name: keyword_loc + type: location + comment: | + Represents the location of the `alias` keyword. 
+ + alias foo bar + ^^^^^ + comment: | + Represents the use of the `alias` keyword to alias a method. + + alias foo bar + ^^^^^^^^^^^^^ + - name: AlternationPatternNode + fields: + - name: left + type: node + kind: pattern expression + comment: | + Represents the left side of the expression. + + foo => bar | baz + ^^^ + - name: right + type: node + kind: pattern expression + comment: | + Represents the right side of the expression. + + foo => bar | baz + ^^^ + - name: operator_loc + type: location + comment: | + Represents the alternation operator location. + + foo => bar | baz + ^ + comment: | + Represents an alternation pattern in pattern matching. + + foo => bar | baz + ^^^^^^^^^ + - name: AndNode + fields: + - name: left + type: node + kind: non-void expression + comment: | + Represents the left side of the expression. It can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + left and right + ^^^^ + + 1 && 2 + ^ + - name: right + type: node + kind: Node + comment: | + Represents the right side of the expression. + + left && right + ^^^^^ + + 1 and 2 + ^ + - name: operator_loc + type: location + comment: | + The location of the `and` keyword or the `&&` operator. + + left and right + ^^^ + comment: | + Represents the use of the `&&` operator or the `and` keyword. + + left and right + ^^^^^^^^^^^^^^ + - name: ArgumentsNode + flags: ArgumentsNodeFlags + fields: + - name: arguments + type: node[] + kind: non-void expression + comment: | + The list of arguments, if present. These can be any [non-void expressions](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + foo(bar, baz) + ^^^^^^^^ + comment: | + Represents a set of arguments to a method or a keyword. 
+ + return foo, bar, baz + ^^^^^^^^^^^^^ + - name: ArrayNode + flags: ArrayNodeFlags + fields: + - name: elements + type: node[] + kind: non-void expression + comment: Represents the list of zero or more [non-void expressions](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression) within the array. + - name: opening_loc + type: location? + comment: | + Represents the optional source location for the opening token. + + [1,2,3] # "[" + %w[foo bar baz] # "%w[" + %I(apple orange banana) # "%I(" + foo = 1, 2, 3 # nil + - name: closing_loc + type: location? + comment: | + Represents the optional source location for the closing token. + + [1,2,3] # "]" + %w[foo bar baz] # "]" + %I(apple orange banana) # ")" + foo = 1, 2, 3 # nil + comment: | + Represents an array literal. This can be a regular array using brackets or a special array using % like %w or %i. + + [1, 2, 3] + ^^^^^^^^^ + - name: ArrayPatternNode + fields: + - name: constant + type: node? + kind: + - ConstantPathNode + - ConstantReadNode + comment: | + Represents the optional constant preceding the Array + + foo in Bar[] + ^^^ + + foo in Bar[1, 2, 3] + ^^^ + + foo in Bar::Baz[1, 2, 3] + ^^^^^^^^ + - name: requireds + type: node[] + kind: pattern expression + comment: | + Represents the required elements of the array pattern. + + foo in [1, 2] + ^ ^ + - name: rest + type: node? + kind: pattern expression + comment: | + Represents the rest element of the array pattern. + + foo in *bar + ^^^^ + - name: posts + type: node[] + kind: pattern expression + comment: | + Represents the elements after the rest element of the array pattern. + + foo in *bar, baz + ^^^ + - name: opening_loc + type: location? + comment: | + Represents the opening location of the array pattern. + + foo in [1, 2] + ^ + - name: closing_loc + type: location? + comment: | + Represents the closing location of the array pattern. + + foo in [1, 2] + ^ + comment: | + Represents an array pattern in pattern matching.
+ + foo in 1, 2 + ^^^^^^^^^^^ + + foo in [1, 2] + ^^^^^^^^^^^^^ + + foo in *bar + ^^^^^^^^^^^ + + foo in Bar[] + ^^^^^^^^^^^^ + + foo in Bar[1, 2, 3] + ^^^^^^^^^^^^^^^^^^^ + - name: AssocNode + fields: + - name: key + type: node + kind: non-void expression + comment: | + The key of the association. This can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + { a: b } + ^ + + { foo => bar } + ^^^ + + { def a; end => 1 } + ^^^^^^^^^^ + - name: value + type: node + kind: non-void expression + comment: | + The value of the association, if present. This can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + { foo => bar } + ^^^ + + { x: 1 } + ^ + - name: operator_loc + type: location? + comment: | + The location of the `=>` operator, if present. + + { foo => bar } + ^^ + comment: | + Represents a hash key/value pair. + + { a => b } + ^^^^^^ + - name: AssocSplatNode + fields: + - name: value + type: node? + kind: non-void expression + comment: | + The value to be splatted, if present. Will be missing when keyword rest argument forwarding is used. + + { **foo } + ^^^ + - name: operator_loc + type: location + comment: | + The location of the `**` operator. + + { **x } + ^^ + comment: | + Represents a splat in a hash literal. + + { **foo } + ^^^^^ + - name: BackReferenceReadNode + fields: + - name: name + type: constant + comment: | + The name of the back-reference variable, including the leading `$`. + + $& # name `:$&` + + $+ # name `:$+` + comment: | + Represents reading a reference to a field in the previous match. + + $' + ^^ + - name: BeginNode + fields: + - name: begin_keyword_loc + type: location? + comment: | + Represents the location of the `begin` keyword. + + begin x end + ^^^^^ + - name: statements + type: node? + kind: StatementsNode + comment: | + Represents the statements within the begin block. 
+ + begin x end + ^ + - name: rescue_clause + type: node? + kind: RescueNode + comment: | + Represents the rescue clause within the begin block. + + begin x; rescue y; end + ^^^^^^^^ + - name: else_clause + type: node? + kind: ElseNode + comment: | + Represents the else clause within the begin block. + + begin x; rescue y; else z; end + ^^^^^^ + - name: ensure_clause + type: node? + kind: EnsureNode + comment: | + Represents the ensure clause within the begin block. + + begin x; ensure y; end + ^^^^^^^^ + - name: end_keyword_loc + type: location? + comment: | + Represents the location of the `end` keyword. + + begin x end + ^^^ + newline: false + comment: | + Represents a begin statement. + + begin + foo + end + ^^^^^ + - name: BlockArgumentNode + fields: + - name: expression + type: node? + kind: non-void expression + comment: | + The expression that is being passed as a block argument. This can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + foo(&args) + ^^^^^ + - name: operator_loc + type: location + comment: | + Represents the location of the `&` operator. + + foo(&args) + ^ + comment: | + Represents a block argument using `&`. + + bar(&args) + ^^^^^^^^^^ + - name: BlockLocalVariableNode + flags: ParameterFlags + fields: + - name: name + type: constant + comment: | + The name of the block local variable. + + a { |; b| } # name `:b` + ^ + comment: | + Represents a block local variable. + + a { |; b| } + ^ + - name: BlockNode + fields: + - name: locals + type: constant[] + comment: | + The local variables declared in the block. + + [1, 2, 3].each { |i| puts x } # locals: [:i] + ^ + - name: parameters + type: node? + kind: + - BlockParametersNode + - NumberedParametersNode + - ItParametersNode + comment: | + The parameters of the block. + + [1, 2, 3].each { |i| puts x } + ^^^ + [1, 2, 3].each { puts _1 } + ^^^^^^^^^^^ + [1, 2, 3].each { puts it } + ^^^^^^^^^^^ + - name: body + type: node? 
+ kind: + - StatementsNode + - BeginNode + comment: | + The body of the block. + + [1, 2, 3].each { |i| puts x } + ^^^^^^ + - name: opening_loc + type: location + comment: | + Represents the location of the opening `{` or `do`. + + [1, 2, 3].each { |i| puts x } + ^ + - name: closing_loc + type: location + comment: | + Represents the location of the closing `}` or `end`. + + [1, 2, 3].each { |i| puts x } + ^ + comment: | + Represents a block of ruby code. + + [1, 2, 3].each { |i| puts x } + ^^^^^^^^^^^^^^ + - name: BlockParameterNode + flags: ParameterFlags + fields: + - name: name + type: constant? + comment: | + The name of the block parameter. + + def a(&b) # name `:b` + ^ + end + - name: name_loc + type: location? + comment: | + Represents the location of the block parameter name. + + def a(&b) + ^ + - name: operator_loc + type: location + comment: | + Represents the location of the `&` operator. + + def a(&b) + ^ + end + comment: | + Represents a block parameter of a method, block, or lambda definition. + + def a(&b) + ^^ + end + - name: BlockParametersNode + fields: + - name: parameters + type: node? + kind: ParametersNode + comment: | + Represents the parameters of the block. + + -> (a, b = 1; local) { } + ^^^^^^^^ + + foo do |a, b = 1; local| + ^^^^^^^^ + end + - name: locals + type: node[] + kind: BlockLocalVariableNode + comment: | + Represents the local variables of the block. + + -> (a, b = 1; local) { } + ^^^^^ + + foo do |a, b = 1; local| + ^^^^^ + end + - name: opening_loc + type: location? + comment: | + Represents the opening location of the block parameters. + + -> (a, b = 1; local) { } + ^ + + foo do |a, b = 1; local| + ^ + end + - name: closing_loc + type: location? + comment: | + Represents the closing location of the block parameters. + + -> (a, b = 1; local) { } + ^ + + foo do |a, b = 1; local| + ^ + end + comment: | + Represents a block's parameters declaration. 
+ + -> (a, b = 1; local) { } + ^^^^^^^^^^^^^^^^^ + + foo do |a, b = 1; local| + ^^^^^^^^^^^^^^^^^ + end + - name: BreakNode + fields: + - name: arguments + type: node? + kind: ArgumentsNode + comment: | + The arguments to the break statement, if present. These can be any [non-void expressions](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + break foo + ^^^ + - name: keyword_loc + type: location + comment: | + The location of the `break` keyword. + + break foo + ^^^^^ + comment: | + Represents the use of the `break` keyword. + + break foo + ^^^^^^^^^ + - name: CallAndWriteNode + flags: CallNodeFlags + fields: + - name: receiver + type: node? + kind: non-void expression + comment: | + The object that the method is being called on. This can be either `nil` or any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + foo.bar &&= value + ^^^ + - name: call_operator_loc + type: location? + comment: | + Represents the location of the call operator. + + foo.bar &&= value + ^ + - name: message_loc + type: location? + comment: | + Represents the location of the message. + + foo.bar &&= value + ^^^ + - name: read_name + type: constant + comment: | + Represents the name of the method being called. + + foo.bar &&= value # read_name `:bar` + ^^^ + - name: write_name + type: constant + comment: | + Represents the name of the method being written to. + + foo.bar &&= value # write_name `:bar=` + ^^^ + - name: operator_loc + type: location + comment: | + Represents the location of the operator. + + foo.bar &&= value + ^^^ + - name: value + type: node + kind: non-void expression + comment: | + Represents the value being assigned. + + foo.bar &&= value + ^^^^^ + comment: | + Represents the use of the `&&=` operator on a call. + + foo.bar &&= value + ^^^^^^^^^^^^^^^^^ + - name: CallNode + flags: CallNodeFlags + fields: + - name: receiver + type: node? 
+ kind: non-void expression + comment: | + The object that the method is being called on. This can be either `nil` or any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + foo.bar + ^^^ + + +foo + ^^^ + + foo + bar + ^^^ + - name: call_operator_loc + type: location? + comment: | + Represents the location of the call operator. + + foo.bar + ^ + + foo&.bar + ^^ + - name: name + type: constant + comment: | + Represents the name of the method being called. + + foo.bar # name `:bar` + ^^^ + - name: message_loc + type: location? + comment: | + Represents the location of the message. + + foo.bar + ^^^ + - name: opening_loc + type: location? + comment: | + Represents the location of the left parenthesis. + + foo(bar) + ^ + - name: arguments + type: node? + kind: ArgumentsNode + comment: | + Represents the arguments to the method call. These can be any [non-void expressions](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + foo(bar) + ^^^ + - name: closing_loc + type: location? + comment: | + Represents the location of the right parenthesis. + + foo(bar) + ^ + - name: equal_loc + type: location? + comment: | + Represents the location of the equal sign, in the case that this is an attribute write. + + foo.bar = value + ^ + + foo[bar] = value + ^ + - name: block + type: node? + kind: + - BlockNode + - BlockArgumentNode + comment: | + Represents the block that is being passed to the method. + + foo { |a| a } + ^^^^^^^^^ + comment: | + Represents a method call, in all of the various forms that can take. + + foo + ^^^ + + foo() + ^^^^^ + + +foo + ^^^^ + + foo + bar + ^^^^^^^^^ + + foo.bar + ^^^^^^^ + + foo&.bar + ^^^^^^^^ + - name: CallOperatorWriteNode + flags: CallNodeFlags + fields: + - name: receiver + type: node? + kind: non-void expression + comment: | + The object that the method is being called on.
This can be either `nil` or any [non-void expressions](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + foo.bar += value + ^^^ + - name: call_operator_loc + type: location? + comment: | + Represents the location of the call operator. + + foo.bar += value + ^ + - name: message_loc + type: location? + comment: | + Represents the location of the message. + + foo.bar += value + ^^^ + - name: read_name + type: constant + comment: | + Represents the name of the method being called. + + foo.bar += value # read_name `:bar` + ^^^ + - name: write_name + type: constant + comment: | + Represents the name of the method being written to. + + foo.bar += value # write_name `:bar=` + ^^^ + - name: binary_operator + type: constant + comment: | + Represents the binary operator being used. + + foo.bar += value # binary_operator `:+` + ^ + - name: binary_operator_loc + type: location + comment: | + Represents the location of the binary operator. + + foo.bar += value + ^^ + - name: value + type: node + kind: non-void expression + comment: | + Represents the value being assigned. + + foo.bar += value + ^^^^^ + comment: | + Represents the use of an assignment operator on a call. + + foo.bar += baz + ^^^^^^^^^^^^^^ + - name: CallOrWriteNode + flags: CallNodeFlags + fields: + - name: receiver + type: node? + kind: non-void expression + comment: | + The object that the method is being called on. This can be either `nil` or any [non-void expressions](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + foo.bar ||= value + ^^^ + - name: call_operator_loc + type: location? + comment: | + Represents the location of the call operator. + + foo.bar ||= value + ^ + - name: message_loc + type: location? + comment: | + Represents the location of the message. + + foo.bar ||= value + ^^^ + - name: read_name + type: constant + comment: | + Represents the name of the method being called. 
+ + foo.bar ||= value # read_name `:bar` + ^^^ + - name: write_name + type: constant + comment: | + Represents the name of the method being written to. + + foo.bar ||= value # write_name `:bar=` + ^^^ + - name: operator_loc + type: location + comment: | + Represents the location of the operator. + + foo.bar ||= value + ^^^ + - name: value + type: node + kind: non-void expression + comment: | + Represents the value being assigned. + + foo.bar ||= value + ^^^^^ + comment: | + Represents the use of the `||=` operator on a call. + + foo.bar ||= value + ^^^^^^^^^^^^^^^^^ + - name: CallTargetNode + flags: CallNodeFlags + fields: + - name: receiver + type: node + kind: non-void expression + comment: | + The object that the method is being called on. This can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + foo.bar = 1 + ^^^ + - name: call_operator_loc + type: location + comment: | + Represents the location of the call operator. + + foo.bar = 1 + ^ + - name: name + type: constant + comment: | + Represents the name of the method being called. + + foo.bar = 1 # name `:bar=` + ^^^ + - name: message_loc + type: location + comment: | + Represents the location of the message. + + foo.bar = 1 + ^^^ + comment: | + Represents assigning to a method call. + + foo.bar, = 1 + ^^^^^^^ + + begin + rescue => foo.bar + ^^^^^^^ + end + + for foo.bar in baz do end + ^^^^^^^ + - name: CapturePatternNode + fields: + - name: value + type: node + kind: pattern expression + comment: | + Represents the value to capture. + + foo => bar + ^^^ + - name: target + type: node + kind: LocalVariableTargetNode + comment: | + Represents the target of the capture. + + foo => bar + ^^^ + - name: operator_loc + type: location + comment: | + Represents the location of the `=>` operator. + + foo => bar + ^^ + comment: | + Represents assigning to a local variable in pattern matching.
+ + foo => [bar => baz] + ^^^^^^^^^^^^ + - name: CaseMatchNode + fields: + - name: predicate + type: node? + kind: non-void expression + comment: | + Represents the predicate of the case match. This can be either `nil` or any [non-void expressions](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + case true; in false; end + ^^^^ + - name: conditions + type: node[] + kind: InNode + comment: | + Represents the conditions of the case match. + + case true; in false; end + ^^^^^^^^ + - name: else_clause + type: node? + kind: ElseNode + comment: | + Represents the else clause of the case match. + + case true; in false; else; end + ^^^^ + - name: case_keyword_loc + type: location + comment: | + Represents the location of the `case` keyword. + + case true; in false; end + ^^^^ + - name: end_keyword_loc + type: location + comment: | + Represents the location of the `end` keyword. + + case true; in false; end + ^^^ + comment: | + Represents the use of a case statement for pattern matching. + + case true + in false + end + ^^^^^^^^^ + - name: CaseNode + fields: + - name: predicate + type: node? + kind: non-void expression + comment: | + Represents the predicate of the case statement. This can be either `nil` or any [non-void expressions](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + case true; when false; end + ^^^^ + - name: conditions + type: node[] + kind: WhenNode + comment: | + Represents the conditions of the case statement. + + case true; when false; end + ^^^^^^^^^^ + - name: else_clause + type: node? + kind: ElseNode + comment: | + Represents the else clause of the case statement. + + case true; when false; else; end + ^^^^ + - name: case_keyword_loc + type: location + comment: | + Represents the location of the `case` keyword. + + case true; when false; end + ^^^^ + - name: end_keyword_loc + type: location + comment: | + Represents the location of the `end` keyword. 
+ + case true; when false; end + ^^^ + comment: | + Represents the use of a case statement. + + case true + when false + end + ^^^^^^^^^^ + - name: ClassNode + fields: + - name: locals + type: constant[] + - name: class_keyword_loc + type: location + comment: | + Represents the location of the `class` keyword. + + class Foo end + ^^^^^ + - name: constant_path + type: node + kind: + - ConstantReadNode + - ConstantPathNode + - on error: CallNode # class 0.X end + - name: inheritance_operator_loc + type: location? + comment: | + Represents the location of the `<` operator. + + class Foo < Bar + ^ + - name: superclass + type: node? + kind: non-void expression + comment: | + Represents the superclass of the class. + + class Foo < Bar + ^^^ + - name: body + type: node? + kind: + - StatementsNode + - BeginNode + comment: | + Represents the body of the class. + + class Foo + foo + ^^^ + - name: end_keyword_loc + type: location + comment: | + Represents the location of the `end` keyword. + + class Foo end + ^^^ + - name: name + type: constant + comment: | + The name of the class. + + class Foo end # name `:Foo` + comment: | + Represents a class declaration involving the `class` keyword. + + class Foo end + ^^^^^^^^^^^^^ + - name: ClassVariableAndWriteNode + fields: + - name: name + type: constant + comment: | + The name of the class variable, which is a `@@` followed by an [identifier](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#identifiers). + + @@target &&= value # name `:@@target` + ^^^^^^^^ + - name: name_loc + type: location + comment: | + Represents the location of the variable name. + + @@target &&= value + ^^^^^^^^ + - name: operator_loc + type: location + comment: | + Represents the location of the `&&=` operator. + + @@target &&= value + ^^^ + - name: value + type: node + kind: non-void expression + comment: | + Represents the value being assigned. 
This can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + @@target &&= value + ^^^^^ + comment: | + Represents the use of the `&&=` operator for assignment to a class variable. + + @@target &&= value + ^^^^^^^^^^^^^^^^^^ + - name: ClassVariableOperatorWriteNode + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: binary_operator_loc + type: location + - name: value + type: node + kind: non-void expression + - name: binary_operator + type: constant + comment: | + Represents assigning to a class variable using an operator that isn't `=`. + + @@target += value + ^^^^^^^^^^^^^^^^^ + - name: ClassVariableOrWriteNode + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents the use of the `||=` operator for assignment to a class variable. + + @@target ||= value + ^^^^^^^^^^^^^^^^^^ + - name: ClassVariableReadNode + fields: + - name: name + type: constant + comment: | + The name of the class variable, which is a `@@` followed by an [identifier](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#identifiers). + + @@abc # name `:@@abc` + + @@_test # name `:@@_test` + comment: | + Represents referencing a class variable. + + @@foo + ^^^^^ + - name: ClassVariableTargetNode + fields: + - name: name + type: constant + comment: | + Represents writing to a class variable in a context that doesn't have an explicit value. + + @@foo, @@bar = baz + ^^^^^ ^^^^^ + - name: ClassVariableWriteNode + fields: + - name: name + type: constant + comment: | + The name of the class variable, which is a `@@` followed by an [identifier](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#identifiers). 
+ + @@abc = 123 # name `:@@abc` + + @@_test = :test # name `:@@_test` + - name: name_loc + type: location + comment: | + The location of the variable name. + + @@foo = :bar + ^^^^^ + - name: value + type: node + kind: non-void expression + comment: | + The value to write to the class variable. This can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + @@foo = :bar + ^^^^ + + @@_xyz = 123 + ^^^ + - name: operator_loc + type: location + comment: | + The location of the `=` operator. + + @@foo = :bar + ^ + comment: | + Represents writing to a class variable. + + @@foo = 1 + ^^^^^^^^^ + - name: ConstantAndWriteNode + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents the use of the `&&=` operator for assignment to a constant. + + Target &&= value + ^^^^^^^^^^^^^^^^ + - name: ConstantOperatorWriteNode + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: binary_operator_loc + type: location + - name: value + type: node + kind: non-void expression + - name: binary_operator + type: constant + comment: | + Represents assigning to a constant using an operator that isn't `=`. + + Target += value + ^^^^^^^^^^^^^^^ + - name: ConstantOrWriteNode + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents the use of the `||=` operator for assignment to a constant. + + Target ||= value + ^^^^^^^^^^^^^^^^ + - name: ConstantPathAndWriteNode + fields: + - name: target + type: node + kind: ConstantPathNode + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents the use of the `&&=` operator for assignment to a constant path.
+ + Parent::Child &&= value + ^^^^^^^^^^^^^^^^^^^^^^^ + - name: ConstantPathNode + fields: + - name: parent + type: node? + kind: non-void expression + comment: | + The left-hand node of the path, if present. It can be `nil` or any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). It will be `nil` when the constant lookup is at the root of the module tree. + + Foo::Bar + ^^^ + + self::Test + ^^^^ + + a.b::C + ^^^ + - name: name + type: constant? + comment: The name of the constant being accessed. This could be `nil` in the event of a syntax error. + - name: delimiter_loc + type: location + comment: | + The location of the `::` delimiter. + + ::Foo + ^^ + + One::Two + ^^ + - name: name_loc + type: location + comment: | + The location of the name of the constant. + + ::Foo + ^^^ + + One::Two + ^^^ + comment: | + Represents accessing a constant through a path of `::` operators. + + Foo::Bar + ^^^^^^^^ + - name: ConstantPathOperatorWriteNode + fields: + - name: target + type: node + kind: ConstantPathNode + - name: binary_operator_loc + type: location + - name: value + type: node + kind: non-void expression + - name: binary_operator + type: constant + comment: | + Represents assigning to a constant path using an operator that isn't `=`. + + Parent::Child += value + ^^^^^^^^^^^^^^^^^^^^^^ + - name: ConstantPathOrWriteNode + fields: + - name: target + type: node + kind: ConstantPathNode + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents the use of the `||=` operator for assignment to a constant path. + + Parent::Child ||= value + ^^^^^^^^^^^^^^^^^^^^^^^ + - name: ConstantPathTargetNode + fields: + - name: parent + type: node? + kind: non-void expression + - name: name + type: constant? 
+ - name: delimiter_loc + type: location + - name: name_loc + type: location + comment: | + Represents writing to a constant path in a context that doesn't have an explicit value. + + Foo::Foo, Bar::Bar = baz + ^^^^^^^^ ^^^^^^^^ + - name: ConstantPathWriteNode + fields: + - name: target + type: node + kind: ConstantPathNode + comment: | + A node representing the constant path being written to. + + Foo::Bar = 1 + ^^^^^^^^ + + ::Foo = :abc + ^^^^^ + - name: operator_loc + type: location + comment: | + The location of the `=` operator. + + ::ABC = 123 + ^ + - name: value + type: node + kind: non-void expression + comment: | + The value to write to the constant path. It can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + FOO::BAR = :abc + ^^^^ + comment: | + Represents writing to a constant path. + + ::Foo = 1 + ^^^^^^^^^ + + Foo::Bar = 1 + ^^^^^^^^^^^^ + + ::Foo::Bar = 1 + ^^^^^^^^^^^^^^ + - name: ConstantReadNode + fields: + - name: name + type: constant + comment: | + The name of the [constant](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#constants). + + X # name `:X` + + SOME_CONSTANT # name `:SOME_CONSTANT` + comment: | + Represents referencing a constant. + + Foo + ^^^ + - name: ConstantTargetNode + fields: + - name: name + type: constant + comment: | + Represents writing to a constant in a context that doesn't have an explicit value. + + Foo, Bar = baz + ^^^ ^^^ + - name: ConstantWriteNode + fields: + - name: name + type: constant + comment: | + The name of the [constant](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#constants). + + Foo = :bar # name `:Foo` + + XYZ = 1 # name `:XYZ` + - name: name_loc + type: location + comment: | + The location of the constant name. + + FOO = 1 + ^^^ + - name: value + type: node + kind: non-void expression + comment: | + The value to write to the constant. 
It can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + FOO = :bar + ^^^^ + + MyClass = Class.new + ^^^^^^^^^ + - name: operator_loc + type: location + comment: | + The location of the `=` operator. + + FOO = :bar + ^ + comment: | + Represents writing to a constant. + + Foo = 1 + ^^^^^^^ + - name: DefNode + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: receiver + type: node? + kind: non-void expression + - name: parameters + type: node? + kind: ParametersNode + - name: body + type: node? + kind: + - StatementsNode + - BeginNode + - name: locals + type: constant[] + - name: def_keyword_loc + type: location + - name: operator_loc + type: location? + - name: lparen_loc + type: location? + - name: rparen_loc + type: location? + - name: equal_loc + type: location? + - name: end_keyword_loc + type: location? + comment: | + Represents a method definition. + + def method + end + ^^^^^^^^^^ + - name: DefinedNode + fields: + - name: lparen_loc + type: location? + - name: value + type: node + kind: Node # More than non-void expression as defined?(return) is allowed, yet defined?(BEGIN{}) is SyntaxError + - name: rparen_loc + type: location? + - name: keyword_loc + type: location + comment: | + Represents the use of the `defined?` keyword. + + defined?(a) + ^^^^^^^^^^^ + - name: ElseNode + fields: + - name: else_keyword_loc + type: location + - name: statements + type: node? + kind: StatementsNode + - name: end_keyword_loc + type: location? + comment: | + Represents an `else` clause in a `case`, `if`, or `unless` statement. + + if a then b else c end + ^^^^^^^^^^ + - name: EmbeddedStatementsNode + fields: + - name: opening_loc + type: location + - name: statements + type: node? + kind: StatementsNode + - name: closing_loc + type: location + comment: | + Represents an interpolated set of statements. 
+ + "foo #{bar}" + ^^^^^^ + - name: EmbeddedVariableNode + fields: + - name: operator_loc + type: location + - name: variable + type: node + kind: + - InstanceVariableReadNode + - ClassVariableReadNode + - GlobalVariableReadNode + - BackReferenceReadNode + - NumberedReferenceReadNode + comment: | + Represents an interpolated variable. + + "foo #@bar" + ^^^^^ + - name: EnsureNode + fields: + - name: ensure_keyword_loc + type: location + - name: statements + type: node? + kind: StatementsNode + - name: end_keyword_loc + type: location + comment: | + Represents an `ensure` clause in a `begin` statement. + + begin + foo + ensure + ^^^^^^ + bar + end + - name: FalseNode + comment: | + Represents the use of the literal `false` keyword. + + false + ^^^^^ + - name: FindPatternNode + fields: + - name: constant + type: node? + kind: + - ConstantPathNode + - ConstantReadNode + comment: | + Represents the optional constant preceding the pattern + + foo in Foo(*bar, baz, *qux) + ^^^ + - name: left + type: node + kind: SplatNode + comment: | + Represents the first wildcard node in the pattern. + + foo in *bar, baz, *qux + ^^^^ + + foo in Foo(*bar, baz, *qux) + ^^^^ + - name: requireds + type: node[] + kind: pattern expression + comment: | + Represents the nodes in between the wildcards. + + foo in *bar, baz, *qux + ^^^ + + foo in Foo(*bar, baz, 1, *qux) + ^^^^^^ + - name: right + type: node + kind: + - SplatNode + - on error: MissingNode + comment: | + Represents the second wildcard node in the pattern. + + foo in *bar, baz, *qux + ^^^^ + + foo in Foo(*bar, baz, *qux) + ^^^^ + - name: opening_loc + type: location? + comment: | + The location of the opening brace. + + foo in [*bar, baz, *qux] + ^ + + foo in Foo(*bar, baz, *qux) + ^ + - name: closing_loc + type: location? + comment: | + The location of the closing brace. + + foo in [*bar, baz, *qux] + ^ + + foo in Foo(*bar, baz, *qux) + ^ + comment: | + Represents a find pattern in pattern matching. 
+ + foo in *bar, baz, *qux + ^^^^^^^^^^^^^^^ + + foo in [*bar, baz, *qux] + ^^^^^^^^^^^^^^^^^ + + foo in Foo(*bar, baz, *qux) + ^^^^^^^^^^^^^^^^^^^^ + + foo => *bar, baz, *qux + ^^^^^^^^^^^^^^^ + - name: FlipFlopNode + flags: RangeFlags + fields: + - name: left + type: node? + kind: non-void expression + - name: right + type: node? + kind: non-void expression + - name: operator_loc + type: location + comment: | + Represents the use of the `..` or `...` operators to create flip flops. + + baz if foo .. bar + ^^^^^^^^^^ + - name: FloatNode + fields: + - name: value + type: double + comment: The value of the floating point number as a Float. + comment: | + Represents a floating point number literal. + + 1.0 + ^^^ + - name: ForNode + fields: + - name: index + type: node + kind: + - LocalVariableTargetNode + - InstanceVariableTargetNode + - ClassVariableTargetNode + - GlobalVariableTargetNode + - ConstantTargetNode + - ConstantPathTargetNode + - CallTargetNode + - IndexTargetNode + - MultiTargetNode + - on error: BackReferenceReadNode # for $& in a end + - on error: NumberedReferenceReadNode # for $1 in a end + - on error: MissingNode # for in 1..10; end + comment: | + The index expression for `for` loops. + + for i in a end + ^ + - name: collection + type: node + kind: non-void expression + comment: | + The collection to iterate over. + + for i in a end + ^ + - name: statements + type: node? + kind: StatementsNode + comment: | + Represents the body of statements to execute for each iteration of the loop. + + for i in a + foo(i) + ^^^^^^ + end + - name: for_keyword_loc + type: location + comment: | + The location of the `for` keyword. + + for i in a end + ^^^ + - name: in_keyword_loc + type: location + comment: | + The location of the `in` keyword. + + for i in a end + ^^ + - name: do_keyword_loc + type: location? + comment: | + The location of the `do` keyword, if present. 
+ + for i in a do end + ^^ + - name: end_keyword_loc + type: location + comment: | + The location of the `end` keyword. + + for i in a end + ^^^ + comment: | + Represents the use of the `for` keyword. + + for i in a end + ^^^^^^^^^^^^^^ + - name: ForwardingArgumentsNode + comment: | + Represents forwarding all arguments to this method to another method. + + def foo(...) + bar(...) + ^^^ + end + - name: ForwardingParameterNode + comment: | + Represents the use of the forwarding parameter in a method, block, or lambda declaration. + + def foo(...) + ^^^ + end + - name: ForwardingSuperNode + fields: + - name: block + type: node? + kind: BlockNode + comment: | + All other arguments are forwarded as normal, except the original block is replaced with the new block. + comment: | + Represents the use of the `super` keyword without parentheses or arguments, but which might have a block. + + super + ^^^^^ + + super { 123 } + ^^^^^^^^^^^^^ + + If it has any other arguments, it would be a `SuperNode` instead. + - name: GlobalVariableAndWriteNode + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents the use of the `&&=` operator for assignment to a global variable. + + $target &&= value + ^^^^^^^^^^^^^^^^^ + - name: GlobalVariableOperatorWriteNode + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: binary_operator_loc + type: location + - name: value + type: node + kind: non-void expression + - name: binary_operator + type: constant + comment: | + Represents assigning to a global variable using an operator that isn't `=`. 
+ + $target += value + ^^^^^^^^^^^^^^^^ + - name: GlobalVariableOrWriteNode + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents the use of the `||=` operator for assignment to a global variable. + + $target ||= value + ^^^^^^^^^^^^^^^^^ + - name: GlobalVariableReadNode + fields: + - name: name + type: constant + comment: | + The name of the global variable, which is a `$` followed by an [identifier](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#identifier). Alternatively, it can be one of the special global variables designated by a symbol. + + $foo # name `:$foo` + + $_Test # name `:$_Test` + comment: | + Represents referencing a global variable. + + $foo + ^^^^ + - name: GlobalVariableTargetNode + fields: + - name: name + type: constant + comment: | + Represents writing to a global variable in a context that doesn't have an explicit value. + + $foo, $bar = baz + ^^^^ ^^^^ + - name: GlobalVariableWriteNode + fields: + - name: name + type: constant + comment: | + The name of the global variable, which is a `$` followed by an [identifier](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#identifier). Alternatively, it can be one of the special global variables designated by a symbol. + + $foo = :bar # name `:$foo` + + $_Test = 123 # name `:$_Test` + - name: name_loc + type: location + comment: | + The location of the global variable's name. + + $foo = :bar + ^^^^ + - name: value + type: node + kind: non-void expression + comment: | + The value to write to the global variable. It can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + $foo = :bar + ^^^^ + + $-xyz = 123 + ^^^ + - name: operator_loc + type: location + comment: | + The location of the `=` operator. 
+ + $foo = :bar + ^ + comment: | + Represents writing to a global variable. + + $foo = 1 + ^^^^^^^^ + - name: HashNode + fields: + - name: opening_loc + type: location + comment: | + The location of the opening brace. + + { a => b } + ^ + - name: elements + type: node[] + kind: + - AssocNode + - AssocSplatNode + comment: | + The elements of the hash. These can be either `AssocNode`s or `AssocSplatNode`s. + + { a: b } + ^^^^ + + { **foo } + ^^^^^ + - name: closing_loc + type: location + comment: | + The location of the closing brace. + + { a => b } + ^ + comment: | + Represents a hash literal. + + { a => b } + ^^^^^^^^^^ + - name: HashPatternNode + fields: + - name: constant + type: node? + kind: + - ConstantPathNode + - ConstantReadNode + comment: | + Represents the optional constant preceding the Hash. + + foo => Bar[a: 1, b: 2] + ^^^ + + foo => Bar::Baz[a: 1, b: 2] + ^^^^^^^^ + - name: elements + type: node[] + kind: AssocNode + comment: | + Represents the explicit named hash keys and values. + + foo => { a: 1, b:, ** } + ^^^^^^^^ + - name: rest + type: node? + kind: + - AssocSplatNode + - NoKeywordsParameterNode + comment: | + Represents the rest of the Hash keys and values. This can be named, unnamed, or explicitly forbidden via `**nil`, this last one results in a `NoKeywordsParameterNode`. + + foo => { a: 1, b:, **c } + ^^^ + + foo => { a: 1, b:, ** } + ^^ + + foo => { a: 1, b:, **nil } + ^^^^^ + - name: opening_loc + type: location? + comment: | + The location of the opening brace. + + foo => { a: 1 } + ^ + + foo => Bar[a: 1] + ^ + - name: closing_loc + type: location? + comment: | + The location of the closing brace. + + foo => { a: 1 } + ^ + + foo => Bar[a: 1] + ^ + comment: | + Represents a hash pattern in pattern matching. 
+ + foo => { a: 1, b: 2 } + ^^^^^^^^^^^^^^ + + foo => { a: 1, b: 2, **c } + ^^^^^^^^^^^^^^^^^^^ + + foo => Bar[a: 1, b: 2] + ^^^^^^^^^^^^^^^ + + foo in { a: 1, b: 2 } + ^^^^^^^^^^^^^^ + - name: IfNode + fields: + - name: if_keyword_loc + type: location? + comment: | + The location of the `if` keyword if present. + + bar if foo + ^^ + + The `if_keyword_loc` field will be `nil` when the `IfNode` represents a ternary expression. + - name: predicate + type: node + kind: non-void expression + comment: | + The node for the condition the `IfNode` is testing. + + if foo + ^^^ + bar + end + + bar if foo + ^^^ + + foo ? bar : baz + ^^^ + - name: then_keyword_loc + type: location? + comment: | + The location of the `then` keyword (if present) or the `?` in a ternary expression, `nil` otherwise. + + if foo then bar end + ^^^^ + + a ? b : c + ^ + - name: statements + type: node? + kind: StatementsNode + comment: | + Represents the body of statements that will be executed when the predicate is evaluated as truthy. Will be `nil` when no body is provided. + + if foo + bar + ^^^ + baz + ^^^ + end + - name: subsequent + type: node? + kind: + - ElseNode + - IfNode + comment: | + Represents an `ElseNode` or an `IfNode` when there is an `else` or an `elsif` in the `if` statement. + + if foo + bar + elsif baz + ^^^^^^^^^ + qux + ^^^ + end + ^^^ + + if foo then bar else baz end + ^^^^^^^^^^^^ + - name: end_keyword_loc + type: location? + comment: | + The location of the `end` keyword if present, `nil` otherwise. + + if foo + bar + end + ^^^ + newline: predicate + comment: | + Represents the use of the `if` keyword, either in the block form or the modifier form, or a ternary expression. + + bar if foo + ^^^^^^^^^^ + + if foo then bar end + ^^^^^^^^^^^^^^^^^^^ + + foo ? bar : baz + ^^^^^^^^^^^^^^^ + - name: ImaginaryNode + fields: + - name: numeric + type: node + kind: + - FloatNode + - IntegerNode + - RationalNode + comment: | + Represents an imaginary number literal. 
+ + 1.0i + ^^^^ + - name: ImplicitNode + fields: + - name: value + type: node + kind: + - LocalVariableReadNode + - CallNode + - ConstantReadNode + - LocalVariableTargetNode + comment: | + Represents a node that is implicitly being added to the tree but doesn't correspond directly to a node in the source. + + { foo: } + ^^^^ + + { Foo: } + ^^^^ + + foo in { bar: } + ^^^^ + - name: ImplicitRestNode + comment: | + Represents using a trailing comma to indicate an implicit rest parameter. + + foo { |bar,| } + ^ + + foo in [bar,] + ^ + + for foo, in bar do end + ^ + + foo, = bar + ^ + - name: InNode + fields: + - name: pattern + type: node + kind: pattern expression + - name: statements + type: node? + kind: StatementsNode + - name: in_loc + type: location + - name: then_loc + type: location? + comment: | + Represents the use of the `in` keyword in a case statement. + + case a; in b then c end + ^^^^^^^^^^^ + - name: IndexAndWriteNode + flags: CallNodeFlags + fields: + - name: receiver + type: node? + kind: non-void expression + - name: call_operator_loc + type: location? + - name: opening_loc + type: location + - name: arguments + type: node? + kind: ArgumentsNode + - name: closing_loc + type: location + - name: block + type: node? + kind: BlockArgumentNode # foo[&b] &&= value, only valid on Ruby < 3.4 + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents the use of the `&&=` operator on a call to the `[]` method. + + foo.bar[baz] &&= value + ^^^^^^^^^^^^^^^^^^^^^^ + - name: IndexOperatorWriteNode + flags: CallNodeFlags + fields: + - name: receiver + type: node? + kind: non-void expression + - name: call_operator_loc + type: location? + - name: opening_loc + type: location + - name: arguments + type: node? + kind: ArgumentsNode + - name: closing_loc + type: location + - name: block + type: node? 
+ kind: BlockArgumentNode # foo[&b] += value, only valid on Ruby < 3.4 + - name: binary_operator + type: constant + - name: binary_operator_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents the use of an assignment operator on a call to `[]`. + + foo.bar[baz] += value + ^^^^^^^^^^^^^^^^^^^^^ + - name: IndexOrWriteNode + flags: CallNodeFlags + fields: + - name: receiver + type: node? + kind: non-void expression + - name: call_operator_loc + type: location? + - name: opening_loc + type: location + - name: arguments + type: node? + kind: ArgumentsNode + - name: closing_loc + type: location + - name: block + type: node? + kind: BlockArgumentNode # foo[&b] ||= value, only valid on Ruby < 3.4 + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents the use of the `||=` operator on a call to `[]`. + + foo.bar[baz] ||= value + ^^^^^^^^^^^^^^^^^^^^^^ + - name: IndexTargetNode + flags: CallNodeFlags + fields: + - name: receiver + type: node + kind: non-void expression + - name: opening_loc + type: location + - name: arguments + type: node? + kind: ArgumentsNode + - name: closing_loc + type: location + - name: block + type: node? + kind: BlockArgumentNode # foo[&b], = 1, only valid on Ruby < 3.4 + comment: | + Represents assigning to an index. + + foo[bar], = 1 + ^^^^^^^^ + + begin + rescue => foo[bar] + ^^^^^^^^ + end + + for foo[bar] in baz do end + ^^^^^^^^ + - name: InstanceVariableAndWriteNode + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents the use of the `&&=` operator for assignment to an instance variable. 
+ + @target &&= value + ^^^^^^^^^^^^^^^^^ + - name: InstanceVariableOperatorWriteNode + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: binary_operator_loc + type: location + - name: value + type: node + kind: non-void expression + - name: binary_operator + type: constant + comment: | + Represents assigning to an instance variable using an operator that isn't `=`. + + @target += value + ^^^^^^^^^^^^^^^^ + - name: InstanceVariableOrWriteNode + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents the use of the `||=` operator for assignment to an instance variable. + + @target ||= value + ^^^^^^^^^^^^^^^^^ + - name: InstanceVariableReadNode + fields: + - name: name + type: constant + comment: | + The name of the instance variable, which is a `@` followed by an [identifier](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#identifiers). + + @x # name `:@x` + + @_test # name `:@_test` + comment: | + Represents referencing an instance variable. + + @foo + ^^^^ + - name: InstanceVariableTargetNode + fields: + - name: name + type: constant + comment: | + Represents writing to an instance variable in a context that doesn't have an explicit value. + + @foo, @bar = baz + ^^^^ ^^^^ + - name: InstanceVariableWriteNode + fields: + - name: name + type: constant + comment: | + The name of the instance variable, which is a `@` followed by an [identifier](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#identifiers). + + @x = :y # name `:@x` + + @_foo = "bar" # name `:@_foo` + - name: name_loc + type: location + comment: | + The location of the variable name. + + @_x = 1 + ^^^ + - name: value + type: node + kind: non-void expression + comment: | + The value to write to the instance variable. 
It can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + @foo = :bar + ^^^^ + + @_x = 1234 + ^^^^ + - name: operator_loc + type: location + comment: | + The location of the `=` operator. + + @x = y + ^ + comment: | + Represents writing to an instance variable. + + @foo = 1 + ^^^^^^^^ + - name: IntegerNode + flags: IntegerBaseFlags + fields: + - name: value + type: integer + comment: The value of the integer literal as a number. + comment: | + Represents an integer number literal. + + 1 + ^ + - name: InterpolatedMatchLastLineNode + flags: RegularExpressionFlags + fields: + - name: opening_loc + type: location + - name: parts + type: node[] + kind: + - StringNode + - EmbeddedStatementsNode + - EmbeddedVariableNode + - name: closing_loc + type: location + newline: parts + comment: | + Represents a regular expression literal that contains interpolation that is being used in the predicate of a conditional to implicitly match against the last line read by an IO object. + + if /foo #{bar} baz/ then end + ^^^^^^^^^^^^^^^^ + - name: InterpolatedRegularExpressionNode + flags: RegularExpressionFlags + fields: + - name: opening_loc + type: location + - name: parts + type: node[] + kind: + - StringNode + - EmbeddedStatementsNode + - EmbeddedVariableNode + - name: closing_loc + type: location + newline: parts + comment: | + Represents a regular expression literal that contains interpolation. + + /foo #{bar} baz/ + ^^^^^^^^^^^^^^^^ + - name: InterpolatedStringNode + flags: InterpolatedStringNodeFlags + fields: + - name: opening_loc + type: location? + - name: parts + type: node[] + kind: + - StringNode + - EmbeddedStatementsNode + - EmbeddedVariableNode + - InterpolatedStringNode # `"a" "#{b}"` + - on error: XStringNode # `<<`FOO` "bar" + - on error: InterpolatedXStringNode + - on error: SymbolNode + - on error: InterpolatedSymbolNode + - name: closing_loc + type: location? 
+ newline: parts + comment: | + Represents a string literal that contains interpolation. + + "foo #{bar} baz" + ^^^^^^^^^^^^^^^^ + - name: InterpolatedSymbolNode + fields: + - name: opening_loc + type: location? + - name: parts + type: node[] + kind: + - StringNode + - EmbeddedStatementsNode + - EmbeddedVariableNode + - name: closing_loc + type: location? + newline: parts + comment: | + Represents a symbol literal that contains interpolation. + + :"foo #{bar} baz" + ^^^^^^^^^^^^^^^^^ + - name: InterpolatedXStringNode + fields: + - name: opening_loc + type: location + - name: parts + type: node[] + kind: + - StringNode + - EmbeddedStatementsNode + - EmbeddedVariableNode + - name: closing_loc + type: location + newline: parts + comment: | + Represents an xstring literal that contains interpolation. + + `foo #{bar} baz` + ^^^^^^^^^^^^^^^^ + - name: ItLocalVariableReadNode + comment: | + Represents reading from the implicit `it` local variable. + + -> { it } + ^^ + - name: ItParametersNode + comment: | + Represents an implicit set of parameters through the use of the `it` keyword within a block or lambda. + + -> { it + it } + ^^^^^^^^^^^^^^ + - name: KeywordHashNode + flags: KeywordHashNodeFlags + fields: + - name: elements + type: node[] + kind: + - AssocNode + - AssocSplatNode + comment: | + Represents a hash literal without opening and closing braces. + + foo(a: b) + ^^^^ + - name: KeywordRestParameterNode + flags: ParameterFlags + fields: + - name: name + type: constant? + - name: name_loc + type: location? + - name: operator_loc + type: location + comment: | + Represents a keyword rest parameter to a method, block, or lambda definition. + + def a(**b) + ^^^ + end + - name: LambdaNode + fields: + - name: locals + type: constant[] + - name: operator_loc + type: location + - name: opening_loc + type: location + - name: closing_loc + type: location + - name: parameters + type: node? 
+ kind: + - BlockParametersNode + - NumberedParametersNode + - ItParametersNode + - name: body + type: node? + kind: + - StatementsNode + - BeginNode + comment: | + Represents using a lambda literal (not the lambda method call). + + ->(value) { value * 2 } + ^^^^^^^^^^^^^^^^^^^^^^^ + - name: LocalVariableAndWriteNode + fields: + - name: name_loc + type: location + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + - name: name + type: constant + - name: depth + type: uint32 + comment: | + Represents the use of the `&&=` operator for assignment to a local variable. + + target &&= value + ^^^^^^^^^^^^^^^^ + - name: LocalVariableOperatorWriteNode + fields: + - name: name_loc + type: location + - name: binary_operator_loc + type: location + - name: value + type: node + kind: non-void expression + - name: name + type: constant + - name: binary_operator + type: constant + - name: depth + type: uint32 + comment: | + Represents assigning to a local variable using an operator that isn't `=`. + + target += value + ^^^^^^^^^^^^^^^ + - name: LocalVariableOrWriteNode + fields: + - name: name_loc + type: location + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + - name: name + type: constant + - name: depth + type: uint32 + comment: | + Represents the use of the `||=` operator for assignment to a local variable. + + target ||= value + ^^^^^^^^^^^^^^^^ + - name: LocalVariableReadNode + fields: + - name: name + type: constant + comment: | + The name of the local variable, which is an [identifier](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#identifiers). + + x # name `:x` + + _Test # name `:_Test` + + Note that this can also be an underscore followed by a number for the default block parameters. + + _1 # name `:_1` + + - name: depth + type: uint32 + comment: | + The number of visible scopes that should be searched to find the origin of this local variable. 
+ + foo = 1; foo # depth 0 + + bar = 2; tap { bar } # depth 1 + + The specific rules for calculating the depth may differ from individual Ruby implementations, as they are not specified by the language. For more information, see [the Prism documentation](https://github.com/ruby/prism/blob/main/docs/local_variable_depth.md). + comment: | + Represents reading a local variable. Note that this requires that a local variable of the same name has already been written to in the same scope, otherwise it is parsed as a method call. + + foo + ^^^ + - name: LocalVariableTargetNode + fields: + - name: name + type: constant + - name: depth + type: uint32 + comment: | + Represents writing to a local variable in a context that doesn't have an explicit value. + + foo, bar = baz + ^^^ ^^^ + + foo => baz + ^^^ + - name: LocalVariableWriteNode + fields: + - name: name + type: constant + comment: | + The name of the local variable, which is an [identifier](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#identifiers). + + foo = :bar # name `:foo` + + abc = 123 # name `:abc` + - name: depth + type: uint32 + comment: | + The number of semantic scopes we have to traverse to find the declaration of this variable. + + foo = 1 # depth 0 + + tap { foo = 1 } # depth 1 + + The specific rules for calculating the depth may differ from individual Ruby implementations, as they are not specified by the language. For more information, see [the Prism documentation](https://github.com/ruby/prism/blob/main/docs/local_variable_depth.md). + - name: name_loc + type: location + comment: | + The location of the variable name. + + foo = :bar + ^^^ + - name: value + type: node + kind: non-void expression + comment: | + The value to write to the local variable. It can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). 
+ + foo = :bar + ^^^^ + + abc = 1234 + ^^^^ + + Note that since the name of a local variable is known before the value is parsed, it is valid for a local variable to appear within the value of its own write. + + foo = foo + - name: operator_loc + type: location + comment: | + The location of the `=` operator. + + x = :y + ^ + comment: | + Represents writing to a local variable. + + foo = 1 + ^^^^^^^ + - name: MatchLastLineNode + flags: RegularExpressionFlags + fields: + - name: opening_loc + type: location + - name: content_loc + type: location + - name: closing_loc + type: location + - name: unescaped + type: string + comment: | + Represents a regular expression literal used in the predicate of a conditional to implicitly match against the last line read by an IO object. + + if /foo/i then end + ^^^^^^ + - name: MatchPredicateNode + fields: + - name: value + type: node + kind: non-void expression + - name: pattern + type: node + kind: pattern expression + - name: operator_loc + type: location + comment: | + Represents the use of the modifier `in` operator. + + foo in bar + ^^^^^^^^^^ + - name: MatchRequiredNode + fields: + - name: value + type: node + kind: non-void expression + comment: | + Represents the left-hand side of the operator. + + foo => bar + ^^^ + - name: pattern + type: node + kind: pattern expression + comment: | + Represents the right-hand side of the operator. The type of the node depends on the expression. + + Anything that looks like a local variable name (including `_`) will result in a `LocalVariableTargetNode`. + + foo => a # This is equivalent to writing `a = foo` + ^ + + Using an explicit `Array` or combining expressions with `,` will result in an `ArrayPatternNode`. This can be preceded by a constant. + + foo => [a] + ^^^ + + foo => a, b + ^^^^ + + foo => Bar[a, b] + ^^^^^^^^^ + + If the array pattern contains at least two wildcard matches, a `FindPatternNode` is created instead. 
+ + foo => *, 1, *a + ^^^^^ + + Using an explicit `Hash` or a constant with square brackets and hash keys in the square brackets will result in a `HashPatternNode`. + + foo => { a: 1, b: } + + foo => Bar[a: 1, b:] + + foo => Bar[**] + + To use any variable that needs run time evaluation, pinning is required. This results in a `PinnedVariableNode`. + + foo => ^a + ^^ + + Similarly, any expression can be used with pinning. This results in a `PinnedExpressionNode`. + + foo => ^(a + 1) + + Anything else will result in the regular node for that expression, for example a `ConstantReadNode`. + + foo => CONST + - name: operator_loc + type: location + comment: | + The location of the operator. + + foo => bar + ^^ + comment: | + Represents the use of the `=>` operator. + + foo => bar + ^^^^^^^^^^ + - name: MatchWriteNode + fields: + - name: call + type: node + kind: CallNode + - name: targets + type: node[] + kind: LocalVariableTargetNode + comment: | + Represents writing local variables using a regular expression match with named capture groups. + + /(?<foo>bar)/ =~ baz + ^^^^^^^^^^^^^^^^^^^^ + - name: MissingNode + comment: | + Represents a node that is missing from the source and results in a syntax error. + - name: ModuleNode + fields: + - name: locals + type: constant[] + - name: module_keyword_loc + type: location + - name: constant_path + type: node + kind: + - ConstantReadNode + - ConstantPathNode + - on error: MissingNode # module Parent module end + - name: body + type: node? + kind: + - StatementsNode + - BeginNode + - name: end_keyword_loc + type: location + - name: name + type: constant + comment: | + Represents a module declaration involving the `module` keyword. 
+ + module Foo end + ^^^^^^^^^^^^^^ + - name: MultiTargetNode + fields: + - name: lefts + type: node[] + kind: + - LocalVariableTargetNode + - InstanceVariableTargetNode + - ClassVariableTargetNode + - GlobalVariableTargetNode + - ConstantTargetNode + - ConstantPathTargetNode + - CallTargetNode + - IndexTargetNode + - MultiTargetNode + - RequiredParameterNode # def m((a,b)); end + - on error: BackReferenceReadNode # a, (b, $&) = z + - on error: NumberedReferenceReadNode # a, (b, $1) = z + comment: | + Represents the target expressions before a splat node. + + a, (b, c, *) = 1, 2, 3, 4, 5 + ^^^^ + + The splat node can be absent, in that case all target expressions are in the left field. + + a, (b, c) = 1, 2, 3, 4, 5 + ^^^^ + - name: rest + type: node? + kind: + - ImplicitRestNode + - SplatNode + comment: | + Represents a splat node in the target expression. + + a, (b, *c) = 1, 2, 3, 4 + ^^ + + The variable can be empty, this results in a `SplatNode` with a `nil` expression field. + + a, (b, *) = 1, 2, 3, 4 + ^ + + If the `*` is omitted, this field will contain an `ImplicitRestNode` + + a, (b,) = 1, 2, 3, 4 + ^ + - name: rights + type: node[] + kind: + - LocalVariableTargetNode + - InstanceVariableTargetNode + - ClassVariableTargetNode + - GlobalVariableTargetNode + - ConstantTargetNode + - ConstantPathTargetNode + - CallTargetNode + - IndexTargetNode + - MultiTargetNode + - RequiredParameterNode # def m((*,b)); end + - on error: BackReferenceReadNode # a, (*, $&) = z + - on error: NumberedReferenceReadNode # a, (*, $1) = z + comment: | + Represents the target expressions after a splat node. + + a, (*, b, c) = 1, 2, 3, 4, 5 + ^^^^ + - name: lparen_loc + type: location? + comment: | + The location of the opening parenthesis. + + a, (b, c) = 1, 2, 3 + ^ + - name: rparen_loc + type: location? + comment: | + The location of the closing parenthesis. + + a, (b, c) = 1, 2, 3 + ^ + comment: | + Represents a multi-target expression. 
+ + a, (b, c) = 1, 2, 3 + ^^^^^^ + + This can be a part of `MultiWriteNode` as above, or the target of a `for` loop + + for a, b in [[1, 2], [3, 4]] + ^^^^ + - name: MultiWriteNode + fields: + - name: lefts + type: node[] + kind: + - LocalVariableTargetNode + - InstanceVariableTargetNode + - ClassVariableTargetNode + - GlobalVariableTargetNode + - ConstantTargetNode + - ConstantPathTargetNode + - CallTargetNode + - IndexTargetNode + - MultiTargetNode + - on error: BackReferenceReadNode # $&, = z + - on error: NumberedReferenceReadNode # $1, = z + comment: | + Represents the target expressions before a splat node. + + a, b, * = 1, 2, 3, 4, 5 + ^^^^ + + The splat node can be absent, in that case all target expressions are in the left field. + + a, b, c = 1, 2, 3, 4, 5 + ^^^^^^^ + - name: rest + type: node? + kind: + - ImplicitRestNode + - SplatNode + comment: | + Represents a splat node in the target expression. + + a, b, *c = 1, 2, 3, 4 + ^^ + + The variable can be empty, this results in a `SplatNode` with a `nil` expression field. + + a, b, * = 1, 2, 3, 4 + ^ + + If the `*` is omitted, this field will contain an `ImplicitRestNode` + + a, b, = 1, 2, 3, 4 + ^ + - name: rights + type: node[] + kind: + - LocalVariableTargetNode + - InstanceVariableTargetNode + - ClassVariableTargetNode + - GlobalVariableTargetNode + - ConstantTargetNode + - ConstantPathTargetNode + - CallTargetNode + - IndexTargetNode + - MultiTargetNode + - on error: BackReferenceReadNode # *, $& = z + - on error: NumberedReferenceReadNode # *, $1 = z + comment: | + Represents the target expressions after a splat node. + + a, *, b, c = 1, 2, 3, 4, 5 + ^^^^ + - name: lparen_loc + type: location? + comment: | + The location of the opening parenthesis. + + (a, b, c) = 1, 2, 3 + ^ + - name: rparen_loc + type: location? + comment: | + The location of the closing parenthesis. + + (a, b, c) = 1, 2, 3 + ^ + - name: operator_loc + type: location + comment: | + The location of the operator. 
+ + a, b, c = 1, 2, 3 + ^ + - name: value + type: node + kind: non-void expression + comment: | + The value to write to the targets. It can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + a, b, c = 1, 2, 3 + ^^^^^^^ + comment: | + Represents a write to a multi-target expression. + + a, b, c = 1, 2, 3 + ^^^^^^^^^^^^^^^^^ + - name: NextNode + fields: + - name: arguments + type: node? + kind: ArgumentsNode + - name: keyword_loc + type: location + comment: | + Represents the use of the `next` keyword. + + next 1 + ^^^^^^ + - name: NilNode + comment: | + Represents the use of the `nil` keyword. + + nil + ^^^ + - name: NoKeywordsParameterNode + fields: + - name: operator_loc + type: location + - name: keyword_loc + type: location + comment: | + Represents the use of `**nil` inside method arguments. + + def a(**nil) + ^^^^^ + end + - name: NumberedParametersNode + fields: + - name: maximum + type: uint8 + comment: | + Represents an implicit set of parameters through the use of numbered parameters within a block or lambda. + + -> { _1 + _2 } + ^^^^^^^^^^^^^^ + - name: NumberedReferenceReadNode + fields: + - name: number + type: uint32 + comment: | + The (1-indexed, from the left) number of the capture group. Numbered references that are too large result in this value being `0`. + + $1 # number `1` + + $5432 # number `5432` + + $4294967296 # number `0` + comment: | + Represents reading a numbered reference to a capture in the previous match. + + $1 + ^^ + - name: OptionalKeywordParameterNode + flags: ParameterFlags + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents an optional keyword parameter to a method, block, or lambda definition. 
+ + def a(b: 1) + ^^^^ + end + - name: OptionalParameterNode + flags: ParameterFlags + fields: + - name: name + type: constant + - name: name_loc + type: location + - name: operator_loc + type: location + - name: value + type: node + kind: non-void expression + comment: | + Represents an optional parameter to a method, block, or lambda definition. + + def a(b = 1) + ^^^^^ + end + - name: OrNode + fields: + - name: left + type: node + kind: non-void expression + comment: | + Represents the left side of the expression. It can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + left or right + ^^^^ + + 1 || 2 + ^ + - name: right + type: node + kind: Node + comment: | + Represents the right side of the expression. + + left || right + ^^^^^ + + 1 or 2 + ^ + - name: operator_loc + type: location + comment: | + The location of the `or` keyword or the `||` operator. + + left or right + ^^ + comment: | + Represents the use of the `||` operator or the `or` keyword. + + left or right + ^^^^^^^^^^^^^ + - name: ParametersNode + fields: + - name: requireds + type: node[] + kind: + - RequiredParameterNode + - MultiTargetNode + - name: optionals + type: node[] + kind: OptionalParameterNode + - name: rest + type: node? + kind: + - RestParameterNode + - ImplicitRestNode # Only in block parameters + - name: posts + type: node[] + kind: + - RequiredParameterNode + - MultiTargetNode + # On parsing error of `f(**kwargs, ...)` or `f(**nil, ...)`, the keyword_rest value is moved here: + - on error: KeywordRestParameterNode + - on error: NoKeywordsParameterNode + # On parsing error of `f(..., ...)`, the first forwarding parameter is moved here: + - on error: ForwardingParameterNode + - name: keywords + type: node[] + kind: + - RequiredKeywordParameterNode + - OptionalKeywordParameterNode + - name: keyword_rest + type: node? 
+ kind: + - KeywordRestParameterNode + - ForwardingParameterNode + - NoKeywordsParameterNode + - name: block + type: node? + kind: BlockParameterNode + comment: | + Represents the list of parameters on a method, block, or lambda definition. + + def a(b, c, d) + ^^^^^^^ + end + - name: ParenthesesNode + flags: ParenthesesNodeFlags + fields: + - name: body + type: node? + kind: non-void expression # Usually a StatementsNode but not always e.g. `1 in (..10)` + - name: opening_loc + type: location + - name: closing_loc + type: location + newline: false + comment: | + Represents a parenthesized expression + + (10 + 34) + ^^^^^^^^^ + - name: PinnedExpressionNode + fields: + - name: expression + type: node + kind: non-void expression + comment: | + The expression used in the pinned expression + + foo in ^(bar) + ^^^ + - name: operator_loc + type: location + comment: | + The location of the `^` operator + + foo in ^(bar) + ^ + - name: lparen_loc + type: location + comment: | + The location of the opening parenthesis. + + foo in ^(bar) + ^ + - name: rparen_loc + type: location + comment: | + The location of the closing parenthesis. + + foo in ^(bar) + ^ + comment: | + Represents the use of the `^` operator for pinning an expression in a pattern matching expression. + + foo in ^(bar) + ^^^^^^ + - name: PinnedVariableNode + fields: + - name: variable + type: node + kind: + - LocalVariableReadNode + - InstanceVariableReadNode + - ClassVariableReadNode + - GlobalVariableReadNode # foo in ^$a + - BackReferenceReadNode # foo in ^$& + - NumberedReferenceReadNode # foo in ^$1 + - ItLocalVariableReadNode # proc { 1 in ^it } + - on error: MissingNode # foo in ^Bar + comment: | + The variable used in the pinned expression + + foo in ^bar + ^^^ + - name: operator_loc + type: location + comment: | + The location of the `^` operator + + foo in ^bar + ^ + comment: | + Represents the use of the `^` operator for pinning a variable in a pattern matching expression. 
+ + foo in ^bar + ^^^^ + - name: PostExecutionNode + fields: + - name: statements + type: node? + kind: StatementsNode + - name: keyword_loc + type: location + - name: opening_loc + type: location + - name: closing_loc + type: location + comment: | + Represents the use of the `END` keyword. + + END { foo } + ^^^^^^^^^^^ + - name: PreExecutionNode + fields: + - name: statements + type: node? + kind: StatementsNode + - name: keyword_loc + type: location + - name: opening_loc + type: location + - name: closing_loc + type: location + comment: | + Represents the use of the `BEGIN` keyword. + + BEGIN { foo } + ^^^^^^^^^^^^^ + - name: ProgramNode + fields: + - name: locals + type: constant[] + - name: statements + type: node + kind: StatementsNode + comment: The top level node of any parse tree. + - name: RangeNode + flags: RangeFlags + fields: + - name: left + type: node? + kind: non-void expression + comment: | + The left-hand side of the range, if present. It can be either `nil` or any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + 1... + ^ + + hello...goodbye + ^^^^^ + - name: right + type: node? + kind: non-void expression + comment: | + The right-hand side of the range, if present. It can be either `nil` or any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + ..5 + ^ + + 1...foo + ^^^ + If neither the right-hand nor the left-hand side was included, this will be a MissingNode. + - name: operator_loc + type: location + comment: | + The location of the `..` or `...` operator. + comment: | + Represents the use of the `..` or `...` operators. + + 1..2 + ^^^^ + + c if a =~ /left/ ... b =~ /right/ + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + - name: RationalNode + flags: IntegerBaseFlags + fields: + - name: numerator + type: integer + comment: | + The numerator of the rational number. 
+ + 1.5r # numerator 3 + - name: denominator + type: integer + comment: | + The denominator of the rational number. + + 1.5r # denominator 2 + comment: | + Represents a rational number literal. + + 1.0r + ^^^^ + - name: RedoNode + comment: | + Represents the use of the `redo` keyword. + + redo + ^^^^ + - name: RegularExpressionNode + flags: RegularExpressionFlags + fields: + - name: opening_loc + type: location + - name: content_loc + type: location + - name: closing_loc + type: location + - name: unescaped + type: string + comment: | + Represents a regular expression literal with no interpolation. + + /foo/i + ^^^^^^ + - name: RequiredKeywordParameterNode + flags: ParameterFlags + fields: + - name: name + type: constant + - name: name_loc + type: location + comment: | + Represents a required keyword parameter to a method, block, or lambda definition. + + def a(b: ) + ^^ + end + - name: RequiredParameterNode + flags: ParameterFlags + fields: + - name: name + type: constant + comment: | + Represents a required parameter to a method, block, or lambda definition. + + def a(b) + ^ + end + - name: RescueModifierNode + fields: + - name: expression + type: node + kind: Node + - name: keyword_loc + type: location + - name: rescue_expression + type: node + kind: Node + newline: expression + comment: | + Represents an expression modified with a rescue. + + foo rescue nil + ^^^^^^^^^^^^^^ + - name: RescueNode + fields: + - name: keyword_loc + type: location + - name: exceptions + type: node[] + kind: non-void expression + - name: operator_loc + type: location? + - name: reference + type: node? 
+ kind: + - LocalVariableTargetNode + - InstanceVariableTargetNode + - ClassVariableTargetNode + - GlobalVariableTargetNode + - ConstantTargetNode + - ConstantPathTargetNode + - CallTargetNode + - IndexTargetNode + - on error: BackReferenceReadNode # => begin; rescue => $&; end + - on error: NumberedReferenceReadNode # => begin; rescue => $1; end + - on error: MissingNode # begin; rescue =>; end + - name: then_keyword_loc + type: location? + - name: statements + type: node? + kind: StatementsNode + - name: subsequent + type: node? + kind: RescueNode + comment: | + Represents a rescue statement. + + begin + rescue Foo, *splat, Bar => ex + foo + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + end + + `Foo, *splat, Bar` are in the `exceptions` field. `ex` is in the `reference` field. + - name: RestParameterNode + flags: ParameterFlags + fields: + - name: name + type: constant? + - name: name_loc + type: location? + - name: operator_loc + type: location + comment: | + Represents a rest parameter to a method, block, or lambda definition. + + def a(*b) + ^^ + end + - name: RetryNode + comment: | + Represents the use of the `retry` keyword. + + retry + ^^^^^ + - name: ReturnNode + fields: + - name: keyword_loc + type: location + - name: arguments + type: node? + kind: ArgumentsNode + comment: | + Represents the use of the `return` keyword. + + return 1 + ^^^^^^^^ + - name: SelfNode + comment: | + Represents the `self` keyword. + + self + ^^^^ + - name: ShareableConstantNode + flags: ShareableConstantNodeFlags + fields: + - name: write + type: node + kind: + - ConstantWriteNode + - ConstantAndWriteNode + - ConstantOrWriteNode + - ConstantOperatorWriteNode + - ConstantPathWriteNode + - ConstantPathAndWriteNode + - ConstantPathOrWriteNode + - ConstantPathOperatorWriteNode + comment: The constant write that should be modified with the shareability state. + comment: | + This node wraps a constant write to indicate that when the value is written, it should have its shareability state modified. 
+ + # shareable_constant_value: literal + C = { a: 1 } + ^^^^^^^^^^^^ + - name: SingletonClassNode + fields: + - name: locals + type: constant[] + - name: class_keyword_loc + type: location + - name: operator_loc + type: location + - name: expression + type: node + kind: non-void expression + - name: body + type: node? + kind: + - StatementsNode + - BeginNode + - name: end_keyword_loc + type: location + comment: | + Represents a singleton class declaration involving the `class` keyword. + + class << self end + ^^^^^^^^^^^^^^^^^ + - name: SourceEncodingNode + comment: | + Represents the use of the `__ENCODING__` keyword. + + __ENCODING__ + ^^^^^^^^^^^^ + - name: SourceFileNode + flags: StringFlags + fields: + - name: filepath + type: string + comment: Represents the file path being parsed. This corresponds directly to the `filepath` option given to the various `Prism::parse*` APIs. + comment: | + Represents the use of the `__FILE__` keyword. + + __FILE__ + ^^^^^^^^ + - name: SourceLineNode + comment: | + Represents the use of the `__LINE__` keyword. + + __LINE__ + ^^^^^^^^ + - name: SplatNode + fields: + - name: operator_loc + type: location + - name: expression + type: node? + kind: non-void expression + comment: | + Represents the use of the splat operator. + + [*a] + ^^ + - name: StatementsNode + fields: + - name: body + type: node[] + kind: Node + comment: | + Represents a set of statements contained within some scope. + + foo; bar; baz + ^^^^^^^^^^^^^ + - name: StringNode + flags: StringFlags + fields: + - name: opening_loc + type: location? + - name: content_loc + type: location + - name: closing_loc + type: location? + - name: unescaped + type: string + comment: | + Represents a string literal, a string contained within a `%w` list, or plain string content within an interpolated string. 
+ + "foo" + ^^^^^ + + %w[foo] + ^^^ + + "foo #{bar} baz" + ^^^^ ^^^^ + - name: SuperNode + fields: + - name: keyword_loc + type: location + - name: lparen_loc + type: location? + - name: arguments + type: node? + kind: ArgumentsNode + comment: "Can be only `nil` when there are empty parentheses, like `super()`." + - name: rparen_loc + type: location? + - name: block + type: node? + kind: + - BlockNode + - BlockArgumentNode + comment: | + Represents the use of the `super` keyword with parentheses or arguments. + + super() + ^^^^^^^ + + super foo, bar + ^^^^^^^^^^^^^^ + + If no arguments are provided (except for a block), it would be a `ForwardingSuperNode` instead. + - name: SymbolNode + flags: SymbolFlags + fields: + - name: opening_loc + type: location? + - name: value_loc + type: location? + - name: closing_loc + type: location? + - name: unescaped + type: string + comment: | + Represents a symbol literal or a symbol contained within a `%i` list. + + :foo + ^^^^ + + %i[foo] + ^^^ + - name: TrueNode + comment: | + Represents the use of the literal `true` keyword. + + true + ^^^^ + - name: UndefNode + fields: + - name: names + type: node[] + kind: + - SymbolNode + - InterpolatedSymbolNode + - name: keyword_loc + type: location + comment: | + Represents the use of the `undef` keyword. + + undef :foo, :bar, :baz + ^^^^^^^^^^^^^^^^^^^^^^ + - name: UnlessNode + fields: + - name: keyword_loc + type: location + comment: | + The location of the `unless` keyword. + + unless cond then bar end + ^^^^^^ + + bar unless cond + ^^^^^^ + - name: predicate + type: node + kind: non-void expression + comment: | + The condition to be evaluated for the unless expression. It can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). + + unless cond then bar end + ^^^^ + + bar unless cond + ^^^^ + - name: then_keyword_loc + type: location? + comment: | + The location of the `then` keyword, if present. 
+ + unless cond then bar end + ^^^^ + - name: statements + type: node? + kind: StatementsNode + comment: | + The body of statements that will be executed if the unless condition is + falsey. Will be `nil` if no body is provided. + + unless cond then bar end + ^^^ + - name: else_clause + type: node? + kind: ElseNode + comment: | + The else clause of the unless expression, if present. + + unless cond then bar else baz end + ^^^^^^^^ + - name: end_keyword_loc + type: location? + comment: | + The location of the `end` keyword, if present. + + unless cond then bar end + ^^^ + newline: predicate + comment: | + Represents the use of the `unless` keyword, either in the block form or the modifier form. + + bar unless foo + ^^^^^^^^^^^^^^ + + unless foo then bar end + ^^^^^^^^^^^^^^^^^^^^^^^ + - name: UntilNode + flags: LoopFlags + fields: + - name: keyword_loc + type: location + - name: do_keyword_loc + type: location? + - name: closing_loc + type: location? + - name: predicate + type: node + kind: non-void expression + - name: statements + type: node? + kind: StatementsNode + newline: predicate + comment: | + Represents the use of the `until` keyword, either in the block form or the modifier form. + + bar until foo + ^^^^^^^^^^^^^ + + until foo do bar end + ^^^^^^^^^^^^^^^^^^^^ + - name: WhenNode + fields: + - name: keyword_loc + type: location + - name: conditions + type: node[] + kind: non-void expression + - name: then_keyword_loc + type: location? + - name: statements + type: node? + kind: StatementsNode + comment: | + Represents the use of the `when` keyword within a case statement. + + case true + when true + ^^^^^^^^^ + end + - name: WhileNode + flags: LoopFlags + fields: + - name: keyword_loc + type: location + - name: do_keyword_loc + type: location? + - name: closing_loc + type: location? + - name: predicate + type: node + kind: non-void expression + - name: statements + type: node? 
+ kind: StatementsNode + newline: predicate + comment: | + Represents the use of the `while` keyword, either in the block form or the modifier form. + + bar while foo + ^^^^^^^^^^^^^ + + while foo do bar end + ^^^^^^^^^^^^^^^^^^^^ + - name: XStringNode + flags: EncodingFlags + fields: + - name: opening_loc + type: location + - name: content_loc + type: location + - name: closing_loc + type: location + - name: unescaped + type: string + comment: | + Represents an xstring literal with no interpolation. + + `foo` + ^^^^^ + - name: YieldNode + fields: + - name: keyword_loc + type: location + - name: lparen_loc + type: location? + - name: arguments + type: node? + kind: ArgumentsNode + - name: rparen_loc + type: location? + comment: | + Represents the use of the `yield` keyword. + + yield 1 + ^^^^^^^ diff --git a/prism/defines.h b/prism/defines.h new file mode 100644 index 0000000000..e31429c789 --- /dev/null +++ b/prism/defines.h @@ -0,0 +1,260 @@ +/** + * @file defines.h + * + * Macro definitions used throughout the prism library. + * + * This file should be included first by any *.h or *.c in prism for consistency + * and to ensure that the macros are defined before they are used. + */ +#ifndef PRISM_DEFINES_H +#define PRISM_DEFINES_H + +#include <ctype.h> +#include <limits.h> +#include <math.h> +#include <stdarg.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +/** + * We want to be able to use the PRI* macros for printing out integers, but on + * some platforms they aren't included unless this is already defined. + */ +#define __STDC_FORMAT_MACROS +// Include sys/types.h before inttypes.h to work around issue with +// certain versions of GCC and newlib which causes omission of PRIx64 +#include <sys/types.h> +#include <inttypes.h> + +/** + * When we are parsing using recursive descent, we want to protect against + * malicious payloads that could attempt to crash our parser. 
We do this by + * specifying a maximum depth to which we are allowed to recurse. + */ +#ifndef PRISM_DEPTH_MAXIMUM + #define PRISM_DEPTH_MAXIMUM 10000 +#endif + +/** + * By default, we compile with -fvisibility=hidden. When this is enabled, we + * need to mark certain functions as being publicly visible. This macro does + * that in a compiler-agnostic way. + */ +#ifndef PRISM_EXPORTED_FUNCTION +# ifdef PRISM_EXPORT_SYMBOLS +# ifdef _WIN32 +# define PRISM_EXPORTED_FUNCTION __declspec(dllexport) extern +# else +# define PRISM_EXPORTED_FUNCTION __attribute__((__visibility__("default"))) extern +# endif +# else +# define PRISM_EXPORTED_FUNCTION +# endif +#endif + +/** + * Certain compilers support specifying that a function accepts variadic + * parameters that look like printf format strings to provide a better developer + * experience when someone is using the function. This macro does that in a + * compiler-agnostic way. + */ +#if defined(__GNUC__) +# if defined(__MINGW_PRINTF_FORMAT) +# define PRISM_ATTRIBUTE_FORMAT(string_index, argument_index) __attribute__((format(__MINGW_PRINTF_FORMAT, string_index, argument_index))) +# else +# define PRISM_ATTRIBUTE_FORMAT(string_index, argument_index) __attribute__((format(printf, string_index, argument_index))) +# endif +#elif defined(__clang__) +# define PRISM_ATTRIBUTE_FORMAT(string_index, argument_index) __attribute__((__format__(__printf__, string_index, argument_index))) +#else +# define PRISM_ATTRIBUTE_FORMAT(string_index, argument_index) +#endif + +/** + * GCC will warn if you specify a function or parameter that is unused at + * runtime. This macro allows you to mark a function or parameter as unused in a + * compiler-agnostic way. + */ +#if defined(__GNUC__) +# define PRISM_ATTRIBUTE_UNUSED __attribute__((unused)) +#else +# define PRISM_ATTRIBUTE_UNUSED +#endif + +/** + * Old Visual Studio versions do not support the inline keyword, so we need to + * define it to be __inline. 
+ */ +#if defined(_MSC_VER) && !defined(inline) +# define inline __inline +#endif + +/** + * Old Visual Studio versions before 2015 do not implement snprintf, but instead + * implement _snprintf. We standardize that here. + */ +#if !defined(snprintf) && defined(_MSC_VER) && (_MSC_VER < 1900) +# define snprintf _snprintf +#endif + +/** + * A simple utility macro to concatenate two tokens together, necessary when one + * of the tokens is itself a macro. + */ +#define PM_CONCATENATE(left, right) left ## right + +/** + * We want to be able to use static assertions, but they weren't standardized + * until C11. As such, we polyfill it here by making a hacky typedef that will + * fail to compile due to a negative array size if the condition is false. + */ +#if defined(_Static_assert) +# define PM_STATIC_ASSERT(line, condition, message) _Static_assert(condition, message) +#else +# define PM_STATIC_ASSERT(line, condition, message) typedef char PM_CONCATENATE(static_assert_, line)[(condition) ? 1 : -1] +#endif + +/** + * In general, libc for embedded systems does not support memory-mapped files. + * If the target platform is POSIX or Windows, we can map a file in memory and + * read it in a more efficient manner. + */ +#ifdef _WIN32 +# define PRISM_HAS_MMAP +#else +# include <unistd.h> +# ifdef _POSIX_MAPPED_FILES +# define PRISM_HAS_MMAP +# endif +#endif + +/** + * If PRISM_HAS_NO_FILESYSTEM is defined, then we want to exclude all filesystem + * related code from the library. All filesystem related code should be guarded + * by PRISM_HAS_FILESYSTEM. + */ +#ifndef PRISM_HAS_NO_FILESYSTEM +# define PRISM_HAS_FILESYSTEM +#endif + +/** + * On POSIX systems, isinf accepts a float, a double, or a long double. + * But mingw didn't provide an isinf macro, only an isinf function that only + * accepts floats, so we need to use _finite instead. 
+ */ +#ifdef __MINGW64__ + #include <float.h> + #define PRISM_ISINF(x) (!_finite(x)) +#else + #define PRISM_ISINF(x) isinf(x) +#endif + +/** + * If you build prism with a custom allocator, configure it with + * "-D PRISM_XALLOCATOR" to use your own allocator that defines xmalloc, + * xrealloc, xcalloc, and xfree. + * + * For example, your `prism_xallocator.h` file could look like this: + * + * ``` + * #ifndef PRISM_XALLOCATOR_H + * #define PRISM_XALLOCATOR_H + * #define xmalloc my_malloc + * #define xrealloc my_realloc + * #define xcalloc my_calloc + * #define xfree my_free + * #endif + * ``` + */ +#ifdef PRISM_XALLOCATOR + #include "prism_xallocator.h" +#else + #ifndef xmalloc + /** + * The malloc function that should be used. This can be overridden with + * the PRISM_XALLOCATOR define. + */ + #define xmalloc malloc + #endif + + #ifndef xrealloc + /** + * The realloc function that should be used. This can be overridden with + * the PRISM_XALLOCATOR define. + */ + #define xrealloc realloc + #endif + + #ifndef xcalloc + /** + * The calloc function that should be used. This can be overridden with + * the PRISM_XALLOCATOR define. + */ + #define xcalloc calloc + #endif + + #ifndef xfree + /** + * The free function that should be used. This can be overridden with the + * PRISM_XALLOCATOR define. + */ + #define xfree free + #endif +#endif + +/** + * If PRISM_BUILD_MINIMAL is defined, then we're going to define every possible + * switch that will turn off certain features of prism. + */ +#ifdef PRISM_BUILD_MINIMAL + /** Exclude the serialization API. */ + #define PRISM_EXCLUDE_SERIALIZATION + + /** Exclude the JSON serialization API. */ + #define PRISM_EXCLUDE_JSON + + /** Exclude the Array#pack parser API. */ + #define PRISM_EXCLUDE_PACK + + /** Exclude the prettyprint API. */ + #define PRISM_EXCLUDE_PRETTYPRINT + + /** Exclude the full set of encodings, using the minimal only. 
*/ + #define PRISM_ENCODING_EXCLUDE_FULL +#endif + +/** + * Support PRISM_LIKELY and PRISM_UNLIKELY to help the compiler optimize its + * branch prediction. + */ +#if defined(__GNUC__) || defined(__clang__) + /** The compiler should predict that this branch will be taken. */ + #define PRISM_LIKELY(x) __builtin_expect(!!(x), 1) + + /** The compiler should predict that this branch will not be taken. */ + #define PRISM_UNLIKELY(x) __builtin_expect(!!(x), 0) +#else + /** Void because this platform does not support branch prediction hints. */ + #define PRISM_LIKELY(x) (x) + + /** Void because this platform does not support branch prediction hints. */ + #define PRISM_UNLIKELY(x) (x) +#endif + +/** + * We use -Wimplicit-fallthrough to guard potentially unintended fall-through between cases of a switch. + * Use PRISM_FALLTHROUGH to explicitly annotate cases where the fallthrough is intentional. + */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L // C23 or later + #define PRISM_FALLTHROUGH [[fallthrough]]; +#elif defined(__GNUC__) || defined(__clang__) + #define PRISM_FALLTHROUGH __attribute__((fallthrough)); +#elif defined(_MSC_VER) + #define PRISM_FALLTHROUGH __fallthrough; +#else + #define PRISM_FALLTHROUGH +#endif + +#endif diff --git a/prism/encoding.c b/prism/encoding.c new file mode 100644 index 0000000000..d7e5616840 --- /dev/null +++ b/prism/encoding.c @@ -0,0 +1,5340 @@ +#include "prism/encoding.h" + +typedef uint32_t pm_unicode_codepoint_t; + +#define UNICODE_ALPHA_CODEPOINTS_LENGTH 1508 +static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEPOINTS_LENGTH] = { + 0x100, 0x2C1, + 0x2C6, 0x2D1, + 0x2E0, 0x2E4, + 0x2EC, 0x2EC, + 0x2EE, 0x2EE, + 0x345, 0x345, + 0x363, 0x374, + 0x376, 0x377, + 0x37A, 0x37D, + 0x37F, 0x37F, + 0x386, 0x386, + 0x388, 0x38A, + 0x38C, 0x38C, + 0x38E, 0x3A1, + 0x3A3, 0x3F5, + 0x3F7, 0x481, + 0x48A, 0x52F, + 0x531, 0x556, + 0x559, 0x559, + 0x560, 0x588, + 0x5B0, 0x5BD, + 0x5BF, 0x5BF, + 0x5C1, 
0x5C2, + 0x5C4, 0x5C5, + 0x5C7, 0x5C7, + 0x5D0, 0x5EA, + 0x5EF, 0x5F2, + 0x610, 0x61A, + 0x620, 0x657, + 0x659, 0x65F, + 0x66E, 0x6D3, + 0x6D5, 0x6DC, + 0x6E1, 0x6E8, + 0x6ED, 0x6EF, + 0x6FA, 0x6FC, + 0x6FF, 0x6FF, + 0x710, 0x73F, + 0x74D, 0x7B1, + 0x7CA, 0x7EA, + 0x7F4, 0x7F5, + 0x7FA, 0x7FA, + 0x800, 0x817, + 0x81A, 0x82C, + 0x840, 0x858, + 0x860, 0x86A, + 0x870, 0x887, + 0x889, 0x88F, + 0x897, 0x897, + 0x8A0, 0x8C9, + 0x8D4, 0x8DF, + 0x8E3, 0x8E9, + 0x8F0, 0x93B, + 0x93D, 0x94C, + 0x94E, 0x950, + 0x955, 0x963, + 0x971, 0x983, + 0x985, 0x98C, + 0x98F, 0x990, + 0x993, 0x9A8, + 0x9AA, 0x9B0, + 0x9B2, 0x9B2, + 0x9B6, 0x9B9, + 0x9BD, 0x9C4, + 0x9C7, 0x9C8, + 0x9CB, 0x9CC, + 0x9CE, 0x9CE, + 0x9D7, 0x9D7, + 0x9DC, 0x9DD, + 0x9DF, 0x9E3, + 0x9F0, 0x9F1, + 0x9FC, 0x9FC, + 0xA01, 0xA03, + 0xA05, 0xA0A, + 0xA0F, 0xA10, + 0xA13, 0xA28, + 0xA2A, 0xA30, + 0xA32, 0xA33, + 0xA35, 0xA36, + 0xA38, 0xA39, + 0xA3E, 0xA42, + 0xA47, 0xA48, + 0xA4B, 0xA4C, + 0xA51, 0xA51, + 0xA59, 0xA5C, + 0xA5E, 0xA5E, + 0xA70, 0xA75, + 0xA81, 0xA83, + 0xA85, 0xA8D, + 0xA8F, 0xA91, + 0xA93, 0xAA8, + 0xAAA, 0xAB0, + 0xAB2, 0xAB3, + 0xAB5, 0xAB9, + 0xABD, 0xAC5, + 0xAC7, 0xAC9, + 0xACB, 0xACC, + 0xAD0, 0xAD0, + 0xAE0, 0xAE3, + 0xAF9, 0xAFC, + 0xB01, 0xB03, + 0xB05, 0xB0C, + 0xB0F, 0xB10, + 0xB13, 0xB28, + 0xB2A, 0xB30, + 0xB32, 0xB33, + 0xB35, 0xB39, + 0xB3D, 0xB44, + 0xB47, 0xB48, + 0xB4B, 0xB4C, + 0xB56, 0xB57, + 0xB5C, 0xB5D, + 0xB5F, 0xB63, + 0xB71, 0xB71, + 0xB82, 0xB83, + 0xB85, 0xB8A, + 0xB8E, 0xB90, + 0xB92, 0xB95, + 0xB99, 0xB9A, + 0xB9C, 0xB9C, + 0xB9E, 0xB9F, + 0xBA3, 0xBA4, + 0xBA8, 0xBAA, + 0xBAE, 0xBB9, + 0xBBE, 0xBC2, + 0xBC6, 0xBC8, + 0xBCA, 0xBCC, + 0xBD0, 0xBD0, + 0xBD7, 0xBD7, + 0xC00, 0xC0C, + 0xC0E, 0xC10, + 0xC12, 0xC28, + 0xC2A, 0xC39, + 0xC3D, 0xC44, + 0xC46, 0xC48, + 0xC4A, 0xC4C, + 0xC55, 0xC56, + 0xC58, 0xC5A, + 0xC5C, 0xC5D, + 0xC60, 0xC63, + 0xC80, 0xC83, + 0xC85, 0xC8C, + 0xC8E, 0xC90, + 0xC92, 0xCA8, + 0xCAA, 0xCB3, + 0xCB5, 0xCB9, + 0xCBD, 0xCC4, + 0xCC6, 0xCC8, + 0xCCA, 
0xCCC, + 0xCD5, 0xCD6, + 0xCDC, 0xCDE, + 0xCE0, 0xCE3, + 0xCF1, 0xCF3, + 0xD00, 0xD0C, + 0xD0E, 0xD10, + 0xD12, 0xD3A, + 0xD3D, 0xD44, + 0xD46, 0xD48, + 0xD4A, 0xD4C, + 0xD4E, 0xD4E, + 0xD54, 0xD57, + 0xD5F, 0xD63, + 0xD7A, 0xD7F, + 0xD81, 0xD83, + 0xD85, 0xD96, + 0xD9A, 0xDB1, + 0xDB3, 0xDBB, + 0xDBD, 0xDBD, + 0xDC0, 0xDC6, + 0xDCF, 0xDD4, + 0xDD6, 0xDD6, + 0xDD8, 0xDDF, + 0xDF2, 0xDF3, + 0xE01, 0xE3A, + 0xE40, 0xE46, + 0xE4D, 0xE4D, + 0xE81, 0xE82, + 0xE84, 0xE84, + 0xE86, 0xE8A, + 0xE8C, 0xEA3, + 0xEA5, 0xEA5, + 0xEA7, 0xEB9, + 0xEBB, 0xEBD, + 0xEC0, 0xEC4, + 0xEC6, 0xEC6, + 0xECD, 0xECD, + 0xEDC, 0xEDF, + 0xF00, 0xF00, + 0xF40, 0xF47, + 0xF49, 0xF6C, + 0xF71, 0xF83, + 0xF88, 0xF97, + 0xF99, 0xFBC, + 0x1000, 0x1036, + 0x1038, 0x1038, + 0x103B, 0x103F, + 0x1050, 0x108F, + 0x109A, 0x109D, + 0x10A0, 0x10C5, + 0x10C7, 0x10C7, + 0x10CD, 0x10CD, + 0x10D0, 0x10FA, + 0x10FC, 0x1248, + 0x124A, 0x124D, + 0x1250, 0x1256, + 0x1258, 0x1258, + 0x125A, 0x125D, + 0x1260, 0x1288, + 0x128A, 0x128D, + 0x1290, 0x12B0, + 0x12B2, 0x12B5, + 0x12B8, 0x12BE, + 0x12C0, 0x12C0, + 0x12C2, 0x12C5, + 0x12C8, 0x12D6, + 0x12D8, 0x1310, + 0x1312, 0x1315, + 0x1318, 0x135A, + 0x1380, 0x138F, + 0x13A0, 0x13F5, + 0x13F8, 0x13FD, + 0x1401, 0x166C, + 0x166F, 0x167F, + 0x1681, 0x169A, + 0x16A0, 0x16EA, + 0x16EE, 0x16F8, + 0x1700, 0x1713, + 0x171F, 0x1733, + 0x1740, 0x1753, + 0x1760, 0x176C, + 0x176E, 0x1770, + 0x1772, 0x1773, + 0x1780, 0x17B3, + 0x17B6, 0x17C8, + 0x17D7, 0x17D7, + 0x17DC, 0x17DC, + 0x1820, 0x1878, + 0x1880, 0x18AA, + 0x18B0, 0x18F5, + 0x1900, 0x191E, + 0x1920, 0x192B, + 0x1930, 0x1938, + 0x1950, 0x196D, + 0x1970, 0x1974, + 0x1980, 0x19AB, + 0x19B0, 0x19C9, + 0x1A00, 0x1A1B, + 0x1A20, 0x1A5E, + 0x1A61, 0x1A74, + 0x1AA7, 0x1AA7, + 0x1ABF, 0x1AC0, + 0x1ACC, 0x1ACE, + 0x1B00, 0x1B33, + 0x1B35, 0x1B43, + 0x1B45, 0x1B4C, + 0x1B80, 0x1BA9, + 0x1BAC, 0x1BAF, + 0x1BBA, 0x1BE5, + 0x1BE7, 0x1BF1, + 0x1C00, 0x1C36, + 0x1C4D, 0x1C4F, + 0x1C5A, 0x1C7D, + 0x1C80, 0x1C8A, + 0x1C90, 0x1CBA, + 0x1CBD, 
0x1CBF, + 0x1CE9, 0x1CEC, + 0x1CEE, 0x1CF3, + 0x1CF5, 0x1CF6, + 0x1CFA, 0x1CFA, + 0x1D00, 0x1DBF, + 0x1DD3, 0x1DF4, + 0x1E00, 0x1F15, + 0x1F18, 0x1F1D, + 0x1F20, 0x1F45, + 0x1F48, 0x1F4D, + 0x1F50, 0x1F57, + 0x1F59, 0x1F59, + 0x1F5B, 0x1F5B, + 0x1F5D, 0x1F5D, + 0x1F5F, 0x1F7D, + 0x1F80, 0x1FB4, + 0x1FB6, 0x1FBC, + 0x1FBE, 0x1FBE, + 0x1FC2, 0x1FC4, + 0x1FC6, 0x1FCC, + 0x1FD0, 0x1FD3, + 0x1FD6, 0x1FDB, + 0x1FE0, 0x1FEC, + 0x1FF2, 0x1FF4, + 0x1FF6, 0x1FFC, + 0x2071, 0x2071, + 0x207F, 0x207F, + 0x2090, 0x209C, + 0x2102, 0x2102, + 0x2107, 0x2107, + 0x210A, 0x2113, + 0x2115, 0x2115, + 0x2119, 0x211D, + 0x2124, 0x2124, + 0x2126, 0x2126, + 0x2128, 0x2128, + 0x212A, 0x212D, + 0x212F, 0x2139, + 0x213C, 0x213F, + 0x2145, 0x2149, + 0x214E, 0x214E, + 0x2160, 0x2188, + 0x24B6, 0x24E9, + 0x2C00, 0x2CE4, + 0x2CEB, 0x2CEE, + 0x2CF2, 0x2CF3, + 0x2D00, 0x2D25, + 0x2D27, 0x2D27, + 0x2D2D, 0x2D2D, + 0x2D30, 0x2D67, + 0x2D6F, 0x2D6F, + 0x2D80, 0x2D96, + 0x2DA0, 0x2DA6, + 0x2DA8, 0x2DAE, + 0x2DB0, 0x2DB6, + 0x2DB8, 0x2DBE, + 0x2DC0, 0x2DC6, + 0x2DC8, 0x2DCE, + 0x2DD0, 0x2DD6, + 0x2DD8, 0x2DDE, + 0x2DE0, 0x2DFF, + 0x2E2F, 0x2E2F, + 0x3005, 0x3007, + 0x3021, 0x3029, + 0x3031, 0x3035, + 0x3038, 0x303C, + 0x3041, 0x3096, + 0x309D, 0x309F, + 0x30A1, 0x30FA, + 0x30FC, 0x30FF, + 0x3105, 0x312F, + 0x3131, 0x318E, + 0x31A0, 0x31BF, + 0x31F0, 0x31FF, + 0x3400, 0x4DBF, + 0x4E00, 0xA48C, + 0xA4D0, 0xA4FD, + 0xA500, 0xA60C, + 0xA610, 0xA61F, + 0xA62A, 0xA62B, + 0xA640, 0xA66E, + 0xA674, 0xA67B, + 0xA67F, 0xA6EF, + 0xA717, 0xA71F, + 0xA722, 0xA788, + 0xA78B, 0xA7DC, + 0xA7F1, 0xA805, + 0xA807, 0xA827, + 0xA840, 0xA873, + 0xA880, 0xA8C3, + 0xA8C5, 0xA8C5, + 0xA8F2, 0xA8F7, + 0xA8FB, 0xA8FB, + 0xA8FD, 0xA8FF, + 0xA90A, 0xA92A, + 0xA930, 0xA952, + 0xA960, 0xA97C, + 0xA980, 0xA9B2, + 0xA9B4, 0xA9BF, + 0xA9CF, 0xA9CF, + 0xA9E0, 0xA9EF, + 0xA9FA, 0xA9FE, + 0xAA00, 0xAA36, + 0xAA40, 0xAA4D, + 0xAA60, 0xAA76, + 0xAA7A, 0xAABE, + 0xAAC0, 0xAAC0, + 0xAAC2, 0xAAC2, + 0xAADB, 0xAADD, + 0xAAE0, 0xAAEF, + 0xAAF2, 
0xAAF5, + 0xAB01, 0xAB06, + 0xAB09, 0xAB0E, + 0xAB11, 0xAB16, + 0xAB20, 0xAB26, + 0xAB28, 0xAB2E, + 0xAB30, 0xAB5A, + 0xAB5C, 0xAB69, + 0xAB70, 0xABEA, + 0xAC00, 0xD7A3, + 0xD7B0, 0xD7C6, + 0xD7CB, 0xD7FB, + 0xF900, 0xFA6D, + 0xFA70, 0xFAD9, + 0xFB00, 0xFB06, + 0xFB13, 0xFB17, + 0xFB1D, 0xFB28, + 0xFB2A, 0xFB36, + 0xFB38, 0xFB3C, + 0xFB3E, 0xFB3E, + 0xFB40, 0xFB41, + 0xFB43, 0xFB44, + 0xFB46, 0xFBB1, + 0xFBD3, 0xFD3D, + 0xFD50, 0xFD8F, + 0xFD92, 0xFDC7, + 0xFDF0, 0xFDFB, + 0xFE70, 0xFE74, + 0xFE76, 0xFEFC, + 0xFF21, 0xFF3A, + 0xFF41, 0xFF5A, + 0xFF66, 0xFFBE, + 0xFFC2, 0xFFC7, + 0xFFCA, 0xFFCF, + 0xFFD2, 0xFFD7, + 0xFFDA, 0xFFDC, + 0x10000, 0x1000B, + 0x1000D, 0x10026, + 0x10028, 0x1003A, + 0x1003C, 0x1003D, + 0x1003F, 0x1004D, + 0x10050, 0x1005D, + 0x10080, 0x100FA, + 0x10140, 0x10174, + 0x10280, 0x1029C, + 0x102A0, 0x102D0, + 0x10300, 0x1031F, + 0x1032D, 0x1034A, + 0x10350, 0x1037A, + 0x10380, 0x1039D, + 0x103A0, 0x103C3, + 0x103C8, 0x103CF, + 0x103D1, 0x103D5, + 0x10400, 0x1049D, + 0x104B0, 0x104D3, + 0x104D8, 0x104FB, + 0x10500, 0x10527, + 0x10530, 0x10563, + 0x10570, 0x1057A, + 0x1057C, 0x1058A, + 0x1058C, 0x10592, + 0x10594, 0x10595, + 0x10597, 0x105A1, + 0x105A3, 0x105B1, + 0x105B3, 0x105B9, + 0x105BB, 0x105BC, + 0x105C0, 0x105F3, + 0x10600, 0x10736, + 0x10740, 0x10755, + 0x10760, 0x10767, + 0x10780, 0x10785, + 0x10787, 0x107B0, + 0x107B2, 0x107BA, + 0x10800, 0x10805, + 0x10808, 0x10808, + 0x1080A, 0x10835, + 0x10837, 0x10838, + 0x1083C, 0x1083C, + 0x1083F, 0x10855, + 0x10860, 0x10876, + 0x10880, 0x1089E, + 0x108E0, 0x108F2, + 0x108F4, 0x108F5, + 0x10900, 0x10915, + 0x10920, 0x10939, + 0x10940, 0x10959, + 0x10980, 0x109B7, + 0x109BE, 0x109BF, + 0x10A00, 0x10A03, + 0x10A05, 0x10A06, + 0x10A0C, 0x10A13, + 0x10A15, 0x10A17, + 0x10A19, 0x10A35, + 0x10A60, 0x10A7C, + 0x10A80, 0x10A9C, + 0x10AC0, 0x10AC7, + 0x10AC9, 0x10AE4, + 0x10B00, 0x10B35, + 0x10B40, 0x10B55, + 0x10B60, 0x10B72, + 0x10B80, 0x10B91, + 0x10C00, 0x10C48, + 0x10C80, 0x10CB2, + 0x10CC0, 0x10CF2, + 
0x10D00, 0x10D27, + 0x10D4A, 0x10D65, + 0x10D69, 0x10D69, + 0x10D6F, 0x10D85, + 0x10E80, 0x10EA9, + 0x10EAB, 0x10EAC, + 0x10EB0, 0x10EB1, + 0x10EC2, 0x10EC7, + 0x10EFA, 0x10EFC, + 0x10F00, 0x10F1C, + 0x10F27, 0x10F27, + 0x10F30, 0x10F45, + 0x10F70, 0x10F81, + 0x10FB0, 0x10FC4, + 0x10FE0, 0x10FF6, + 0x11000, 0x11045, + 0x11071, 0x11075, + 0x11080, 0x110B8, + 0x110C2, 0x110C2, + 0x110D0, 0x110E8, + 0x11100, 0x11132, + 0x11144, 0x11147, + 0x11150, 0x11172, + 0x11176, 0x11176, + 0x11180, 0x111BF, + 0x111C1, 0x111C4, + 0x111CE, 0x111CF, + 0x111DA, 0x111DA, + 0x111DC, 0x111DC, + 0x11200, 0x11211, + 0x11213, 0x11234, + 0x11237, 0x11237, + 0x1123E, 0x11241, + 0x11280, 0x11286, + 0x11288, 0x11288, + 0x1128A, 0x1128D, + 0x1128F, 0x1129D, + 0x1129F, 0x112A8, + 0x112B0, 0x112E8, + 0x11300, 0x11303, + 0x11305, 0x1130C, + 0x1130F, 0x11310, + 0x11313, 0x11328, + 0x1132A, 0x11330, + 0x11332, 0x11333, + 0x11335, 0x11339, + 0x1133D, 0x11344, + 0x11347, 0x11348, + 0x1134B, 0x1134C, + 0x11350, 0x11350, + 0x11357, 0x11357, + 0x1135D, 0x11363, + 0x11380, 0x11389, + 0x1138B, 0x1138B, + 0x1138E, 0x1138E, + 0x11390, 0x113B5, + 0x113B7, 0x113C0, + 0x113C2, 0x113C2, + 0x113C5, 0x113C5, + 0x113C7, 0x113CA, + 0x113CC, 0x113CD, + 0x113D1, 0x113D1, + 0x113D3, 0x113D3, + 0x11400, 0x11441, + 0x11443, 0x11445, + 0x11447, 0x1144A, + 0x1145F, 0x11461, + 0x11480, 0x114C1, + 0x114C4, 0x114C5, + 0x114C7, 0x114C7, + 0x11580, 0x115B5, + 0x115B8, 0x115BE, + 0x115D8, 0x115DD, + 0x11600, 0x1163E, + 0x11640, 0x11640, + 0x11644, 0x11644, + 0x11680, 0x116B5, + 0x116B8, 0x116B8, + 0x11700, 0x1171A, + 0x1171D, 0x1172A, + 0x11740, 0x11746, + 0x11800, 0x11838, + 0x118A0, 0x118DF, + 0x118FF, 0x11906, + 0x11909, 0x11909, + 0x1190C, 0x11913, + 0x11915, 0x11916, + 0x11918, 0x11935, + 0x11937, 0x11938, + 0x1193B, 0x1193C, + 0x1193F, 0x11942, + 0x119A0, 0x119A7, + 0x119AA, 0x119D7, + 0x119DA, 0x119DF, + 0x119E1, 0x119E1, + 0x119E3, 0x119E4, + 0x11A00, 0x11A32, + 0x11A35, 0x11A3E, + 0x11A50, 0x11A97, + 0x11A9D, 0x11A9D, + 
0x11AB0, 0x11AF8, + 0x11B60, 0x11B67, + 0x11BC0, 0x11BE0, + 0x11C00, 0x11C08, + 0x11C0A, 0x11C36, + 0x11C38, 0x11C3E, + 0x11C40, 0x11C40, + 0x11C72, 0x11C8F, + 0x11C92, 0x11CA7, + 0x11CA9, 0x11CB6, + 0x11D00, 0x11D06, + 0x11D08, 0x11D09, + 0x11D0B, 0x11D36, + 0x11D3A, 0x11D3A, + 0x11D3C, 0x11D3D, + 0x11D3F, 0x11D41, + 0x11D43, 0x11D43, + 0x11D46, 0x11D47, + 0x11D60, 0x11D65, + 0x11D67, 0x11D68, + 0x11D6A, 0x11D8E, + 0x11D90, 0x11D91, + 0x11D93, 0x11D96, + 0x11D98, 0x11D98, + 0x11DB0, 0x11DDB, + 0x11EE0, 0x11EF6, + 0x11F00, 0x11F10, + 0x11F12, 0x11F3A, + 0x11F3E, 0x11F40, + 0x11FB0, 0x11FB0, + 0x12000, 0x12399, + 0x12400, 0x1246E, + 0x12480, 0x12543, + 0x12F90, 0x12FF0, + 0x13000, 0x1342F, + 0x13441, 0x13446, + 0x13460, 0x143FA, + 0x14400, 0x14646, + 0x16100, 0x1612E, + 0x16800, 0x16A38, + 0x16A40, 0x16A5E, + 0x16A70, 0x16ABE, + 0x16AD0, 0x16AED, + 0x16B00, 0x16B2F, + 0x16B40, 0x16B43, + 0x16B63, 0x16B77, + 0x16B7D, 0x16B8F, + 0x16D40, 0x16D6C, + 0x16E40, 0x16E7F, + 0x16EA0, 0x16EB8, + 0x16EBB, 0x16ED3, + 0x16F00, 0x16F4A, + 0x16F4F, 0x16F87, + 0x16F8F, 0x16F9F, + 0x16FE0, 0x16FE1, + 0x16FE3, 0x16FE3, + 0x16FF0, 0x16FF6, + 0x17000, 0x18CD5, + 0x18CFF, 0x18D1E, + 0x18D80, 0x18DF2, + 0x1AFF0, 0x1AFF3, + 0x1AFF5, 0x1AFFB, + 0x1AFFD, 0x1AFFE, + 0x1B000, 0x1B122, + 0x1B132, 0x1B132, + 0x1B150, 0x1B152, + 0x1B155, 0x1B155, + 0x1B164, 0x1B167, + 0x1B170, 0x1B2FB, + 0x1BC00, 0x1BC6A, + 0x1BC70, 0x1BC7C, + 0x1BC80, 0x1BC88, + 0x1BC90, 0x1BC99, + 0x1BC9E, 0x1BC9E, + 0x1D400, 0x1D454, + 0x1D456, 0x1D49C, + 0x1D49E, 0x1D49F, + 0x1D4A2, 0x1D4A2, + 0x1D4A5, 0x1D4A6, + 0x1D4A9, 0x1D4AC, + 0x1D4AE, 0x1D4B9, + 0x1D4BB, 0x1D4BB, + 0x1D4BD, 0x1D4C3, + 0x1D4C5, 0x1D505, + 0x1D507, 0x1D50A, + 0x1D50D, 0x1D514, + 0x1D516, 0x1D51C, + 0x1D51E, 0x1D539, + 0x1D53B, 0x1D53E, + 0x1D540, 0x1D544, + 0x1D546, 0x1D546, + 0x1D54A, 0x1D550, + 0x1D552, 0x1D6A5, + 0x1D6A8, 0x1D6C0, + 0x1D6C2, 0x1D6DA, + 0x1D6DC, 0x1D6FA, + 0x1D6FC, 0x1D714, + 0x1D716, 0x1D734, + 0x1D736, 0x1D74E, + 0x1D750, 0x1D76E, + 
0x1D770, 0x1D788, + 0x1D78A, 0x1D7A8, + 0x1D7AA, 0x1D7C2, + 0x1D7C4, 0x1D7CB, + 0x1DF00, 0x1DF1E, + 0x1DF25, 0x1DF2A, + 0x1E000, 0x1E006, + 0x1E008, 0x1E018, + 0x1E01B, 0x1E021, + 0x1E023, 0x1E024, + 0x1E026, 0x1E02A, + 0x1E030, 0x1E06D, + 0x1E08F, 0x1E08F, + 0x1E100, 0x1E12C, + 0x1E137, 0x1E13D, + 0x1E14E, 0x1E14E, + 0x1E290, 0x1E2AD, + 0x1E2C0, 0x1E2EB, + 0x1E4D0, 0x1E4EB, + 0x1E5D0, 0x1E5ED, + 0x1E5F0, 0x1E5F0, + 0x1E6C0, 0x1E6DE, + 0x1E6E0, 0x1E6F5, + 0x1E6FE, 0x1E6FF, + 0x1E7E0, 0x1E7E6, + 0x1E7E8, 0x1E7EB, + 0x1E7ED, 0x1E7EE, + 0x1E7F0, 0x1E7FE, + 0x1E800, 0x1E8C4, + 0x1E900, 0x1E943, + 0x1E947, 0x1E947, + 0x1E94B, 0x1E94B, + 0x1EE00, 0x1EE03, + 0x1EE05, 0x1EE1F, + 0x1EE21, 0x1EE22, + 0x1EE24, 0x1EE24, + 0x1EE27, 0x1EE27, + 0x1EE29, 0x1EE32, + 0x1EE34, 0x1EE37, + 0x1EE39, 0x1EE39, + 0x1EE3B, 0x1EE3B, + 0x1EE42, 0x1EE42, + 0x1EE47, 0x1EE47, + 0x1EE49, 0x1EE49, + 0x1EE4B, 0x1EE4B, + 0x1EE4D, 0x1EE4F, + 0x1EE51, 0x1EE52, + 0x1EE54, 0x1EE54, + 0x1EE57, 0x1EE57, + 0x1EE59, 0x1EE59, + 0x1EE5B, 0x1EE5B, + 0x1EE5D, 0x1EE5D, + 0x1EE5F, 0x1EE5F, + 0x1EE61, 0x1EE62, + 0x1EE64, 0x1EE64, + 0x1EE67, 0x1EE6A, + 0x1EE6C, 0x1EE72, + 0x1EE74, 0x1EE77, + 0x1EE79, 0x1EE7C, + 0x1EE7E, 0x1EE7E, + 0x1EE80, 0x1EE89, + 0x1EE8B, 0x1EE9B, + 0x1EEA1, 0x1EEA3, + 0x1EEA5, 0x1EEA9, + 0x1EEAB, 0x1EEBB, + 0x1F130, 0x1F149, + 0x1F150, 0x1F169, + 0x1F170, 0x1F189, + 0x20000, 0x2A6DF, + 0x2A700, 0x2B81D, + 0x2B820, 0x2CEAD, + 0x2CEB0, 0x2EBE0, + 0x2EBF0, 0x2EE5D, + 0x2F800, 0x2FA1D, + 0x30000, 0x3134A, + 0x31350, 0x33479, +}; + +#define UNICODE_ALNUM_CODEPOINTS_LENGTH 1598 +static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEPOINTS_LENGTH] = { + 0x100, 0x2C1, + 0x2C6, 0x2D1, + 0x2E0, 0x2E4, + 0x2EC, 0x2EC, + 0x2EE, 0x2EE, + 0x345, 0x345, + 0x363, 0x374, + 0x376, 0x377, + 0x37A, 0x37D, + 0x37F, 0x37F, + 0x386, 0x386, + 0x388, 0x38A, + 0x38C, 0x38C, + 0x38E, 0x3A1, + 0x3A3, 0x3F5, + 0x3F7, 0x481, + 0x48A, 0x52F, + 0x531, 0x556, + 0x559, 0x559, + 0x560, 0x588, + 0x5B0, 
0x5BD, + 0x5BF, 0x5BF, + 0x5C1, 0x5C2, + 0x5C4, 0x5C5, + 0x5C7, 0x5C7, + 0x5D0, 0x5EA, + 0x5EF, 0x5F2, + 0x610, 0x61A, + 0x620, 0x657, + 0x659, 0x669, + 0x66E, 0x6D3, + 0x6D5, 0x6DC, + 0x6E1, 0x6E8, + 0x6ED, 0x6FC, + 0x6FF, 0x6FF, + 0x710, 0x73F, + 0x74D, 0x7B1, + 0x7C0, 0x7EA, + 0x7F4, 0x7F5, + 0x7FA, 0x7FA, + 0x800, 0x817, + 0x81A, 0x82C, + 0x840, 0x858, + 0x860, 0x86A, + 0x870, 0x887, + 0x889, 0x88F, + 0x897, 0x897, + 0x8A0, 0x8C9, + 0x8D4, 0x8DF, + 0x8E3, 0x8E9, + 0x8F0, 0x93B, + 0x93D, 0x94C, + 0x94E, 0x950, + 0x955, 0x963, + 0x966, 0x96F, + 0x971, 0x983, + 0x985, 0x98C, + 0x98F, 0x990, + 0x993, 0x9A8, + 0x9AA, 0x9B0, + 0x9B2, 0x9B2, + 0x9B6, 0x9B9, + 0x9BD, 0x9C4, + 0x9C7, 0x9C8, + 0x9CB, 0x9CC, + 0x9CE, 0x9CE, + 0x9D7, 0x9D7, + 0x9DC, 0x9DD, + 0x9DF, 0x9E3, + 0x9E6, 0x9F1, + 0x9FC, 0x9FC, + 0xA01, 0xA03, + 0xA05, 0xA0A, + 0xA0F, 0xA10, + 0xA13, 0xA28, + 0xA2A, 0xA30, + 0xA32, 0xA33, + 0xA35, 0xA36, + 0xA38, 0xA39, + 0xA3E, 0xA42, + 0xA47, 0xA48, + 0xA4B, 0xA4C, + 0xA51, 0xA51, + 0xA59, 0xA5C, + 0xA5E, 0xA5E, + 0xA66, 0xA75, + 0xA81, 0xA83, + 0xA85, 0xA8D, + 0xA8F, 0xA91, + 0xA93, 0xAA8, + 0xAAA, 0xAB0, + 0xAB2, 0xAB3, + 0xAB5, 0xAB9, + 0xABD, 0xAC5, + 0xAC7, 0xAC9, + 0xACB, 0xACC, + 0xAD0, 0xAD0, + 0xAE0, 0xAE3, + 0xAE6, 0xAEF, + 0xAF9, 0xAFC, + 0xB01, 0xB03, + 0xB05, 0xB0C, + 0xB0F, 0xB10, + 0xB13, 0xB28, + 0xB2A, 0xB30, + 0xB32, 0xB33, + 0xB35, 0xB39, + 0xB3D, 0xB44, + 0xB47, 0xB48, + 0xB4B, 0xB4C, + 0xB56, 0xB57, + 0xB5C, 0xB5D, + 0xB5F, 0xB63, + 0xB66, 0xB6F, + 0xB71, 0xB71, + 0xB82, 0xB83, + 0xB85, 0xB8A, + 0xB8E, 0xB90, + 0xB92, 0xB95, + 0xB99, 0xB9A, + 0xB9C, 0xB9C, + 0xB9E, 0xB9F, + 0xBA3, 0xBA4, + 0xBA8, 0xBAA, + 0xBAE, 0xBB9, + 0xBBE, 0xBC2, + 0xBC6, 0xBC8, + 0xBCA, 0xBCC, + 0xBD0, 0xBD0, + 0xBD7, 0xBD7, + 0xBE6, 0xBEF, + 0xC00, 0xC0C, + 0xC0E, 0xC10, + 0xC12, 0xC28, + 0xC2A, 0xC39, + 0xC3D, 0xC44, + 0xC46, 0xC48, + 0xC4A, 0xC4C, + 0xC55, 0xC56, + 0xC58, 0xC5A, + 0xC5C, 0xC5D, + 0xC60, 0xC63, + 0xC66, 0xC6F, + 0xC80, 0xC83, + 0xC85, 0xC8C, + 0xC8E, 
0xC90, + 0xC92, 0xCA8, + 0xCAA, 0xCB3, + 0xCB5, 0xCB9, + 0xCBD, 0xCC4, + 0xCC6, 0xCC8, + 0xCCA, 0xCCC, + 0xCD5, 0xCD6, + 0xCDC, 0xCDE, + 0xCE0, 0xCE3, + 0xCE6, 0xCEF, + 0xCF1, 0xCF3, + 0xD00, 0xD0C, + 0xD0E, 0xD10, + 0xD12, 0xD3A, + 0xD3D, 0xD44, + 0xD46, 0xD48, + 0xD4A, 0xD4C, + 0xD4E, 0xD4E, + 0xD54, 0xD57, + 0xD5F, 0xD63, + 0xD66, 0xD6F, + 0xD7A, 0xD7F, + 0xD81, 0xD83, + 0xD85, 0xD96, + 0xD9A, 0xDB1, + 0xDB3, 0xDBB, + 0xDBD, 0xDBD, + 0xDC0, 0xDC6, + 0xDCF, 0xDD4, + 0xDD6, 0xDD6, + 0xDD8, 0xDDF, + 0xDE6, 0xDEF, + 0xDF2, 0xDF3, + 0xE01, 0xE3A, + 0xE40, 0xE46, + 0xE4D, 0xE4D, + 0xE50, 0xE59, + 0xE81, 0xE82, + 0xE84, 0xE84, + 0xE86, 0xE8A, + 0xE8C, 0xEA3, + 0xEA5, 0xEA5, + 0xEA7, 0xEB9, + 0xEBB, 0xEBD, + 0xEC0, 0xEC4, + 0xEC6, 0xEC6, + 0xECD, 0xECD, + 0xED0, 0xED9, + 0xEDC, 0xEDF, + 0xF00, 0xF00, + 0xF20, 0xF29, + 0xF40, 0xF47, + 0xF49, 0xF6C, + 0xF71, 0xF83, + 0xF88, 0xF97, + 0xF99, 0xFBC, + 0x1000, 0x1036, + 0x1038, 0x1038, + 0x103B, 0x1049, + 0x1050, 0x109D, + 0x10A0, 0x10C5, + 0x10C7, 0x10C7, + 0x10CD, 0x10CD, + 0x10D0, 0x10FA, + 0x10FC, 0x1248, + 0x124A, 0x124D, + 0x1250, 0x1256, + 0x1258, 0x1258, + 0x125A, 0x125D, + 0x1260, 0x1288, + 0x128A, 0x128D, + 0x1290, 0x12B0, + 0x12B2, 0x12B5, + 0x12B8, 0x12BE, + 0x12C0, 0x12C0, + 0x12C2, 0x12C5, + 0x12C8, 0x12D6, + 0x12D8, 0x1310, + 0x1312, 0x1315, + 0x1318, 0x135A, + 0x1380, 0x138F, + 0x13A0, 0x13F5, + 0x13F8, 0x13FD, + 0x1401, 0x166C, + 0x166F, 0x167F, + 0x1681, 0x169A, + 0x16A0, 0x16EA, + 0x16EE, 0x16F8, + 0x1700, 0x1713, + 0x171F, 0x1733, + 0x1740, 0x1753, + 0x1760, 0x176C, + 0x176E, 0x1770, + 0x1772, 0x1773, + 0x1780, 0x17B3, + 0x17B6, 0x17C8, + 0x17D7, 0x17D7, + 0x17DC, 0x17DC, + 0x17E0, 0x17E9, + 0x1810, 0x1819, + 0x1820, 0x1878, + 0x1880, 0x18AA, + 0x18B0, 0x18F5, + 0x1900, 0x191E, + 0x1920, 0x192B, + 0x1930, 0x1938, + 0x1946, 0x196D, + 0x1970, 0x1974, + 0x1980, 0x19AB, + 0x19B0, 0x19C9, + 0x19D0, 0x19D9, + 0x1A00, 0x1A1B, + 0x1A20, 0x1A5E, + 0x1A61, 0x1A74, + 0x1A80, 0x1A89, + 0x1A90, 0x1A99, + 0x1AA7, 
0x1AA7, + 0x1ABF, 0x1AC0, + 0x1ACC, 0x1ACE, + 0x1B00, 0x1B33, + 0x1B35, 0x1B43, + 0x1B45, 0x1B4C, + 0x1B50, 0x1B59, + 0x1B80, 0x1BA9, + 0x1BAC, 0x1BE5, + 0x1BE7, 0x1BF1, + 0x1C00, 0x1C36, + 0x1C40, 0x1C49, + 0x1C4D, 0x1C7D, + 0x1C80, 0x1C8A, + 0x1C90, 0x1CBA, + 0x1CBD, 0x1CBF, + 0x1CE9, 0x1CEC, + 0x1CEE, 0x1CF3, + 0x1CF5, 0x1CF6, + 0x1CFA, 0x1CFA, + 0x1D00, 0x1DBF, + 0x1DD3, 0x1DF4, + 0x1E00, 0x1F15, + 0x1F18, 0x1F1D, + 0x1F20, 0x1F45, + 0x1F48, 0x1F4D, + 0x1F50, 0x1F57, + 0x1F59, 0x1F59, + 0x1F5B, 0x1F5B, + 0x1F5D, 0x1F5D, + 0x1F5F, 0x1F7D, + 0x1F80, 0x1FB4, + 0x1FB6, 0x1FBC, + 0x1FBE, 0x1FBE, + 0x1FC2, 0x1FC4, + 0x1FC6, 0x1FCC, + 0x1FD0, 0x1FD3, + 0x1FD6, 0x1FDB, + 0x1FE0, 0x1FEC, + 0x1FF2, 0x1FF4, + 0x1FF6, 0x1FFC, + 0x2071, 0x2071, + 0x207F, 0x207F, + 0x2090, 0x209C, + 0x2102, 0x2102, + 0x2107, 0x2107, + 0x210A, 0x2113, + 0x2115, 0x2115, + 0x2119, 0x211D, + 0x2124, 0x2124, + 0x2126, 0x2126, + 0x2128, 0x2128, + 0x212A, 0x212D, + 0x212F, 0x2139, + 0x213C, 0x213F, + 0x2145, 0x2149, + 0x214E, 0x214E, + 0x2160, 0x2188, + 0x24B6, 0x24E9, + 0x2C00, 0x2CE4, + 0x2CEB, 0x2CEE, + 0x2CF2, 0x2CF3, + 0x2D00, 0x2D25, + 0x2D27, 0x2D27, + 0x2D2D, 0x2D2D, + 0x2D30, 0x2D67, + 0x2D6F, 0x2D6F, + 0x2D80, 0x2D96, + 0x2DA0, 0x2DA6, + 0x2DA8, 0x2DAE, + 0x2DB0, 0x2DB6, + 0x2DB8, 0x2DBE, + 0x2DC0, 0x2DC6, + 0x2DC8, 0x2DCE, + 0x2DD0, 0x2DD6, + 0x2DD8, 0x2DDE, + 0x2DE0, 0x2DFF, + 0x2E2F, 0x2E2F, + 0x3005, 0x3007, + 0x3021, 0x3029, + 0x3031, 0x3035, + 0x3038, 0x303C, + 0x3041, 0x3096, + 0x309D, 0x309F, + 0x30A1, 0x30FA, + 0x30FC, 0x30FF, + 0x3105, 0x312F, + 0x3131, 0x318E, + 0x31A0, 0x31BF, + 0x31F0, 0x31FF, + 0x3400, 0x4DBF, + 0x4E00, 0xA48C, + 0xA4D0, 0xA4FD, + 0xA500, 0xA60C, + 0xA610, 0xA62B, + 0xA640, 0xA66E, + 0xA674, 0xA67B, + 0xA67F, 0xA6EF, + 0xA717, 0xA71F, + 0xA722, 0xA788, + 0xA78B, 0xA7DC, + 0xA7F1, 0xA805, + 0xA807, 0xA827, + 0xA840, 0xA873, + 0xA880, 0xA8C3, + 0xA8C5, 0xA8C5, + 0xA8D0, 0xA8D9, + 0xA8F2, 0xA8F7, + 0xA8FB, 0xA8FB, + 0xA8FD, 0xA92A, + 0xA930, 0xA952, + 0xA960, 
0xA97C, + 0xA980, 0xA9B2, + 0xA9B4, 0xA9BF, + 0xA9CF, 0xA9D9, + 0xA9E0, 0xA9FE, + 0xAA00, 0xAA36, + 0xAA40, 0xAA4D, + 0xAA50, 0xAA59, + 0xAA60, 0xAA76, + 0xAA7A, 0xAABE, + 0xAAC0, 0xAAC0, + 0xAAC2, 0xAAC2, + 0xAADB, 0xAADD, + 0xAAE0, 0xAAEF, + 0xAAF2, 0xAAF5, + 0xAB01, 0xAB06, + 0xAB09, 0xAB0E, + 0xAB11, 0xAB16, + 0xAB20, 0xAB26, + 0xAB28, 0xAB2E, + 0xAB30, 0xAB5A, + 0xAB5C, 0xAB69, + 0xAB70, 0xABEA, + 0xABF0, 0xABF9, + 0xAC00, 0xD7A3, + 0xD7B0, 0xD7C6, + 0xD7CB, 0xD7FB, + 0xF900, 0xFA6D, + 0xFA70, 0xFAD9, + 0xFB00, 0xFB06, + 0xFB13, 0xFB17, + 0xFB1D, 0xFB28, + 0xFB2A, 0xFB36, + 0xFB38, 0xFB3C, + 0xFB3E, 0xFB3E, + 0xFB40, 0xFB41, + 0xFB43, 0xFB44, + 0xFB46, 0xFBB1, + 0xFBD3, 0xFD3D, + 0xFD50, 0xFD8F, + 0xFD92, 0xFDC7, + 0xFDF0, 0xFDFB, + 0xFE70, 0xFE74, + 0xFE76, 0xFEFC, + 0xFF10, 0xFF19, + 0xFF21, 0xFF3A, + 0xFF41, 0xFF5A, + 0xFF66, 0xFFBE, + 0xFFC2, 0xFFC7, + 0xFFCA, 0xFFCF, + 0xFFD2, 0xFFD7, + 0xFFDA, 0xFFDC, + 0x10000, 0x1000B, + 0x1000D, 0x10026, + 0x10028, 0x1003A, + 0x1003C, 0x1003D, + 0x1003F, 0x1004D, + 0x10050, 0x1005D, + 0x10080, 0x100FA, + 0x10140, 0x10174, + 0x10280, 0x1029C, + 0x102A0, 0x102D0, + 0x10300, 0x1031F, + 0x1032D, 0x1034A, + 0x10350, 0x1037A, + 0x10380, 0x1039D, + 0x103A0, 0x103C3, + 0x103C8, 0x103CF, + 0x103D1, 0x103D5, + 0x10400, 0x1049D, + 0x104A0, 0x104A9, + 0x104B0, 0x104D3, + 0x104D8, 0x104FB, + 0x10500, 0x10527, + 0x10530, 0x10563, + 0x10570, 0x1057A, + 0x1057C, 0x1058A, + 0x1058C, 0x10592, + 0x10594, 0x10595, + 0x10597, 0x105A1, + 0x105A3, 0x105B1, + 0x105B3, 0x105B9, + 0x105BB, 0x105BC, + 0x105C0, 0x105F3, + 0x10600, 0x10736, + 0x10740, 0x10755, + 0x10760, 0x10767, + 0x10780, 0x10785, + 0x10787, 0x107B0, + 0x107B2, 0x107BA, + 0x10800, 0x10805, + 0x10808, 0x10808, + 0x1080A, 0x10835, + 0x10837, 0x10838, + 0x1083C, 0x1083C, + 0x1083F, 0x10855, + 0x10860, 0x10876, + 0x10880, 0x1089E, + 0x108E0, 0x108F2, + 0x108F4, 0x108F5, + 0x10900, 0x10915, + 0x10920, 0x10939, + 0x10940, 0x10959, + 0x10980, 0x109B7, + 0x109BE, 0x109BF, + 0x10A00, 
0x10A03, + 0x10A05, 0x10A06, + 0x10A0C, 0x10A13, + 0x10A15, 0x10A17, + 0x10A19, 0x10A35, + 0x10A60, 0x10A7C, + 0x10A80, 0x10A9C, + 0x10AC0, 0x10AC7, + 0x10AC9, 0x10AE4, + 0x10B00, 0x10B35, + 0x10B40, 0x10B55, + 0x10B60, 0x10B72, + 0x10B80, 0x10B91, + 0x10C00, 0x10C48, + 0x10C80, 0x10CB2, + 0x10CC0, 0x10CF2, + 0x10D00, 0x10D27, + 0x10D30, 0x10D39, + 0x10D40, 0x10D65, + 0x10D69, 0x10D69, + 0x10D6F, 0x10D85, + 0x10E80, 0x10EA9, + 0x10EAB, 0x10EAC, + 0x10EB0, 0x10EB1, + 0x10EC2, 0x10EC7, + 0x10EFA, 0x10EFC, + 0x10F00, 0x10F1C, + 0x10F27, 0x10F27, + 0x10F30, 0x10F45, + 0x10F70, 0x10F81, + 0x10FB0, 0x10FC4, + 0x10FE0, 0x10FF6, + 0x11000, 0x11045, + 0x11066, 0x1106F, + 0x11071, 0x11075, + 0x11080, 0x110B8, + 0x110C2, 0x110C2, + 0x110D0, 0x110E8, + 0x110F0, 0x110F9, + 0x11100, 0x11132, + 0x11136, 0x1113F, + 0x11144, 0x11147, + 0x11150, 0x11172, + 0x11176, 0x11176, + 0x11180, 0x111BF, + 0x111C1, 0x111C4, + 0x111CE, 0x111DA, + 0x111DC, 0x111DC, + 0x11200, 0x11211, + 0x11213, 0x11234, + 0x11237, 0x11237, + 0x1123E, 0x11241, + 0x11280, 0x11286, + 0x11288, 0x11288, + 0x1128A, 0x1128D, + 0x1128F, 0x1129D, + 0x1129F, 0x112A8, + 0x112B0, 0x112E8, + 0x112F0, 0x112F9, + 0x11300, 0x11303, + 0x11305, 0x1130C, + 0x1130F, 0x11310, + 0x11313, 0x11328, + 0x1132A, 0x11330, + 0x11332, 0x11333, + 0x11335, 0x11339, + 0x1133D, 0x11344, + 0x11347, 0x11348, + 0x1134B, 0x1134C, + 0x11350, 0x11350, + 0x11357, 0x11357, + 0x1135D, 0x11363, + 0x11380, 0x11389, + 0x1138B, 0x1138B, + 0x1138E, 0x1138E, + 0x11390, 0x113B5, + 0x113B7, 0x113C0, + 0x113C2, 0x113C2, + 0x113C5, 0x113C5, + 0x113C7, 0x113CA, + 0x113CC, 0x113CD, + 0x113D1, 0x113D1, + 0x113D3, 0x113D3, + 0x11400, 0x11441, + 0x11443, 0x11445, + 0x11447, 0x1144A, + 0x11450, 0x11459, + 0x1145F, 0x11461, + 0x11480, 0x114C1, + 0x114C4, 0x114C5, + 0x114C7, 0x114C7, + 0x114D0, 0x114D9, + 0x11580, 0x115B5, + 0x115B8, 0x115BE, + 0x115D8, 0x115DD, + 0x11600, 0x1163E, + 0x11640, 0x11640, + 0x11644, 0x11644, + 0x11650, 0x11659, + 0x11680, 0x116B5, + 0x116B8, 
0x116B8, + 0x116C0, 0x116C9, + 0x116D0, 0x116E3, + 0x11700, 0x1171A, + 0x1171D, 0x1172A, + 0x11730, 0x11739, + 0x11740, 0x11746, + 0x11800, 0x11838, + 0x118A0, 0x118E9, + 0x118FF, 0x11906, + 0x11909, 0x11909, + 0x1190C, 0x11913, + 0x11915, 0x11916, + 0x11918, 0x11935, + 0x11937, 0x11938, + 0x1193B, 0x1193C, + 0x1193F, 0x11942, + 0x11950, 0x11959, + 0x119A0, 0x119A7, + 0x119AA, 0x119D7, + 0x119DA, 0x119DF, + 0x119E1, 0x119E1, + 0x119E3, 0x119E4, + 0x11A00, 0x11A32, + 0x11A35, 0x11A3E, + 0x11A50, 0x11A97, + 0x11A9D, 0x11A9D, + 0x11AB0, 0x11AF8, + 0x11B60, 0x11B67, + 0x11BC0, 0x11BE0, + 0x11BF0, 0x11BF9, + 0x11C00, 0x11C08, + 0x11C0A, 0x11C36, + 0x11C38, 0x11C3E, + 0x11C40, 0x11C40, + 0x11C50, 0x11C59, + 0x11C72, 0x11C8F, + 0x11C92, 0x11CA7, + 0x11CA9, 0x11CB6, + 0x11D00, 0x11D06, + 0x11D08, 0x11D09, + 0x11D0B, 0x11D36, + 0x11D3A, 0x11D3A, + 0x11D3C, 0x11D3D, + 0x11D3F, 0x11D41, + 0x11D43, 0x11D43, + 0x11D46, 0x11D47, + 0x11D50, 0x11D59, + 0x11D60, 0x11D65, + 0x11D67, 0x11D68, + 0x11D6A, 0x11D8E, + 0x11D90, 0x11D91, + 0x11D93, 0x11D96, + 0x11D98, 0x11D98, + 0x11DA0, 0x11DA9, + 0x11DB0, 0x11DDB, + 0x11DE0, 0x11DE9, + 0x11EE0, 0x11EF6, + 0x11F00, 0x11F10, + 0x11F12, 0x11F3A, + 0x11F3E, 0x11F40, + 0x11F50, 0x11F59, + 0x11FB0, 0x11FB0, + 0x12000, 0x12399, + 0x12400, 0x1246E, + 0x12480, 0x12543, + 0x12F90, 0x12FF0, + 0x13000, 0x1342F, + 0x13441, 0x13446, + 0x13460, 0x143FA, + 0x14400, 0x14646, + 0x16100, 0x1612E, + 0x16130, 0x16139, + 0x16800, 0x16A38, + 0x16A40, 0x16A5E, + 0x16A60, 0x16A69, + 0x16A70, 0x16ABE, + 0x16AC0, 0x16AC9, + 0x16AD0, 0x16AED, + 0x16B00, 0x16B2F, + 0x16B40, 0x16B43, + 0x16B50, 0x16B59, + 0x16B63, 0x16B77, + 0x16B7D, 0x16B8F, + 0x16D40, 0x16D6C, + 0x16D70, 0x16D79, + 0x16E40, 0x16E7F, + 0x16EA0, 0x16EB8, + 0x16EBB, 0x16ED3, + 0x16F00, 0x16F4A, + 0x16F4F, 0x16F87, + 0x16F8F, 0x16F9F, + 0x16FE0, 0x16FE1, + 0x16FE3, 0x16FE3, + 0x16FF0, 0x16FF6, + 0x17000, 0x18CD5, + 0x18CFF, 0x18D1E, + 0x18D80, 0x18DF2, + 0x1AFF0, 0x1AFF3, + 0x1AFF5, 0x1AFFB, + 0x1AFFD, 
0x1AFFE, + 0x1B000, 0x1B122, + 0x1B132, 0x1B132, + 0x1B150, 0x1B152, + 0x1B155, 0x1B155, + 0x1B164, 0x1B167, + 0x1B170, 0x1B2FB, + 0x1BC00, 0x1BC6A, + 0x1BC70, 0x1BC7C, + 0x1BC80, 0x1BC88, + 0x1BC90, 0x1BC99, + 0x1BC9E, 0x1BC9E, + 0x1CCF0, 0x1CCF9, + 0x1D400, 0x1D454, + 0x1D456, 0x1D49C, + 0x1D49E, 0x1D49F, + 0x1D4A2, 0x1D4A2, + 0x1D4A5, 0x1D4A6, + 0x1D4A9, 0x1D4AC, + 0x1D4AE, 0x1D4B9, + 0x1D4BB, 0x1D4BB, + 0x1D4BD, 0x1D4C3, + 0x1D4C5, 0x1D505, + 0x1D507, 0x1D50A, + 0x1D50D, 0x1D514, + 0x1D516, 0x1D51C, + 0x1D51E, 0x1D539, + 0x1D53B, 0x1D53E, + 0x1D540, 0x1D544, + 0x1D546, 0x1D546, + 0x1D54A, 0x1D550, + 0x1D552, 0x1D6A5, + 0x1D6A8, 0x1D6C0, + 0x1D6C2, 0x1D6DA, + 0x1D6DC, 0x1D6FA, + 0x1D6FC, 0x1D714, + 0x1D716, 0x1D734, + 0x1D736, 0x1D74E, + 0x1D750, 0x1D76E, + 0x1D770, 0x1D788, + 0x1D78A, 0x1D7A8, + 0x1D7AA, 0x1D7C2, + 0x1D7C4, 0x1D7CB, + 0x1D7CE, 0x1D7FF, + 0x1DF00, 0x1DF1E, + 0x1DF25, 0x1DF2A, + 0x1E000, 0x1E006, + 0x1E008, 0x1E018, + 0x1E01B, 0x1E021, + 0x1E023, 0x1E024, + 0x1E026, 0x1E02A, + 0x1E030, 0x1E06D, + 0x1E08F, 0x1E08F, + 0x1E100, 0x1E12C, + 0x1E137, 0x1E13D, + 0x1E140, 0x1E149, + 0x1E14E, 0x1E14E, + 0x1E290, 0x1E2AD, + 0x1E2C0, 0x1E2EB, + 0x1E2F0, 0x1E2F9, + 0x1E4D0, 0x1E4EB, + 0x1E4F0, 0x1E4F9, + 0x1E5D0, 0x1E5ED, + 0x1E5F0, 0x1E5FA, + 0x1E6C0, 0x1E6DE, + 0x1E6E0, 0x1E6F5, + 0x1E6FE, 0x1E6FF, + 0x1E7E0, 0x1E7E6, + 0x1E7E8, 0x1E7EB, + 0x1E7ED, 0x1E7EE, + 0x1E7F0, 0x1E7FE, + 0x1E800, 0x1E8C4, + 0x1E900, 0x1E943, + 0x1E947, 0x1E947, + 0x1E94B, 0x1E94B, + 0x1E950, 0x1E959, + 0x1EE00, 0x1EE03, + 0x1EE05, 0x1EE1F, + 0x1EE21, 0x1EE22, + 0x1EE24, 0x1EE24, + 0x1EE27, 0x1EE27, + 0x1EE29, 0x1EE32, + 0x1EE34, 0x1EE37, + 0x1EE39, 0x1EE39, + 0x1EE3B, 0x1EE3B, + 0x1EE42, 0x1EE42, + 0x1EE47, 0x1EE47, + 0x1EE49, 0x1EE49, + 0x1EE4B, 0x1EE4B, + 0x1EE4D, 0x1EE4F, + 0x1EE51, 0x1EE52, + 0x1EE54, 0x1EE54, + 0x1EE57, 0x1EE57, + 0x1EE59, 0x1EE59, + 0x1EE5B, 0x1EE5B, + 0x1EE5D, 0x1EE5D, + 0x1EE5F, 0x1EE5F, + 0x1EE61, 0x1EE62, + 0x1EE64, 0x1EE64, + 0x1EE67, 0x1EE6A, + 0x1EE6C, 
0x1EE72, + 0x1EE74, 0x1EE77, + 0x1EE79, 0x1EE7C, + 0x1EE7E, 0x1EE7E, + 0x1EE80, 0x1EE89, + 0x1EE8B, 0x1EE9B, + 0x1EEA1, 0x1EEA3, + 0x1EEA5, 0x1EEA9, + 0x1EEAB, 0x1EEBB, + 0x1F130, 0x1F149, + 0x1F150, 0x1F169, + 0x1F170, 0x1F189, + 0x1FBF0, 0x1FBF9, + 0x20000, 0x2A6DF, + 0x2A700, 0x2B81D, + 0x2B820, 0x2CEAD, + 0x2CEB0, 0x2EBE0, + 0x2EBF0, 0x2EE5D, + 0x2F800, 0x2FA1D, + 0x30000, 0x3134A, + 0x31350, 0x33479, +}; + +#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1320 +static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_CODEPOINTS_LENGTH] = { + 0x100, 0x100, + 0x102, 0x102, + 0x104, 0x104, + 0x106, 0x106, + 0x108, 0x108, + 0x10A, 0x10A, + 0x10C, 0x10C, + 0x10E, 0x10E, + 0x110, 0x110, + 0x112, 0x112, + 0x114, 0x114, + 0x116, 0x116, + 0x118, 0x118, + 0x11A, 0x11A, + 0x11C, 0x11C, + 0x11E, 0x11E, + 0x120, 0x120, + 0x122, 0x122, + 0x124, 0x124, + 0x126, 0x126, + 0x128, 0x128, + 0x12A, 0x12A, + 0x12C, 0x12C, + 0x12E, 0x12E, + 0x130, 0x130, + 0x132, 0x132, + 0x134, 0x134, + 0x136, 0x136, + 0x139, 0x139, + 0x13B, 0x13B, + 0x13D, 0x13D, + 0x13F, 0x13F, + 0x141, 0x141, + 0x143, 0x143, + 0x145, 0x145, + 0x147, 0x147, + 0x14A, 0x14A, + 0x14C, 0x14C, + 0x14E, 0x14E, + 0x150, 0x150, + 0x152, 0x152, + 0x154, 0x154, + 0x156, 0x156, + 0x158, 0x158, + 0x15A, 0x15A, + 0x15C, 0x15C, + 0x15E, 0x15E, + 0x160, 0x160, + 0x162, 0x162, + 0x164, 0x164, + 0x166, 0x166, + 0x168, 0x168, + 0x16A, 0x16A, + 0x16C, 0x16C, + 0x16E, 0x16E, + 0x170, 0x170, + 0x172, 0x172, + 0x174, 0x174, + 0x176, 0x176, + 0x178, 0x179, + 0x17B, 0x17B, + 0x17D, 0x17D, + 0x181, 0x182, + 0x184, 0x184, + 0x186, 0x187, + 0x189, 0x18B, + 0x18E, 0x191, + 0x193, 0x194, + 0x196, 0x198, + 0x19C, 0x19D, + 0x19F, 0x1A0, + 0x1A2, 0x1A2, + 0x1A4, 0x1A4, + 0x1A6, 0x1A7, + 0x1A9, 0x1A9, + 0x1AC, 0x1AC, + 0x1AE, 0x1AF, + 0x1B1, 0x1B3, + 0x1B5, 0x1B5, + 0x1B7, 0x1B8, + 0x1BC, 0x1BC, + 0x1C4, 0x1C5, + 0x1C7, 0x1C8, + 0x1CA, 0x1CB, + 0x1CD, 0x1CD, + 0x1CF, 0x1CF, + 0x1D1, 0x1D1, + 0x1D3, 0x1D3, + 0x1D5, 0x1D5, + 0x1D7, 
0x1D7, + 0x1D9, 0x1D9, + 0x1DB, 0x1DB, + 0x1DE, 0x1DE, + 0x1E0, 0x1E0, + 0x1E2, 0x1E2, + 0x1E4, 0x1E4, + 0x1E6, 0x1E6, + 0x1E8, 0x1E8, + 0x1EA, 0x1EA, + 0x1EC, 0x1EC, + 0x1EE, 0x1EE, + 0x1F1, 0x1F2, + 0x1F4, 0x1F4, + 0x1F6, 0x1F8, + 0x1FA, 0x1FA, + 0x1FC, 0x1FC, + 0x1FE, 0x1FE, + 0x200, 0x200, + 0x202, 0x202, + 0x204, 0x204, + 0x206, 0x206, + 0x208, 0x208, + 0x20A, 0x20A, + 0x20C, 0x20C, + 0x20E, 0x20E, + 0x210, 0x210, + 0x212, 0x212, + 0x214, 0x214, + 0x216, 0x216, + 0x218, 0x218, + 0x21A, 0x21A, + 0x21C, 0x21C, + 0x21E, 0x21E, + 0x220, 0x220, + 0x222, 0x222, + 0x224, 0x224, + 0x226, 0x226, + 0x228, 0x228, + 0x22A, 0x22A, + 0x22C, 0x22C, + 0x22E, 0x22E, + 0x230, 0x230, + 0x232, 0x232, + 0x23A, 0x23B, + 0x23D, 0x23E, + 0x241, 0x241, + 0x243, 0x246, + 0x248, 0x248, + 0x24A, 0x24A, + 0x24C, 0x24C, + 0x24E, 0x24E, + 0x370, 0x370, + 0x372, 0x372, + 0x376, 0x376, + 0x37F, 0x37F, + 0x386, 0x386, + 0x388, 0x38A, + 0x38C, 0x38C, + 0x38E, 0x38F, + 0x391, 0x3A1, + 0x3A3, 0x3AB, + 0x3CF, 0x3CF, + 0x3D2, 0x3D4, + 0x3D8, 0x3D8, + 0x3DA, 0x3DA, + 0x3DC, 0x3DC, + 0x3DE, 0x3DE, + 0x3E0, 0x3E0, + 0x3E2, 0x3E2, + 0x3E4, 0x3E4, + 0x3E6, 0x3E6, + 0x3E8, 0x3E8, + 0x3EA, 0x3EA, + 0x3EC, 0x3EC, + 0x3EE, 0x3EE, + 0x3F4, 0x3F4, + 0x3F7, 0x3F7, + 0x3F9, 0x3FA, + 0x3FD, 0x42F, + 0x460, 0x460, + 0x462, 0x462, + 0x464, 0x464, + 0x466, 0x466, + 0x468, 0x468, + 0x46A, 0x46A, + 0x46C, 0x46C, + 0x46E, 0x46E, + 0x470, 0x470, + 0x472, 0x472, + 0x474, 0x474, + 0x476, 0x476, + 0x478, 0x478, + 0x47A, 0x47A, + 0x47C, 0x47C, + 0x47E, 0x47E, + 0x480, 0x480, + 0x48A, 0x48A, + 0x48C, 0x48C, + 0x48E, 0x48E, + 0x490, 0x490, + 0x492, 0x492, + 0x494, 0x494, + 0x496, 0x496, + 0x498, 0x498, + 0x49A, 0x49A, + 0x49C, 0x49C, + 0x49E, 0x49E, + 0x4A0, 0x4A0, + 0x4A2, 0x4A2, + 0x4A4, 0x4A4, + 0x4A6, 0x4A6, + 0x4A8, 0x4A8, + 0x4AA, 0x4AA, + 0x4AC, 0x4AC, + 0x4AE, 0x4AE, + 0x4B0, 0x4B0, + 0x4B2, 0x4B2, + 0x4B4, 0x4B4, + 0x4B6, 0x4B6, + 0x4B8, 0x4B8, + 0x4BA, 0x4BA, + 0x4BC, 0x4BC, + 0x4BE, 0x4BE, + 0x4C0, 0x4C1, + 0x4C3, 
0x4C3, + 0x4C5, 0x4C5, + 0x4C7, 0x4C7, + 0x4C9, 0x4C9, + 0x4CB, 0x4CB, + 0x4CD, 0x4CD, + 0x4D0, 0x4D0, + 0x4D2, 0x4D2, + 0x4D4, 0x4D4, + 0x4D6, 0x4D6, + 0x4D8, 0x4D8, + 0x4DA, 0x4DA, + 0x4DC, 0x4DC, + 0x4DE, 0x4DE, + 0x4E0, 0x4E0, + 0x4E2, 0x4E2, + 0x4E4, 0x4E4, + 0x4E6, 0x4E6, + 0x4E8, 0x4E8, + 0x4EA, 0x4EA, + 0x4EC, 0x4EC, + 0x4EE, 0x4EE, + 0x4F0, 0x4F0, + 0x4F2, 0x4F2, + 0x4F4, 0x4F4, + 0x4F6, 0x4F6, + 0x4F8, 0x4F8, + 0x4FA, 0x4FA, + 0x4FC, 0x4FC, + 0x4FE, 0x4FE, + 0x500, 0x500, + 0x502, 0x502, + 0x504, 0x504, + 0x506, 0x506, + 0x508, 0x508, + 0x50A, 0x50A, + 0x50C, 0x50C, + 0x50E, 0x50E, + 0x510, 0x510, + 0x512, 0x512, + 0x514, 0x514, + 0x516, 0x516, + 0x518, 0x518, + 0x51A, 0x51A, + 0x51C, 0x51C, + 0x51E, 0x51E, + 0x520, 0x520, + 0x522, 0x522, + 0x524, 0x524, + 0x526, 0x526, + 0x528, 0x528, + 0x52A, 0x52A, + 0x52C, 0x52C, + 0x52E, 0x52E, + 0x531, 0x556, + 0x10A0, 0x10C5, + 0x10C7, 0x10C7, + 0x10CD, 0x10CD, + 0x13A0, 0x13F5, + 0x1C89, 0x1C89, + 0x1C90, 0x1CBA, + 0x1CBD, 0x1CBF, + 0x1E00, 0x1E00, + 0x1E02, 0x1E02, + 0x1E04, 0x1E04, + 0x1E06, 0x1E06, + 0x1E08, 0x1E08, + 0x1E0A, 0x1E0A, + 0x1E0C, 0x1E0C, + 0x1E0E, 0x1E0E, + 0x1E10, 0x1E10, + 0x1E12, 0x1E12, + 0x1E14, 0x1E14, + 0x1E16, 0x1E16, + 0x1E18, 0x1E18, + 0x1E1A, 0x1E1A, + 0x1E1C, 0x1E1C, + 0x1E1E, 0x1E1E, + 0x1E20, 0x1E20, + 0x1E22, 0x1E22, + 0x1E24, 0x1E24, + 0x1E26, 0x1E26, + 0x1E28, 0x1E28, + 0x1E2A, 0x1E2A, + 0x1E2C, 0x1E2C, + 0x1E2E, 0x1E2E, + 0x1E30, 0x1E30, + 0x1E32, 0x1E32, + 0x1E34, 0x1E34, + 0x1E36, 0x1E36, + 0x1E38, 0x1E38, + 0x1E3A, 0x1E3A, + 0x1E3C, 0x1E3C, + 0x1E3E, 0x1E3E, + 0x1E40, 0x1E40, + 0x1E42, 0x1E42, + 0x1E44, 0x1E44, + 0x1E46, 0x1E46, + 0x1E48, 0x1E48, + 0x1E4A, 0x1E4A, + 0x1E4C, 0x1E4C, + 0x1E4E, 0x1E4E, + 0x1E50, 0x1E50, + 0x1E52, 0x1E52, + 0x1E54, 0x1E54, + 0x1E56, 0x1E56, + 0x1E58, 0x1E58, + 0x1E5A, 0x1E5A, + 0x1E5C, 0x1E5C, + 0x1E5E, 0x1E5E, + 0x1E60, 0x1E60, + 0x1E62, 0x1E62, + 0x1E64, 0x1E64, + 0x1E66, 0x1E66, + 0x1E68, 0x1E68, + 0x1E6A, 0x1E6A, + 0x1E6C, 0x1E6C, + 0x1E6E, 
0x1E6E, + 0x1E70, 0x1E70, + 0x1E72, 0x1E72, + 0x1E74, 0x1E74, + 0x1E76, 0x1E76, + 0x1E78, 0x1E78, + 0x1E7A, 0x1E7A, + 0x1E7C, 0x1E7C, + 0x1E7E, 0x1E7E, + 0x1E80, 0x1E80, + 0x1E82, 0x1E82, + 0x1E84, 0x1E84, + 0x1E86, 0x1E86, + 0x1E88, 0x1E88, + 0x1E8A, 0x1E8A, + 0x1E8C, 0x1E8C, + 0x1E8E, 0x1E8E, + 0x1E90, 0x1E90, + 0x1E92, 0x1E92, + 0x1E94, 0x1E94, + 0x1E9E, 0x1E9E, + 0x1EA0, 0x1EA0, + 0x1EA2, 0x1EA2, + 0x1EA4, 0x1EA4, + 0x1EA6, 0x1EA6, + 0x1EA8, 0x1EA8, + 0x1EAA, 0x1EAA, + 0x1EAC, 0x1EAC, + 0x1EAE, 0x1EAE, + 0x1EB0, 0x1EB0, + 0x1EB2, 0x1EB2, + 0x1EB4, 0x1EB4, + 0x1EB6, 0x1EB6, + 0x1EB8, 0x1EB8, + 0x1EBA, 0x1EBA, + 0x1EBC, 0x1EBC, + 0x1EBE, 0x1EBE, + 0x1EC0, 0x1EC0, + 0x1EC2, 0x1EC2, + 0x1EC4, 0x1EC4, + 0x1EC6, 0x1EC6, + 0x1EC8, 0x1EC8, + 0x1ECA, 0x1ECA, + 0x1ECC, 0x1ECC, + 0x1ECE, 0x1ECE, + 0x1ED0, 0x1ED0, + 0x1ED2, 0x1ED2, + 0x1ED4, 0x1ED4, + 0x1ED6, 0x1ED6, + 0x1ED8, 0x1ED8, + 0x1EDA, 0x1EDA, + 0x1EDC, 0x1EDC, + 0x1EDE, 0x1EDE, + 0x1EE0, 0x1EE0, + 0x1EE2, 0x1EE2, + 0x1EE4, 0x1EE4, + 0x1EE6, 0x1EE6, + 0x1EE8, 0x1EE8, + 0x1EEA, 0x1EEA, + 0x1EEC, 0x1EEC, + 0x1EEE, 0x1EEE, + 0x1EF0, 0x1EF0, + 0x1EF2, 0x1EF2, + 0x1EF4, 0x1EF4, + 0x1EF6, 0x1EF6, + 0x1EF8, 0x1EF8, + 0x1EFA, 0x1EFA, + 0x1EFC, 0x1EFC, + 0x1EFE, 0x1EFE, + 0x1F08, 0x1F0F, + 0x1F18, 0x1F1D, + 0x1F28, 0x1F2F, + 0x1F38, 0x1F3F, + 0x1F48, 0x1F4D, + 0x1F59, 0x1F59, + 0x1F5B, 0x1F5B, + 0x1F5D, 0x1F5D, + 0x1F5F, 0x1F5F, + 0x1F68, 0x1F6F, + 0x1F88, 0x1F8F, + 0x1F98, 0x1F9F, + 0x1FA8, 0x1FAF, + 0x1FB8, 0x1FBC, + 0x1FC8, 0x1FCC, + 0x1FD8, 0x1FDB, + 0x1FE8, 0x1FEC, + 0x1FF8, 0x1FFC, + 0x2102, 0x2102, + 0x2107, 0x2107, + 0x210B, 0x210D, + 0x2110, 0x2112, + 0x2115, 0x2115, + 0x2119, 0x211D, + 0x2124, 0x2124, + 0x2126, 0x2126, + 0x2128, 0x2128, + 0x212A, 0x212D, + 0x2130, 0x2133, + 0x213E, 0x213F, + 0x2145, 0x2145, + 0x2160, 0x216F, + 0x2183, 0x2183, + 0x24B6, 0x24CF, + 0x2C00, 0x2C2F, + 0x2C60, 0x2C60, + 0x2C62, 0x2C64, + 0x2C67, 0x2C67, + 0x2C69, 0x2C69, + 0x2C6B, 0x2C6B, + 0x2C6D, 0x2C70, + 0x2C72, 0x2C72, + 0x2C75, 
0x2C75, + 0x2C7E, 0x2C80, + 0x2C82, 0x2C82, + 0x2C84, 0x2C84, + 0x2C86, 0x2C86, + 0x2C88, 0x2C88, + 0x2C8A, 0x2C8A, + 0x2C8C, 0x2C8C, + 0x2C8E, 0x2C8E, + 0x2C90, 0x2C90, + 0x2C92, 0x2C92, + 0x2C94, 0x2C94, + 0x2C96, 0x2C96, + 0x2C98, 0x2C98, + 0x2C9A, 0x2C9A, + 0x2C9C, 0x2C9C, + 0x2C9E, 0x2C9E, + 0x2CA0, 0x2CA0, + 0x2CA2, 0x2CA2, + 0x2CA4, 0x2CA4, + 0x2CA6, 0x2CA6, + 0x2CA8, 0x2CA8, + 0x2CAA, 0x2CAA, + 0x2CAC, 0x2CAC, + 0x2CAE, 0x2CAE, + 0x2CB0, 0x2CB0, + 0x2CB2, 0x2CB2, + 0x2CB4, 0x2CB4, + 0x2CB6, 0x2CB6, + 0x2CB8, 0x2CB8, + 0x2CBA, 0x2CBA, + 0x2CBC, 0x2CBC, + 0x2CBE, 0x2CBE, + 0x2CC0, 0x2CC0, + 0x2CC2, 0x2CC2, + 0x2CC4, 0x2CC4, + 0x2CC6, 0x2CC6, + 0x2CC8, 0x2CC8, + 0x2CCA, 0x2CCA, + 0x2CCC, 0x2CCC, + 0x2CCE, 0x2CCE, + 0x2CD0, 0x2CD0, + 0x2CD2, 0x2CD2, + 0x2CD4, 0x2CD4, + 0x2CD6, 0x2CD6, + 0x2CD8, 0x2CD8, + 0x2CDA, 0x2CDA, + 0x2CDC, 0x2CDC, + 0x2CDE, 0x2CDE, + 0x2CE0, 0x2CE0, + 0x2CE2, 0x2CE2, + 0x2CEB, 0x2CEB, + 0x2CED, 0x2CED, + 0x2CF2, 0x2CF2, + 0xA640, 0xA640, + 0xA642, 0xA642, + 0xA644, 0xA644, + 0xA646, 0xA646, + 0xA648, 0xA648, + 0xA64A, 0xA64A, + 0xA64C, 0xA64C, + 0xA64E, 0xA64E, + 0xA650, 0xA650, + 0xA652, 0xA652, + 0xA654, 0xA654, + 0xA656, 0xA656, + 0xA658, 0xA658, + 0xA65A, 0xA65A, + 0xA65C, 0xA65C, + 0xA65E, 0xA65E, + 0xA660, 0xA660, + 0xA662, 0xA662, + 0xA664, 0xA664, + 0xA666, 0xA666, + 0xA668, 0xA668, + 0xA66A, 0xA66A, + 0xA66C, 0xA66C, + 0xA680, 0xA680, + 0xA682, 0xA682, + 0xA684, 0xA684, + 0xA686, 0xA686, + 0xA688, 0xA688, + 0xA68A, 0xA68A, + 0xA68C, 0xA68C, + 0xA68E, 0xA68E, + 0xA690, 0xA690, + 0xA692, 0xA692, + 0xA694, 0xA694, + 0xA696, 0xA696, + 0xA698, 0xA698, + 0xA69A, 0xA69A, + 0xA722, 0xA722, + 0xA724, 0xA724, + 0xA726, 0xA726, + 0xA728, 0xA728, + 0xA72A, 0xA72A, + 0xA72C, 0xA72C, + 0xA72E, 0xA72E, + 0xA732, 0xA732, + 0xA734, 0xA734, + 0xA736, 0xA736, + 0xA738, 0xA738, + 0xA73A, 0xA73A, + 0xA73C, 0xA73C, + 0xA73E, 0xA73E, + 0xA740, 0xA740, + 0xA742, 0xA742, + 0xA744, 0xA744, + 0xA746, 0xA746, + 0xA748, 0xA748, + 0xA74A, 0xA74A, + 0xA74C, 
0xA74C, + 0xA74E, 0xA74E, + 0xA750, 0xA750, + 0xA752, 0xA752, + 0xA754, 0xA754, + 0xA756, 0xA756, + 0xA758, 0xA758, + 0xA75A, 0xA75A, + 0xA75C, 0xA75C, + 0xA75E, 0xA75E, + 0xA760, 0xA760, + 0xA762, 0xA762, + 0xA764, 0xA764, + 0xA766, 0xA766, + 0xA768, 0xA768, + 0xA76A, 0xA76A, + 0xA76C, 0xA76C, + 0xA76E, 0xA76E, + 0xA779, 0xA779, + 0xA77B, 0xA77B, + 0xA77D, 0xA77E, + 0xA780, 0xA780, + 0xA782, 0xA782, + 0xA784, 0xA784, + 0xA786, 0xA786, + 0xA78B, 0xA78B, + 0xA78D, 0xA78D, + 0xA790, 0xA790, + 0xA792, 0xA792, + 0xA796, 0xA796, + 0xA798, 0xA798, + 0xA79A, 0xA79A, + 0xA79C, 0xA79C, + 0xA79E, 0xA79E, + 0xA7A0, 0xA7A0, + 0xA7A2, 0xA7A2, + 0xA7A4, 0xA7A4, + 0xA7A6, 0xA7A6, + 0xA7A8, 0xA7A8, + 0xA7AA, 0xA7AE, + 0xA7B0, 0xA7B4, + 0xA7B6, 0xA7B6, + 0xA7B8, 0xA7B8, + 0xA7BA, 0xA7BA, + 0xA7BC, 0xA7BC, + 0xA7BE, 0xA7BE, + 0xA7C0, 0xA7C0, + 0xA7C2, 0xA7C2, + 0xA7C4, 0xA7C7, + 0xA7C9, 0xA7C9, + 0xA7CB, 0xA7CC, + 0xA7CE, 0xA7CE, + 0xA7D0, 0xA7D0, + 0xA7D2, 0xA7D2, + 0xA7D4, 0xA7D4, + 0xA7D6, 0xA7D6, + 0xA7D8, 0xA7D8, + 0xA7DA, 0xA7DA, + 0xA7DC, 0xA7DC, + 0xA7F5, 0xA7F5, + 0xFF21, 0xFF3A, + 0x10400, 0x10427, + 0x104B0, 0x104D3, + 0x10570, 0x1057A, + 0x1057C, 0x1058A, + 0x1058C, 0x10592, + 0x10594, 0x10595, + 0x10C80, 0x10CB2, + 0x10D50, 0x10D65, + 0x118A0, 0x118BF, + 0x16E40, 0x16E5F, + 0x16EA0, 0x16EB8, + 0x1D400, 0x1D419, + 0x1D434, 0x1D44D, + 0x1D468, 0x1D481, + 0x1D49C, 0x1D49C, + 0x1D49E, 0x1D49F, + 0x1D4A2, 0x1D4A2, + 0x1D4A5, 0x1D4A6, + 0x1D4A9, 0x1D4AC, + 0x1D4AE, 0x1D4B5, + 0x1D4D0, 0x1D4E9, + 0x1D504, 0x1D505, + 0x1D507, 0x1D50A, + 0x1D50D, 0x1D514, + 0x1D516, 0x1D51C, + 0x1D538, 0x1D539, + 0x1D53B, 0x1D53E, + 0x1D540, 0x1D544, + 0x1D546, 0x1D546, + 0x1D54A, 0x1D550, + 0x1D56C, 0x1D585, + 0x1D5A0, 0x1D5B9, + 0x1D5D4, 0x1D5ED, + 0x1D608, 0x1D621, + 0x1D63C, 0x1D655, + 0x1D670, 0x1D689, + 0x1D6A8, 0x1D6C0, + 0x1D6E2, 0x1D6FA, + 0x1D71C, 0x1D734, + 0x1D756, 0x1D76E, + 0x1D790, 0x1D7A8, + 0x1D7CA, 0x1D7CA, + 0x1E900, 0x1E921, + 0x1F130, 0x1F149, + 0x1F150, 0x1F169, + 0x1F170, 
0x1F189, +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding unicode codepoint. Note that + * this table is different from other encodings where we used a lookup table + * because the indices of those tables are the byte representations, not the + * codepoints themselves. + */ +const uint8_t pm_encoding_unicode_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Bx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx +}; + +/** + * Binary search through the given list of codepoints to see if the given + * codepoint is in the list. 
+ */ +static bool +pm_unicode_codepoint_match(pm_unicode_codepoint_t codepoint, const pm_unicode_codepoint_t *codepoints, size_t size) { + size_t start = 0; + size_t end = size; + + while (start < end) { + size_t middle = start + (end - start) / 2; + if ((middle % 2) != 0) middle--; + + if (codepoint >= codepoints[middle] && codepoint <= codepoints[middle + 1]) { + return true; + } + + if (codepoint < codepoints[middle]) { + end = middle; + } else { + start = middle + 2; + } + } + + return false; +} + +/** + * A state transition table for decoding UTF-8. + * + * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +static const uint8_t pm_utf_8_dfa[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +/** + * Given a pointer to a string and the number of bytes remaining in the string, + * decode the next UTF-8 codepoint and return it. The number of bytes consumed + * is returned in the width out parameter. + */ +static pm_unicode_codepoint_t +pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) { + assert(n >= 0); + + size_t maximum = (n > 4) ? 4 : ((size_t) n); + uint32_t codepoint; + uint32_t state = 0; + + for (size_t index = 0; index < maximum; index++) { + uint32_t byte = b[index]; + uint32_t type = pm_utf_8_dfa[byte]; + + codepoint = (state != 0) ? 
+ (byte & 0x3fu) | (codepoint << 6) : + (0xffu >> type) & (byte); + + state = pm_utf_8_dfa[256 + (state * 16) + type]; + if (state == 0) { + *width = index + 1; + return (pm_unicode_codepoint_t) codepoint; + } + } + + *width = 0; + return 0; +} + +/** + * Return the size of the next character in the UTF-8 encoding. + */ +size_t +pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) { + assert(n >= 0); + + size_t maximum = (n > 4) ? 4 : ((size_t) n); + uint32_t state = 0; + + for (size_t index = 0; index < maximum; index++) { + state = pm_utf_8_dfa[256 + (state * 16) + pm_utf_8_dfa[b[index]]]; + if (state == 0) return index + 1; + } + + return 0; +} + +/** + * Return the size of the next character in the UTF-8 encoding if it is an + * alphabetical character. + */ +size_t +pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + + if (*b < 0x80) { + return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0; + } + + size_t width; + pm_unicode_codepoint_t codepoint = pm_utf_8_codepoint(b, n, &width); + + if (codepoint <= 0xFF) { + return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_ALPHABETIC_BIT) ? width : 0; + } else { + return pm_unicode_codepoint_match(codepoint, unicode_alpha_codepoints, UNICODE_ALPHA_CODEPOINTS_LENGTH) ? width : 0; + } +} + +/** + * Return the size of the next character in the UTF-8 encoding if it is an + * alphanumeric character. + */ +size_t +pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + + if (*b < 0x80) { + return (pm_encoding_unicode_table[*b] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; + } + + size_t width; + pm_unicode_codepoint_t codepoint = pm_utf_8_codepoint(b, n, &width); + + if (codepoint <= 0xFF) { + return (pm_encoding_unicode_table[(uint8_t) codepoint] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 
width : 0; + } else { + return pm_unicode_codepoint_match(codepoint, unicode_alnum_codepoints, UNICODE_ALNUM_CODEPOINTS_LENGTH) ? width : 0; + } +} + +/** + * Return true if the next character in the UTF-8 encoding if it is an uppercase + * character. + */ +bool +pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + + if (*b < 0x80) { + return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false; + } + + size_t width; + pm_unicode_codepoint_t codepoint = pm_utf_8_codepoint(b, n, &width); + + if (codepoint <= 0xFF) { + return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false; + } else { + return pm_unicode_codepoint_match(codepoint, unicode_isupper_codepoints, UNICODE_ISUPPER_CODEPOINTS_LENGTH) ? true : false; + } +} + +#ifndef PRISM_ENCODING_EXCLUDE_FULL + +static pm_unicode_codepoint_t +pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) { + + if ((n > 0) && (b[0] < 0x80)) { + *width = 1; + return (pm_unicode_codepoint_t) b[0]; + } + + if (n > 1 && b[0] >= 0xC2 && b[0] <= 0xDF && b[1] >= 0x80 && b[1] <= 0xBF) { + *width = 2; + + // 110xxxxx 10xxxxxx + return (pm_unicode_codepoint_t) (((b[0] & 0x1F) << 6) | (b[1] & 0x3F)); + } + + if (n > 5 && b[0] == 0xED && b[1] >= 0xA0 && b[1] <= 0xAF && b[2] >= 0x80 && b[2] <= 0xBF && b[3] == 0xED && b[4] >= 0xB0 && b[4] <= 0xBF && b[5] >= 0x80 && b[5] <= 0xBF) { + *width = 6; + + // 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx + return (pm_unicode_codepoint_t) (0x10000 + (((b[1] & 0xF) << 16) | ((b[2] & 0x3F) << 10) | ((b[4] & 0xF) << 6) | (b[5] & 0x3F))); + } + + if (n > 2 && b[0] == 0xED && b[1] >= 0xA0 && b[1] <= 0xBF) { + *width = 3; + + // 11101101 1010xxxx 10xxxxx + return (pm_unicode_codepoint_t) (0x10000 + (((b[0] & 0x03) << 16) | ((b[1] & 0x3F) << 10) | (b[2] & 0x3F))); + } + + if (n > 2 && ((b[0] == 0xE0 && b[1] >= 0xA0) || (b[0] >= 0xE1 && b[0] <= 0xEF && b[1] >= 0x80)) && b[1] <= 
0xBF && b[2] >= 0x80 && b[2] <= 0xBF) { + *width = 3; + + // 1110xxxx 10xxxxxx 10xxxxx + return (pm_unicode_codepoint_t) (((b[0] & 0xF) << 12) | ((b[1] & 0x3F) << 6) | (b[2] & 0x3F)); + } + + *width = 0; + return 0; +} + +static size_t +pm_encoding_cesu_8_char_width(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + + size_t width; + pm_cesu_8_codepoint(b, n, &width); + return width; +} + +static size_t +pm_encoding_cesu_8_alpha_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + + if (*b < 0x80) { + return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0; + } + + size_t width; + pm_unicode_codepoint_t codepoint = pm_cesu_8_codepoint(b, n, &width); + + if (codepoint <= 0xFF) { + return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_ALPHABETIC_BIT) ? width : 0; + } else { + return pm_unicode_codepoint_match(codepoint, unicode_alpha_codepoints, UNICODE_ALPHA_CODEPOINTS_LENGTH) ? width : 0; + } +} + +static size_t +pm_encoding_cesu_8_alnum_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + + if (*b < 0x80) { + return (pm_encoding_unicode_table[*b] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; + } + + size_t width; + pm_unicode_codepoint_t codepoint = pm_cesu_8_codepoint(b, n, &width); + + if (codepoint <= 0xFF) { + return (pm_encoding_unicode_table[(uint8_t) codepoint] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? width : 0; + } else { + return pm_unicode_codepoint_match(codepoint, unicode_alnum_codepoints, UNICODE_ALNUM_CODEPOINTS_LENGTH) ? width : 0; + } +} + +static bool +pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + + if (*b < 0x80) { + return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? 
true : false; + } + + size_t width; + pm_unicode_codepoint_t codepoint = pm_cesu_8_codepoint(b, n, &width); + + if (codepoint <= 0xFF) { + return (pm_encoding_unicode_table[(uint8_t) codepoint] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false; + } else { + return pm_unicode_codepoint_match(codepoint, unicode_isupper_codepoints, UNICODE_ISUPPER_CODEPOINTS_LENGTH) ? true : false; + } +} + +#endif + +#undef UNICODE_ALPHA_CODEPOINTS_LENGTH +#undef UNICODE_ALNUM_CODEPOINTS_LENGTH +#undef UNICODE_ISUPPER_CODEPOINTS_LENGTH + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding US-ASCII character. + */ +static const uint8_t pm_encoding_ascii_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +#ifndef PRISM_ENCODING_EXCLUDE_FULL + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding CP850 character. 
+ */ +static const uint8_t pm_encoding_cp850_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding CP852 character. 
+ */ +static const uint8_t pm_encoding_cp852_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding CP855 character. 
+ */ +static const uint8_t pm_encoding_cp855_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding GB1988 character. 
+ */ +static const uint8_t pm_encoding_gb1988_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM437 character. 
+ */ +static const uint8_t pm_encoding_ibm437_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM720 character. 
+ */ +static const uint8_t pm_encoding_ibm720_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM737 character. 
+ */ +static const uint8_t pm_encoding_ibm737_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM775 character. 
+ */ +static const uint8_t pm_encoding_ibm775_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM852 character. 
+ */ +static const uint8_t pm_encoding_ibm852_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM855 character. 
+ */ +static const uint8_t pm_encoding_ibm855_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM857 character. 
+ */ +static const uint8_t pm_encoding_ibm857_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM860 character. 
+ */ +static const uint8_t pm_encoding_ibm860_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM861 character. 
+ */ +static const uint8_t pm_encoding_ibm861_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM862 character. 
+ */ +static const uint8_t pm_encoding_ibm862_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM863 character. 
+ */ +static const uint8_t pm_encoding_ibm863_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM864 character. 
+ */ +static const uint8_t pm_encoding_ibm864_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM865 character. 
+ */ +static const uint8_t pm_encoding_ibm865_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM866 character. 
+ */ +static const uint8_t pm_encoding_ibm866_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding IBM869 character. 
+ */ +static const uint8_t pm_encoding_ibm869_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-1 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_1_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Bx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-2 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_2_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 7, 0, 7, 0, 7, 7, 0, 0, 7, 7, 7, 7, 0, 7, 7, // Ax + 0, 3, 0, 3, 0, 3, 3, 0, 0, 3, 3, 3, 3, 0, 3, 3, // Bx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-3 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_3_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 7, 0, 0, 0, 0, 7, 0, 0, 7, 7, 7, 7, 0, 0, 7, // Ax + 0, 3, 0, 0, 0, 3, 3, 0, 0, 3, 3, 3, 3, 0, 0, 3, // Bx + 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 0, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx + 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 0, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-4 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_4_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 7, 3, 7, 0, 7, 7, 0, 0, 7, 7, 7, 7, 0, 7, 0, // Ax + 0, 3, 0, 3, 0, 3, 3, 0, 0, 3, 3, 3, 3, 7, 3, 3, // Bx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-5 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_5_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, // Ax + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Bx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-6 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_6_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Cx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-7 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_7_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 7, 0, 7, 7, 7, 0, 7, 0, 7, 7, // Bx + 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-8 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_8_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-9 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_9_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Bx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-10 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_10_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 0, 7, 7, // Ax + 0, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 0, 3, 3, // Bx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-11 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_11_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ax + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Bx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Cx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-13 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_13_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, // Ax + 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 3, 0, 0, 0, 0, 3, // Bx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-14 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_14_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 7, 3, 0, 7, 3, 7, 0, 7, 0, 7, 3, 7, 0, 0, 7, // Ax + 7, 3, 7, 3, 7, 3, 0, 7, 3, 3, 3, 7, 3, 7, 3, 3, // Bx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-15 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_15_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 7, 0, 3, 0, 3, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 7, 3, 0, 0, 3, 0, 3, 0, 7, 3, 7, 0, // Bx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding ISO-8859-16 character. 
+ */ +static const uint8_t pm_encoding_iso_8859_16_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 7, 3, 7, 0, 0, 7, 0, 3, 0, 7, 0, 7, 0, 3, 7, // Ax + 0, 0, 7, 3, 7, 0, 0, 0, 3, 3, 3, 0, 7, 3, 7, 3, // Bx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding KOI8-R character. 
+ */ +static const uint8_t pm_encoding_koi8_r_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Cx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Dx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Ex + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding KOI8-U character. 
+ */ +static const uint8_t pm_encoding_koi8_u_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 3, 3, 0, 3, 3, 0, 0, 0, 0, 0, 3, 0, 0, // Ax + 0, 0, 0, 7, 7, 0, 7, 7, 0, 0, 0, 0, 0, 7, 0, 0, // Bx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Cx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Dx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Ex + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding macCentEuro character. 
+ */ +static const uint8_t pm_encoding_mac_cent_euro_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding macCroatian character. 
+ */ +static const uint8_t pm_encoding_mac_croatian_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + + /** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding macCyrillic character. 
+ */ +static const uint8_t pm_encoding_mac_cyrillic_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding macGreek character. 
+ */ +static const uint8_t pm_encoding_mac_greek_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding macIceland character. 
+ */ +static const uint8_t pm_encoding_mac_iceland_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding macRoman character. 
+ */ +static const uint8_t pm_encoding_mac_roman_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding macRomania character. 
+ */ +static const uint8_t pm_encoding_mac_romania_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding macThai character. 
+ */ +static const uint8_t pm_encoding_mac_thai_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding TIS-620 character. 
+ */ +static const uint8_t pm_encoding_tis_620_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ax + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Bx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Cx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding macTurkish character. 
+ */ +static const uint8_t pm_encoding_mac_turkish_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding macUkraine character. 
+ */ +static const uint8_t pm_encoding_mac_ukraine_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding windows-1250 character. 
+ */ +static const uint8_t pm_encoding_windows_1250_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 7, 7, 7, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 3, 3, 3, // 9x + 0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7, // Ax + 0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 7, 0, 3, 3, // Bx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx +}; + +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding windows-1251 character. 
 */
// The entries used here are 0, 2, 3 and 7. Judging by the ASCII rows (digits
// are 2, lowercase letters 3, uppercase letters 7) they presumably combine the
// PRISM_ENCODING_ALPHABETIC/ALPHANUMERIC/UPPERCASE bits - confirm against the
// macro definitions, which are not part of this section.
static const uint8_t pm_encoding_windows_1251_table[256] = {
//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
    7, 7, 0, 3, 0, 0, 0, 0, 0, 0, 7, 0, 7, 7, 7, 7, // 8x
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 3, 3, 3, // 9x
    0, 7, 3, 7, 0, 7, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, // Ax
    0, 0, 7, 3, 3, 3, 0, 0, 3, 0, 3, 0, 3, 7, 3, 3, // Bx
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Dx
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};

/**
 * Each element of the following table contains a bitfield that indicates a
 * piece of information about the corresponding windows-1252 character.
 */
// The entries used here are 0, 2, 3 and 7. Judging by the ASCII rows (digits
// are 2, lowercase letters 3, uppercase letters 7) they presumably combine the
// PRISM_ENCODING_ALPHABETIC/ALPHANUMERIC/UPPERCASE bits - confirm against the
// macro definitions, which are not part of this section.
static const uint8_t pm_encoding_windows_1252_table[256] = {
//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 0, 7, 0, // 8x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 3, 7, // 9x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Ax
    0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Bx
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
    3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};

/**
 * Each element of the following table contains a bitfield that indicates a
 * piece of information about the corresponding windows-1253 character.
 */
// The entries used here are 0, 2, 3 and 7. Judging by the ASCII rows (digits
// are 2, lowercase letters 3, uppercase letters 7) they presumably combine the
// PRISM_ENCODING_ALPHABETIC/ALPHANUMERIC/UPPERCASE bits - confirm against the
// macro definitions, which are not part of this section.
static const uint8_t pm_encoding_windows_1253_table[256] = {
//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
    0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax
    0, 0, 0, 0, 0, 3, 7, 0, 7, 7, 7, 0, 7, 0, 7, 7, // Bx
    3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
    7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3, // Dx
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};

/**
 * Each element of the following table contains a bitfield that indicates a
 * piece of information about the corresponding windows-1254 character.
 */
// The entries used here are 0, 2, 3 and 7. Judging by the ASCII rows (digits
// are 2, lowercase letters 3, uppercase letters 7) they presumably combine the
// PRISM_ENCODING_ALPHABETIC/ALPHANUMERIC/UPPERCASE bits - confirm against the
// macro definitions, which are not part of this section.
static const uint8_t pm_encoding_windows_1254_table[256] = {
//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 0, 0, 0, // 8x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 7, // 9x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Ax
    0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, // Bx
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
    7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
    3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, // Fx
};

/**
 * Each element of the following table contains a bitfield that indicates a
 * piece of information about the corresponding windows-1255 character.
 */
// The entries used here are 0, 2, 3 and 7. Judging by the ASCII rows (digits
// are 2, lowercase letters 3, uppercase letters 7) they presumably combine the
// PRISM_ENCODING_ALPHABETIC/ALPHANUMERIC/UPPERCASE bits - confirm against the
// macro definitions, which are not part of this section.
static const uint8_t pm_encoding_windows_1255_table[256] = {
//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax
    0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // Fx
};

/**
 * Each element of the following table contains a bitfield that indicates a
 * piece of information about the corresponding windows-1256 character.
 */
// The entries used here are 0, 2, 3 and 7. Judging by the ASCII rows (digits
// are 2, lowercase letters 3, uppercase letters 7) they presumably combine the
// PRISM_ENCODING_ALPHABETIC/ALPHANUMERIC/UPPERCASE bits - confirm against the
// macro definitions, which are not part of this section.
static const uint8_t pm_encoding_windows_1256_table[256] = {
//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx
    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Cx
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // Dx
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
    3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
};

/**
 * Each element of the following table contains a bitfield that indicates a
 * piece of information about the corresponding windows-1257 character.
 */
// The entries used here are 0, 2, 3 and 7. Judging by the ASCII rows (digits
// are 2, lowercase letters 3, uppercase letters 7) they presumably combine the
// PRISM_ENCODING_ALPHABETIC/ALPHANUMERIC/UPPERCASE bits - confirm against the
// macro definitions, which are not part of this section.
static const uint8_t pm_encoding_windows_1257_table[256] = {
//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
    0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, // Ax
    0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 3, 0, 0, 0, 0, 3, // Bx
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
    7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
    3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
};

/**
 * Each element of the following table contains a bitfield that indicates a
 * piece of information about the corresponding windows-1258 character.
 */
// The entries used here are 0, 2, 3 and 7. Judging by the ASCII rows (digits
// are 2, lowercase letters 3, uppercase letters 7) they presumably combine the
// PRISM_ENCODING_ALPHABETIC/ALPHANUMERIC/UPPERCASE bits - confirm against the
// macro definitions, which are not part of this section.
// NOTE(review): every entry from 0x80 upwards is 0, so this table classifies
// no non-ASCII byte at all - confirm that this is intentional.
static const uint8_t pm_encoding_windows_1258_table[256] = {
//  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
};

/**
 * Each element of the following table contains a bitfield that indicates a
 * piece of information about the corresponding windows-874 character.
+ */ +static const uint8_t pm_encoding_windows_874_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +#define PRISM_ENCODING_TABLE(name) \ + static size_t pm_encoding_ ##name ## _alpha_char(const uint8_t *b, ptrdiff_t n) { \ + return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT)); \ + } \ + static size_t pm_encoding_ ##name ## _alnum_char(const uint8_t *b, ptrdiff_t n) { \ + return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 
1 : 0; \ + } \ + static bool pm_encoding_ ##name ## _isupper_char(const uint8_t *b, ptrdiff_t n) { \ + return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT)); \ + } + +PRISM_ENCODING_TABLE(cp850) +PRISM_ENCODING_TABLE(cp852) +PRISM_ENCODING_TABLE(cp855) +PRISM_ENCODING_TABLE(gb1988) +PRISM_ENCODING_TABLE(ibm437) +PRISM_ENCODING_TABLE(ibm720) +PRISM_ENCODING_TABLE(ibm737) +PRISM_ENCODING_TABLE(ibm775) +PRISM_ENCODING_TABLE(ibm852) +PRISM_ENCODING_TABLE(ibm855) +PRISM_ENCODING_TABLE(ibm857) +PRISM_ENCODING_TABLE(ibm860) +PRISM_ENCODING_TABLE(ibm861) +PRISM_ENCODING_TABLE(ibm862) +PRISM_ENCODING_TABLE(ibm863) +PRISM_ENCODING_TABLE(ibm864) +PRISM_ENCODING_TABLE(ibm865) +PRISM_ENCODING_TABLE(ibm866) +PRISM_ENCODING_TABLE(ibm869) +PRISM_ENCODING_TABLE(iso_8859_1) +PRISM_ENCODING_TABLE(iso_8859_2) +PRISM_ENCODING_TABLE(iso_8859_3) +PRISM_ENCODING_TABLE(iso_8859_4) +PRISM_ENCODING_TABLE(iso_8859_5) +PRISM_ENCODING_TABLE(iso_8859_6) +PRISM_ENCODING_TABLE(iso_8859_7) +PRISM_ENCODING_TABLE(iso_8859_8) +PRISM_ENCODING_TABLE(iso_8859_9) +PRISM_ENCODING_TABLE(iso_8859_10) +PRISM_ENCODING_TABLE(iso_8859_11) +PRISM_ENCODING_TABLE(iso_8859_13) +PRISM_ENCODING_TABLE(iso_8859_14) +PRISM_ENCODING_TABLE(iso_8859_15) +PRISM_ENCODING_TABLE(iso_8859_16) +PRISM_ENCODING_TABLE(koi8_r) +PRISM_ENCODING_TABLE(koi8_u) +PRISM_ENCODING_TABLE(mac_cent_euro) +PRISM_ENCODING_TABLE(mac_croatian) +PRISM_ENCODING_TABLE(mac_cyrillic) +PRISM_ENCODING_TABLE(mac_greek) +PRISM_ENCODING_TABLE(mac_iceland) +PRISM_ENCODING_TABLE(mac_roman) +PRISM_ENCODING_TABLE(mac_romania) +PRISM_ENCODING_TABLE(mac_thai) +PRISM_ENCODING_TABLE(mac_turkish) +PRISM_ENCODING_TABLE(mac_ukraine) +PRISM_ENCODING_TABLE(tis_620) +PRISM_ENCODING_TABLE(windows_1250) +PRISM_ENCODING_TABLE(windows_1251) +PRISM_ENCODING_TABLE(windows_1252) +PRISM_ENCODING_TABLE(windows_1253) +PRISM_ENCODING_TABLE(windows_1254) +PRISM_ENCODING_TABLE(windows_1255) +PRISM_ENCODING_TABLE(windows_1256) 
+PRISM_ENCODING_TABLE(windows_1257) +PRISM_ENCODING_TABLE(windows_1258) +PRISM_ENCODING_TABLE(windows_874) + +#undef PRISM_ENCODING_TABLE +#endif + +/** + * Returns the size of the next character in the ASCII encoding. This basically + * means that if the top bit is not set, the character is 1 byte long. + */ +static size_t +pm_encoding_ascii_char_width(const uint8_t *b, ptrdiff_t n) { + return ((n > 0) && (*b < 0x80)) ? 1 : 0; +} + +/** + * Return the size of the next character in the ASCII encoding if it is an + * alphabetical character. + */ +static size_t +pm_encoding_ascii_alpha_char(const uint8_t *b, ptrdiff_t n) { + return (n > 0) ? (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) : 0; +} + +/** + * Certain encodings are equivalent to ASCII below 0x80, so it works for our + * purposes to have a function here that first checks the bounds and then falls + * back to checking the ASCII lookup table. + */ +static size_t +pm_encoding_ascii_alpha_char_7bit(const uint8_t *b, ptrdiff_t n) { + return ((n > 0) && (*b < 0x80)) ? pm_encoding_ascii_alpha_char(b, n) : 0; +} + +/** + * Return the size of the next character in the ASCII encoding if it is an + * alphanumeric character. + */ +static size_t +pm_encoding_ascii_alnum_char(const uint8_t *b, ptrdiff_t n) { + return ((n > 0) && (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; +} + +/** + * Certain encodings are equivalent to ASCII below 0x80, so it works for our + * purposes to have a function here that first checks the bounds and then falls + * back to checking the ASCII lookup table. + */ +static size_t +pm_encoding_ascii_alnum_char_7bit(const uint8_t *b, ptrdiff_t n) { + return ((n > 0) && (*b < 0x80)) ? pm_encoding_ascii_alnum_char(b, n) : 0; +} + +/** + * Return true if the next character in the ASCII encoding if it is an uppercase + * character. 
+ */ +static bool +pm_encoding_ascii_isupper_char(const uint8_t *b, ptrdiff_t n) { + return (n > 0) && (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT); +} + +/** + * For a lot of encodings the default is that they are a single byte long no + * matter what the codepoint, so this function is shared between them. + */ +static size_t +pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { + return 1; +} + +/** + * Returns the size of the next character in the EUC-JP encoding, or 0 if a + * character cannot be decoded from the given bytes. + */ +static size_t +pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the single byte characters. + if ((n > 0) && (*b < 0x80)) { + return 1; + } + + // These are the double byte characters. + if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) { + return 2; + } + + // These are the triple byte characters. + if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) { + return 3; + } + + return 0; +} + +/** + * Returns the size of the next character in the EUC-JP encoding if it is an + * uppercase character. + */ +static bool +pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) { + size_t width = pm_encoding_euc_jp_char_width(b, n); + + if (width == 1) { + return pm_encoding_ascii_isupper_char(b, n); + } else if (width == 2) { + return ( + (b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) || + (b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) || + (b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1) + ); + } else { + return false; + } +} + +/** + * Returns the size of the next character in the Shift_JIS encoding, or 0 if a + * character cannot be decoded from the given bytes. + */ +static size_t +pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + // These are the single byte characters. 
+ if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) { + return 1; + } + + // These are the double byte characters. + if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) { + return 2; + } + + return 0; +} + +/** + * Returns the size of the next character in the Shift_JIS encoding if it is an + * alphanumeric character. + */ +static size_t +pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) { + size_t width = pm_encoding_shift_jis_char_width(b, n); + return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n)) : width; +} + +/** + * Returns the size of the next character in the Shift_JIS encoding if it is an + * alphabetical character. + */ +static size_t +pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) { + size_t width = pm_encoding_shift_jis_char_width(b, n); + return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n)) : width; +} + +/** + * Returns the size of the next character in the Shift_JIS encoding if it is an + * uppercase character. + */ +static bool +pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) { + size_t width = pm_encoding_shift_jis_char_width(b, n); + + if (width == 1) { + return pm_encoding_ascii_isupper_char(b, n); + } else if (width == 2) { + return ( + ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) || + ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) || + ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60)) + ); + } else { + return width; + } +} + +#ifndef PRISM_ENCODING_EXCLUDE_FULL + +/** + * Certain encodings are equivalent to ASCII below 0x80, so it works for our + * purposes to have a function here that first checks the bounds and then falls + * back to checking the ASCII lookup table. 
+ */ +static bool +pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) { + return (n > 0) && (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n); +} + +/** + * Returns the size of the next character in the Big5 encoding, or 0 if a + * character cannot be decoded from the given bytes. + */ +static size_t +pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the single byte characters. + if ((n > 0) && (*b < 0x80)) { + return 1; + } + + // These are the double byte characters. + if ((n > 1) && (b[0] >= 0xA1 && b[0] <= 0xFE) && ((b[1] >= 0x40 && b[1] <= 0x7E) || (b[1] >= 0xA1 && b[1] <= 0xFE))) { + return 2; + } + + return 0; +} + +/** + * Returns the size of the next character in the CP949 encoding, or 0 if a + * character cannot be decoded from the given bytes. + */ +static size_t +pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the single byte characters + if ((n > 0) && (*b <= 0x80)) { + return 1; + } + + // These are the double byte characters + if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xFE) && ((b[1] >= 0x41 && b[1] <= 0x5A) || (b[1] >= 0x61 && b[1] <= 0x7A) || (b[1] >= 0x81 && b[1] <= 0xFE))) { + return 2; + } + + return 0; +} + +/** + * Returns the size of the next character in the Emacs MULE encoding, or 0 if a + * character cannot be decoded from the given bytes. + */ +static size_t +pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the 1 byte characters. + if ((n > 0) && (*b < 0x80)) { + return 1; + } + + // These are the 2 byte characters. + if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0x8F) && (b[1] >= 0xA0)) { + return 2; + } + + // These are the 3 byte characters. + if ( + (n > 2) && + ( + ((b[0] >= 0x90 && b[0] <= 0x99) && (b[1] >= 0xA0)) || + ((b[0] == 0x9A || b[0] == 0x9B) && (b[1] >= 0xE0 && b[1] <= 0xEF)) + ) && + (b[2] >= 0xA0) + ) { + return 3; + } + + // These are the 4 byte characters. 
+ if ( + (n > 3) && + ( + ((b[0] == 0x9C) && (b[1] >= 0xF0) && (b[1] <= 0xF4)) || + ((b[0] == 0x9D) && (b[1] >= 0xF5) && (b[1] <= 0xFE)) + ) && + (b[2] >= 0xA0) && (b[3] >= 0xA0) + ) { + return 4; + } + + return 0; +} + +/** + * Returns the size of the next character in the EUC-KR encoding, or 0 if a + * character cannot be decoded from the given bytes. + */ +static size_t +pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the single byte characters. + if ((n > 0) && (*b < 0x80)) { + return 1; + } + + // These are the double byte characters. + if ((n > 1) && (b[0] >= 0xA1 && b[0] <= 0xFE) && (b[1] >= 0xA1 && b[1] <= 0xFE)) { + return 2; + } + + return 0; +} + +/** + * Returns the size of the next character in the EUC-TW encoding, or 0 if a + * character cannot be decoded from the given bytes. + */ +static size_t +pm_encoding_euc_tw_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the single byte characters. + if ((n > 0) && (*b < 0x80)) { + return 1; + } + + // These are the double byte characters. + if ((n > 1) && (b[0] >= 0xA1) && (b[0] <= 0xFE) && (b[1] >= 0xA1) && (b[1] <= 0xFE)) { + return 2; + } + + // These are the quadruple byte characters. + if ((n > 3) && (b[0] == 0x8E) && (b[1] >= 0xA1) && (b[1] <= 0xB0) && (b[2] >= 0xA1) && (b[2] <= 0xFE) && (b[3] >= 0xA1) && (b[3] <= 0xFE)) { + return 4; + } + + return 0; +} + +/** + * Returns the size of the next character in the GB18030 encoding, or 0 if a + * character cannot be decoded from the given bytes. + */ +static size_t +pm_encoding_gb18030_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the 1 byte characters. + if ((n > 0) && (*b < 0x80)) { + return 1; + } + + // These are the 2 byte characters. + if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xFE) && (b[1] >= 0x40 && b[1] <= 0xFE && b[1] != 0x7F)) { + return 2; + } + + // These are the 4 byte characters. 
+ if ((n > 3) && ((b[0] >= 0x81 && b[0] <= 0xFE) && (b[1] >= 0x30 && b[1] <= 0x39) && (b[2] >= 0x81 && b[2] <= 0xFE) && (b[3] >= 0x30 && b[3] <= 0x39))) { + return 4; + } + + return 0; +} + +/** + * Returns the size of the next character in the GBK encoding, or 0 if a + * character cannot be decoded from the given bytes. + */ +static size_t +pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the single byte characters. + if ((n > 0) && (*b <= 0x80)) { + return 1; + } + + // These are the double byte characters. + if ( + (n > 1) && + ( + ((b[0] >= 0xA1 && b[0] <= 0xA9) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // GBK/1 + ((b[0] >= 0xB0 && b[0] <= 0xF7) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // GBK/2 + ((b[0] >= 0x81 && b[0] <= 0xA0) && (b[1] >= 0x40 && b[1] <= 0xFE) && (b[1] != 0x7F)) || // GBK/3 + ((b[0] >= 0xAA && b[0] <= 0xFE) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) || // GBK/4 + ((b[0] >= 0xA8 && b[0] <= 0xA9) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) || // GBK/5 + ((b[0] >= 0xAA && b[0] <= 0xAF) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // user-defined 1 + ((b[0] >= 0xF8 && b[0] <= 0xFE) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || // user-defined 2 + ((b[0] >= 0xA1 && b[0] <= 0xA7) && (b[1] >= 0x40 && b[1] <= 0xA0) && (b[1] != 0x7F)) // user-defined 3 + ) + ) { + return 2; + } + + return 0; +} + +#endif + +/** + * This is the table of all of the encodings that prism supports. 
+ */ +const pm_encoding_t pm_encodings[] = { + [PM_ENCODING_UTF_8] = { + .name = "UTF-8", + .char_width = pm_encoding_utf_8_char_width, + .alnum_char = pm_encoding_utf_8_alnum_char, + .alpha_char = pm_encoding_utf_8_alpha_char, + .isupper_char = pm_encoding_utf_8_isupper_char, + .multibyte = true + }, + [PM_ENCODING_US_ASCII] = { + .name = "US-ASCII", + .char_width = pm_encoding_ascii_char_width, + .alnum_char = pm_encoding_ascii_alnum_char, + .alpha_char = pm_encoding_ascii_alpha_char, + .isupper_char = pm_encoding_ascii_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ASCII_8BIT] = { + .name = "ASCII-8BIT", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ascii_alnum_char, + .alpha_char = pm_encoding_ascii_alpha_char, + .isupper_char = pm_encoding_ascii_isupper_char, + .multibyte = false + }, + [PM_ENCODING_EUC_JP] = { + .name = "EUC-JP", + .char_width = pm_encoding_euc_jp_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_euc_jp_isupper_char, + .multibyte = true + }, + [PM_ENCODING_WINDOWS_31J] = { + .name = "Windows-31J", + .char_width = pm_encoding_shift_jis_char_width, + .alnum_char = pm_encoding_shift_jis_alnum_char, + .alpha_char = pm_encoding_shift_jis_alpha_char, + .isupper_char = pm_encoding_shift_jis_isupper_char, + .multibyte = true + }, + +#ifndef PRISM_ENCODING_EXCLUDE_FULL + [PM_ENCODING_BIG5] = { + .name = "Big5", + .char_width = pm_encoding_big5_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_BIG5_HKSCS] = { + .name = "Big5-HKSCS", + .char_width = pm_encoding_big5_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + 
[PM_ENCODING_BIG5_UAO] = { + .name = "Big5-UAO", + .char_width = pm_encoding_big5_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_CESU_8] = { + .name = "CESU-8", + .char_width = pm_encoding_cesu_8_char_width, + .alnum_char = pm_encoding_cesu_8_alnum_char, + .alpha_char = pm_encoding_cesu_8_alpha_char, + .isupper_char = pm_encoding_cesu_8_isupper_char, + .multibyte = true + }, + [PM_ENCODING_CP51932] = { + .name = "CP51932", + .char_width = pm_encoding_euc_jp_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_euc_jp_isupper_char, + .multibyte = true + }, + [PM_ENCODING_CP850] = { + .name = "CP850", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_cp850_alnum_char, + .alpha_char = pm_encoding_cp850_alpha_char, + .isupper_char = pm_encoding_cp850_isupper_char, + .multibyte = false + }, + [PM_ENCODING_CP852] = { + .name = "CP852", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_cp852_alnum_char, + .alpha_char = pm_encoding_cp852_alpha_char, + .isupper_char = pm_encoding_cp852_isupper_char, + .multibyte = false + }, + [PM_ENCODING_CP855] = { + .name = "CP855", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_cp855_alnum_char, + .alpha_char = pm_encoding_cp855_alpha_char, + .isupper_char = pm_encoding_cp855_isupper_char, + .multibyte = false + }, + [PM_ENCODING_CP949] = { + .name = "CP949", + .char_width = pm_encoding_cp949_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_CP950] = { + .name = "CP950", + .char_width = pm_encoding_big5_char_width, + .alnum_char = 
pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_CP951] = { + .name = "CP951", + .char_width = pm_encoding_big5_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_EMACS_MULE] = { + .name = "Emacs-Mule", + .char_width = pm_encoding_emacs_mule_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_EUC_JP_MS] = { + .name = "eucJP-ms", + .char_width = pm_encoding_euc_jp_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_euc_jp_isupper_char, + .multibyte = true + }, + [PM_ENCODING_EUC_JIS_2004] = { + .name = "EUC-JIS-2004", + .char_width = pm_encoding_euc_jp_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_euc_jp_isupper_char, + .multibyte = true + }, + [PM_ENCODING_EUC_KR] = { + .name = "EUC-KR", + .char_width = pm_encoding_euc_kr_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_EUC_TW] = { + .name = "EUC-TW", + .char_width = pm_encoding_euc_tw_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_GB12345] = { + .name = "GB12345", + .char_width = pm_encoding_euc_kr_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char 
= pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_GB18030] = { + .name = "GB18030", + .char_width = pm_encoding_gb18030_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_GB1988] = { + .name = "GB1988", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_gb1988_alnum_char, + .alpha_char = pm_encoding_gb1988_alpha_char, + .isupper_char = pm_encoding_gb1988_isupper_char, + .multibyte = false + }, + [PM_ENCODING_GB2312] = { + .name = "GB2312", + .char_width = pm_encoding_euc_kr_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_GBK] = { + .name = "GBK", + .char_width = pm_encoding_gbk_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_IBM437] = { + .name = "IBM437", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm437_alnum_char, + .alpha_char = pm_encoding_ibm437_alpha_char, + .isupper_char = pm_encoding_ibm437_isupper_char, + .multibyte = false + }, + [PM_ENCODING_IBM720] = { + .name = "IBM720", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm720_alnum_char, + .alpha_char = pm_encoding_ibm720_alpha_char, + .isupper_char = pm_encoding_ibm720_isupper_char, + .multibyte = false + }, + [PM_ENCODING_IBM737] = { + .name = "IBM737", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm737_alnum_char, + .alpha_char = pm_encoding_ibm737_alpha_char, + .isupper_char = pm_encoding_ibm737_isupper_char, + .multibyte = false + }, 
+ [PM_ENCODING_IBM775] = { + .name = "IBM775", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm775_alnum_char, + .alpha_char = pm_encoding_ibm775_alpha_char, + .isupper_char = pm_encoding_ibm775_isupper_char, + .multibyte = false + }, + [PM_ENCODING_IBM852] = { + .name = "IBM852", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm852_alnum_char, + .alpha_char = pm_encoding_ibm852_alpha_char, + .isupper_char = pm_encoding_ibm852_isupper_char, + .multibyte = false + }, + [PM_ENCODING_IBM855] = { + .name = "IBM855", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm855_alnum_char, + .alpha_char = pm_encoding_ibm855_alpha_char, + .isupper_char = pm_encoding_ibm855_isupper_char, + .multibyte = false + }, + [PM_ENCODING_IBM857] = { + .name = "IBM857", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm857_alnum_char, + .alpha_char = pm_encoding_ibm857_alpha_char, + .isupper_char = pm_encoding_ibm857_isupper_char, + .multibyte = false + }, + [PM_ENCODING_IBM860] = { + .name = "IBM860", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm860_alnum_char, + .alpha_char = pm_encoding_ibm860_alpha_char, + .isupper_char = pm_encoding_ibm860_isupper_char, + .multibyte = false + }, + [PM_ENCODING_IBM861] = { + .name = "IBM861", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm861_alnum_char, + .alpha_char = pm_encoding_ibm861_alpha_char, + .isupper_char = pm_encoding_ibm861_isupper_char, + .multibyte = false + }, + [PM_ENCODING_IBM862] = { + .name = "IBM862", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm862_alnum_char, + .alpha_char = pm_encoding_ibm862_alpha_char, + .isupper_char = pm_encoding_ibm862_isupper_char, + .multibyte = false + }, + [PM_ENCODING_IBM863] = { + .name = "IBM863", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm863_alnum_char, + 
.alpha_char = pm_encoding_ibm863_alpha_char, + .isupper_char = pm_encoding_ibm863_isupper_char, + .multibyte = false + }, + [PM_ENCODING_IBM864] = { + .name = "IBM864", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm864_alnum_char, + .alpha_char = pm_encoding_ibm864_alpha_char, + .isupper_char = pm_encoding_ibm864_isupper_char, + .multibyte = false + }, + [PM_ENCODING_IBM865] = { + .name = "IBM865", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm865_alnum_char, + .alpha_char = pm_encoding_ibm865_alpha_char, + .isupper_char = pm_encoding_ibm865_isupper_char, + .multibyte = false + }, + [PM_ENCODING_IBM866] = { + .name = "IBM866", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm866_alnum_char, + .alpha_char = pm_encoding_ibm866_alpha_char, + .isupper_char = pm_encoding_ibm866_isupper_char, + .multibyte = false + }, + [PM_ENCODING_IBM869] = { + .name = "IBM869", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_ibm869_alnum_char, + .alpha_char = pm_encoding_ibm869_alpha_char, + .isupper_char = pm_encoding_ibm869_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_1] = { + .name = "ISO-8859-1", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_1_alnum_char, + .alpha_char = pm_encoding_iso_8859_1_alpha_char, + .isupper_char = pm_encoding_iso_8859_1_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_2] = { + .name = "ISO-8859-2", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_2_alnum_char, + .alpha_char = pm_encoding_iso_8859_2_alpha_char, + .isupper_char = pm_encoding_iso_8859_2_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_3] = { + .name = "ISO-8859-3", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_3_alnum_char, + .alpha_char = pm_encoding_iso_8859_3_alpha_char, + .isupper_char = 
pm_encoding_iso_8859_3_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_4] = { + .name = "ISO-8859-4", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_4_alnum_char, + .alpha_char = pm_encoding_iso_8859_4_alpha_char, + .isupper_char = pm_encoding_iso_8859_4_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_5] = { + .name = "ISO-8859-5", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_5_alnum_char, + .alpha_char = pm_encoding_iso_8859_5_alpha_char, + .isupper_char = pm_encoding_iso_8859_5_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_6] = { + .name = "ISO-8859-6", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_6_alnum_char, + .alpha_char = pm_encoding_iso_8859_6_alpha_char, + .isupper_char = pm_encoding_iso_8859_6_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_7] = { + .name = "ISO-8859-7", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_7_alnum_char, + .alpha_char = pm_encoding_iso_8859_7_alpha_char, + .isupper_char = pm_encoding_iso_8859_7_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_8] = { + .name = "ISO-8859-8", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_8_alnum_char, + .alpha_char = pm_encoding_iso_8859_8_alpha_char, + .isupper_char = pm_encoding_iso_8859_8_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_9] = { + .name = "ISO-8859-9", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_9_alnum_char, + .alpha_char = pm_encoding_iso_8859_9_alpha_char, + .isupper_char = pm_encoding_iso_8859_9_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_10] = { + .name = "ISO-8859-10", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_10_alnum_char, + .alpha_char = pm_encoding_iso_8859_10_alpha_char, + .isupper_char = 
pm_encoding_iso_8859_10_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_11] = { + .name = "ISO-8859-11", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_11_alnum_char, + .alpha_char = pm_encoding_iso_8859_11_alpha_char, + .isupper_char = pm_encoding_iso_8859_11_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_13] = { + .name = "ISO-8859-13", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_13_alnum_char, + .alpha_char = pm_encoding_iso_8859_13_alpha_char, + .isupper_char = pm_encoding_iso_8859_13_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_14] = { + .name = "ISO-8859-14", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_14_alnum_char, + .alpha_char = pm_encoding_iso_8859_14_alpha_char, + .isupper_char = pm_encoding_iso_8859_14_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_15] = { + .name = "ISO-8859-15", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_15_alnum_char, + .alpha_char = pm_encoding_iso_8859_15_alpha_char, + .isupper_char = pm_encoding_iso_8859_15_isupper_char, + .multibyte = false + }, + [PM_ENCODING_ISO_8859_16] = { + .name = "ISO-8859-16", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_iso_8859_16_alnum_char, + .alpha_char = pm_encoding_iso_8859_16_alpha_char, + .isupper_char = pm_encoding_iso_8859_16_isupper_char, + .multibyte = false + }, + [PM_ENCODING_KOI8_R] = { + .name = "KOI8-R", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_koi8_r_alnum_char, + .alpha_char = pm_encoding_koi8_r_alpha_char, + .isupper_char = pm_encoding_koi8_r_isupper_char, + .multibyte = false + }, + [PM_ENCODING_KOI8_U] = { + .name = "KOI8-U", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_koi8_u_alnum_char, + .alpha_char = pm_encoding_koi8_u_alpha_char, + .isupper_char = 
pm_encoding_koi8_u_isupper_char, + .multibyte = false + }, + [PM_ENCODING_MAC_CENT_EURO] = { + .name = "macCentEuro", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_mac_cent_euro_alnum_char, + .alpha_char = pm_encoding_mac_cent_euro_alpha_char, + .isupper_char = pm_encoding_mac_cent_euro_isupper_char, + .multibyte = false + }, + [PM_ENCODING_MAC_CROATIAN] = { + .name = "macCroatian", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_mac_croatian_alnum_char, + .alpha_char = pm_encoding_mac_croatian_alpha_char, + .isupper_char = pm_encoding_mac_croatian_isupper_char, + .multibyte = false + }, + [PM_ENCODING_MAC_CYRILLIC] = { + .name = "macCyrillic", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_mac_cyrillic_alnum_char, + .alpha_char = pm_encoding_mac_cyrillic_alpha_char, + .isupper_char = pm_encoding_mac_cyrillic_isupper_char, + .multibyte = false + }, + [PM_ENCODING_MAC_GREEK] = { + .name = "macGreek", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_mac_greek_alnum_char, + .alpha_char = pm_encoding_mac_greek_alpha_char, + .isupper_char = pm_encoding_mac_greek_isupper_char, + .multibyte = false + }, + [PM_ENCODING_MAC_ICELAND] = { + .name = "macIceland", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_mac_iceland_alnum_char, + .alpha_char = pm_encoding_mac_iceland_alpha_char, + .isupper_char = pm_encoding_mac_iceland_isupper_char, + .multibyte = false + }, + [PM_ENCODING_MAC_JAPANESE] = { + .name = "MacJapanese", + .char_width = pm_encoding_shift_jis_char_width, + .alnum_char = pm_encoding_shift_jis_alnum_char, + .alpha_char = pm_encoding_shift_jis_alpha_char, + .isupper_char = pm_encoding_shift_jis_isupper_char, + .multibyte = true + }, + [PM_ENCODING_MAC_ROMAN] = { + .name = "macRoman", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_mac_roman_alnum_char, + .alpha_char = pm_encoding_mac_roman_alpha_char, 
+ .isupper_char = pm_encoding_mac_roman_isupper_char, + .multibyte = false + }, + [PM_ENCODING_MAC_ROMANIA] = { + .name = "macRomania", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_mac_romania_alnum_char, + .alpha_char = pm_encoding_mac_romania_alpha_char, + .isupper_char = pm_encoding_mac_romania_isupper_char, + .multibyte = false + }, + [PM_ENCODING_MAC_THAI] = { + .name = "macThai", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_mac_thai_alnum_char, + .alpha_char = pm_encoding_mac_thai_alpha_char, + .isupper_char = pm_encoding_mac_thai_isupper_char, + .multibyte = false + }, + [PM_ENCODING_MAC_TURKISH] = { + .name = "macTurkish", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_mac_turkish_alnum_char, + .alpha_char = pm_encoding_mac_turkish_alpha_char, + .isupper_char = pm_encoding_mac_turkish_isupper_char, + .multibyte = false + }, + [PM_ENCODING_MAC_UKRAINE] = { + .name = "macUkraine", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_mac_ukraine_alnum_char, + .alpha_char = pm_encoding_mac_ukraine_alpha_char, + .isupper_char = pm_encoding_mac_ukraine_isupper_char, + .multibyte = false + }, + [PM_ENCODING_SHIFT_JIS] = { + .name = "Shift_JIS", + .char_width = pm_encoding_shift_jis_char_width, + .alnum_char = pm_encoding_shift_jis_alnum_char, + .alpha_char = pm_encoding_shift_jis_alpha_char, + .isupper_char = pm_encoding_shift_jis_isupper_char, + .multibyte = true + }, + [PM_ENCODING_SJIS_DOCOMO] = { + .name = "SJIS-DoCoMo", + .char_width = pm_encoding_shift_jis_char_width, + .alnum_char = pm_encoding_shift_jis_alnum_char, + .alpha_char = pm_encoding_shift_jis_alpha_char, + .isupper_char = pm_encoding_shift_jis_isupper_char, + .multibyte = true + }, + [PM_ENCODING_SJIS_KDDI] = { + .name = "SJIS-KDDI", + .char_width = pm_encoding_shift_jis_char_width, + .alnum_char = pm_encoding_shift_jis_alnum_char, + .alpha_char = pm_encoding_shift_jis_alpha_char, + 
.isupper_char = pm_encoding_shift_jis_isupper_char, + .multibyte = true + }, + [PM_ENCODING_SJIS_SOFTBANK] = { + .name = "SJIS-SoftBank", + .char_width = pm_encoding_shift_jis_char_width, + .alnum_char = pm_encoding_shift_jis_alnum_char, + .alpha_char = pm_encoding_shift_jis_alpha_char, + .isupper_char = pm_encoding_shift_jis_isupper_char, + .multibyte = true + }, + [PM_ENCODING_STATELESS_ISO_2022_JP] = { + .name = "stateless-ISO-2022-JP", + .char_width = pm_encoding_emacs_mule_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_STATELESS_ISO_2022_JP_KDDI] = { + .name = "stateless-ISO-2022-JP-KDDI", + .char_width = pm_encoding_emacs_mule_char_width, + .alnum_char = pm_encoding_ascii_alnum_char_7bit, + .alpha_char = pm_encoding_ascii_alpha_char_7bit, + .isupper_char = pm_encoding_ascii_isupper_char_7bit, + .multibyte = true + }, + [PM_ENCODING_TIS_620] = { + .name = "TIS-620", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_tis_620_alnum_char, + .alpha_char = pm_encoding_tis_620_alpha_char, + .isupper_char = pm_encoding_tis_620_isupper_char, + .multibyte = false + }, + [PM_ENCODING_UTF8_MAC] = { + .name = "UTF8-MAC", + .char_width = pm_encoding_utf_8_char_width, + .alnum_char = pm_encoding_utf_8_alnum_char, + .alpha_char = pm_encoding_utf_8_alpha_char, + .isupper_char = pm_encoding_utf_8_isupper_char, + .multibyte = true + }, + [PM_ENCODING_UTF8_DOCOMO] = { + .name = "UTF8-DoCoMo", + .char_width = pm_encoding_utf_8_char_width, + .alnum_char = pm_encoding_utf_8_alnum_char, + .alpha_char = pm_encoding_utf_8_alpha_char, + .isupper_char = pm_encoding_utf_8_isupper_char, + .multibyte = true + }, + [PM_ENCODING_UTF8_KDDI] = { + .name = "UTF8-KDDI", + .char_width = pm_encoding_utf_8_char_width, + .alnum_char = pm_encoding_utf_8_alnum_char, + .alpha_char = pm_encoding_utf_8_alpha_char, + 
.isupper_char = pm_encoding_utf_8_isupper_char, + .multibyte = true + }, + [PM_ENCODING_UTF8_SOFTBANK] = { + .name = "UTF8-SoftBank", + .char_width = pm_encoding_utf_8_char_width, + .alnum_char = pm_encoding_utf_8_alnum_char, + .alpha_char = pm_encoding_utf_8_alpha_char, + .isupper_char = pm_encoding_utf_8_isupper_char, + .multibyte = true + }, + [PM_ENCODING_WINDOWS_1250] = { + .name = "Windows-1250", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_windows_1250_alnum_char, + .alpha_char = pm_encoding_windows_1250_alpha_char, + .isupper_char = pm_encoding_windows_1250_isupper_char, + .multibyte = false + }, + [PM_ENCODING_WINDOWS_1251] = { + .name = "Windows-1251", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_windows_1251_alnum_char, + .alpha_char = pm_encoding_windows_1251_alpha_char, + .isupper_char = pm_encoding_windows_1251_isupper_char, + .multibyte = false + }, + [PM_ENCODING_WINDOWS_1252] = { + .name = "Windows-1252", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_windows_1252_alnum_char, + .alpha_char = pm_encoding_windows_1252_alpha_char, + .isupper_char = pm_encoding_windows_1252_isupper_char, + .multibyte = false + }, + [PM_ENCODING_WINDOWS_1253] = { + .name = "Windows-1253", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_windows_1253_alnum_char, + .alpha_char = pm_encoding_windows_1253_alpha_char, + .isupper_char = pm_encoding_windows_1253_isupper_char, + .multibyte = false + }, + [PM_ENCODING_WINDOWS_1254] = { + .name = "Windows-1254", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_windows_1254_alnum_char, + .alpha_char = pm_encoding_windows_1254_alpha_char, + .isupper_char = pm_encoding_windows_1254_isupper_char, + .multibyte = false + }, + [PM_ENCODING_WINDOWS_1255] = { + .name = "Windows-1255", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_windows_1255_alnum_char, + .alpha_char = 
pm_encoding_windows_1255_alpha_char, + .isupper_char = pm_encoding_windows_1255_isupper_char, + .multibyte = false + }, + [PM_ENCODING_WINDOWS_1256] = { + .name = "Windows-1256", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_windows_1256_alnum_char, + .alpha_char = pm_encoding_windows_1256_alpha_char, + .isupper_char = pm_encoding_windows_1256_isupper_char, + .multibyte = false + }, + [PM_ENCODING_WINDOWS_1257] = { + .name = "Windows-1257", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_windows_1257_alnum_char, + .alpha_char = pm_encoding_windows_1257_alpha_char, + .isupper_char = pm_encoding_windows_1257_isupper_char, + .multibyte = false + }, + [PM_ENCODING_WINDOWS_1258] = { + .name = "Windows-1258", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_windows_1258_alnum_char, + .alpha_char = pm_encoding_windows_1258_alpha_char, + .isupper_char = pm_encoding_windows_1258_isupper_char, + .multibyte = false + }, + [PM_ENCODING_WINDOWS_874] = { + .name = "Windows-874", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_windows_874_alnum_char, + .alpha_char = pm_encoding_windows_874_alpha_char, + .isupper_char = pm_encoding_windows_874_isupper_char, + .multibyte = false + } +#endif +}; + +/** + * Parse the given name of an encoding and return a pointer to the corresponding + * encoding struct if one can be found, otherwise return NULL. + */ +const pm_encoding_t * +pm_encoding_find(const uint8_t *start, const uint8_t *end) { + size_t width = (size_t) (end - start); + + // First, we're going to check for UTF-8. This is the most common encoding. + // UTF-8 can contain extra information at the end about the platform it is + // encoded on, such as UTF-8-MAC or UTF-8-UNIX. We'll ignore those suffixes. 
+ if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "UTF-8", 5) == 0)) { +#ifndef PRISM_ENCODING_EXCLUDE_FULL + // We need to explicitly handle UTF-8-HFS, as that one needs to switch + // over to being UTF8-MAC. + if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-HFS", 4) == 0)) { + return &pm_encodings[PM_ENCODING_UTF8_MAC]; + } +#endif + + // Otherwise we'll return the default UTF-8 encoding. + return PM_ENCODING_UTF_8_ENTRY; + } + + // Next, we're going to loop through each of the encodings that we handle + // explicitly. If we found one that we understand, we'll use that value. +#define ENCODING1(name, encoding) if (width == sizeof(name) - 1 && pm_strncasecmp(start, (const uint8_t *) name, width) == 0) return &pm_encodings[encoding]; +#define ENCODING2(name1, name2, encoding) ENCODING1(name1, encoding) ENCODING1(name2, encoding) + + if (width >= 3) { + switch (*start) { + case 'A': case 'a': + ENCODING1("ASCII", PM_ENCODING_US_ASCII); + ENCODING1("ASCII-8BIT", PM_ENCODING_ASCII_8BIT); + ENCODING1("ANSI_X3.4-1968", PM_ENCODING_US_ASCII); + break; + case 'B': case 'b': + ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT); +#ifndef PRISM_ENCODING_EXCLUDE_FULL + ENCODING1("Big5", PM_ENCODING_BIG5); + ENCODING2("Big5-HKSCS", "Big5-HKSCS:2008", PM_ENCODING_BIG5_HKSCS); + ENCODING1("Big5-UAO", PM_ENCODING_BIG5_UAO); +#endif + break; + case 'C': case 'c': + ENCODING1("CP65001", PM_ENCODING_UTF_8); + ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J); +#ifndef PRISM_ENCODING_EXCLUDE_FULL + ENCODING1("CESU-8", PM_ENCODING_CESU_8); + ENCODING1("CP437", PM_ENCODING_IBM437); + ENCODING1("CP720", PM_ENCODING_IBM720); + ENCODING1("CP737", PM_ENCODING_IBM737); + ENCODING1("CP775", PM_ENCODING_IBM775); + ENCODING1("CP850", PM_ENCODING_CP850); + ENCODING1("CP852", PM_ENCODING_CP852); + ENCODING1("CP855", PM_ENCODING_CP855); + ENCODING1("CP857", PM_ENCODING_IBM857); + ENCODING1("CP860", PM_ENCODING_IBM860); + ENCODING1("CP861", 
PM_ENCODING_IBM861); + ENCODING1("CP862", PM_ENCODING_IBM862); + ENCODING1("CP864", PM_ENCODING_IBM864); + ENCODING1("CP865", PM_ENCODING_IBM865); + ENCODING1("CP866", PM_ENCODING_IBM866); + ENCODING1("CP869", PM_ENCODING_IBM869); + ENCODING1("CP874", PM_ENCODING_WINDOWS_874); + ENCODING1("CP878", PM_ENCODING_KOI8_R); + ENCODING1("CP863", PM_ENCODING_IBM863); + ENCODING1("CP936", PM_ENCODING_GBK); + ENCODING1("CP949", PM_ENCODING_CP949); + ENCODING1("CP950", PM_ENCODING_CP950); + ENCODING1("CP951", PM_ENCODING_CP951); + ENCODING1("CP1250", PM_ENCODING_WINDOWS_1250); + ENCODING1("CP1251", PM_ENCODING_WINDOWS_1251); + ENCODING1("CP1252", PM_ENCODING_WINDOWS_1252); + ENCODING1("CP1253", PM_ENCODING_WINDOWS_1253); + ENCODING1("CP1254", PM_ENCODING_WINDOWS_1254); + ENCODING1("CP1255", PM_ENCODING_WINDOWS_1255); + ENCODING1("CP1256", PM_ENCODING_WINDOWS_1256); + ENCODING1("CP1257", PM_ENCODING_WINDOWS_1257); + ENCODING1("CP1258", PM_ENCODING_WINDOWS_1258); + ENCODING1("CP51932", PM_ENCODING_CP51932); +#endif + break; + case 'E': case 'e': + ENCODING2("EUC-JP", "eucJP", PM_ENCODING_EUC_JP); +#ifndef PRISM_ENCODING_EXCLUDE_FULL + ENCODING2("eucJP-ms", "euc-jp-ms", PM_ENCODING_EUC_JP_MS); + ENCODING2("EUC-JIS-2004", "EUC-JISX0213", PM_ENCODING_EUC_JIS_2004); + ENCODING2("EUC-KR", "eucKR", PM_ENCODING_EUC_KR); + ENCODING2("EUC-CN", "eucCN", PM_ENCODING_GB2312); + ENCODING2("EUC-TW", "eucTW", PM_ENCODING_EUC_TW); + ENCODING1("Emacs-Mule", PM_ENCODING_EMACS_MULE); +#endif + break; + case 'G': case 'g': +#ifndef PRISM_ENCODING_EXCLUDE_FULL + ENCODING1("GBK", PM_ENCODING_GBK); + ENCODING1("GB12345", PM_ENCODING_GB12345); + ENCODING1("GB18030", PM_ENCODING_GB18030); + ENCODING1("GB1988", PM_ENCODING_GB1988); + ENCODING1("GB2312", PM_ENCODING_GB2312); +#endif + break; + case 'I': case 'i': +#ifndef PRISM_ENCODING_EXCLUDE_FULL + ENCODING1("IBM437", PM_ENCODING_IBM437); + ENCODING1("IBM720", PM_ENCODING_IBM720); + ENCODING1("IBM737", PM_ENCODING_IBM737); + ENCODING1("IBM775", 
PM_ENCODING_IBM775); + ENCODING1("IBM850", PM_ENCODING_CP850); + ENCODING1("IBM852", PM_ENCODING_IBM852); + ENCODING1("IBM855", PM_ENCODING_IBM855); + ENCODING1("IBM857", PM_ENCODING_IBM857); + ENCODING1("IBM860", PM_ENCODING_IBM860); + ENCODING1("IBM861", PM_ENCODING_IBM861); + ENCODING1("IBM862", PM_ENCODING_IBM862); + ENCODING1("IBM863", PM_ENCODING_IBM863); + ENCODING1("IBM864", PM_ENCODING_IBM864); + ENCODING1("IBM865", PM_ENCODING_IBM865); + ENCODING1("IBM866", PM_ENCODING_IBM866); + ENCODING1("IBM869", PM_ENCODING_IBM869); + ENCODING2("ISO-8859-1", "ISO8859-1", PM_ENCODING_ISO_8859_1); + ENCODING2("ISO-8859-2", "ISO8859-2", PM_ENCODING_ISO_8859_2); + ENCODING2("ISO-8859-3", "ISO8859-3", PM_ENCODING_ISO_8859_3); + ENCODING2("ISO-8859-4", "ISO8859-4", PM_ENCODING_ISO_8859_4); + ENCODING2("ISO-8859-5", "ISO8859-5", PM_ENCODING_ISO_8859_5); + ENCODING2("ISO-8859-6", "ISO8859-6", PM_ENCODING_ISO_8859_6); + ENCODING2("ISO-8859-7", "ISO8859-7", PM_ENCODING_ISO_8859_7); + ENCODING2("ISO-8859-8", "ISO8859-8", PM_ENCODING_ISO_8859_8); + ENCODING2("ISO-8859-9", "ISO8859-9", PM_ENCODING_ISO_8859_9); + ENCODING2("ISO-8859-10", "ISO8859-10", PM_ENCODING_ISO_8859_10); + ENCODING2("ISO-8859-11", "ISO8859-11", PM_ENCODING_ISO_8859_11); + ENCODING2("ISO-8859-13", "ISO8859-13", PM_ENCODING_ISO_8859_13); + ENCODING2("ISO-8859-14", "ISO8859-14", PM_ENCODING_ISO_8859_14); + ENCODING2("ISO-8859-15", "ISO8859-15", PM_ENCODING_ISO_8859_15); + ENCODING2("ISO-8859-16", "ISO8859-16", PM_ENCODING_ISO_8859_16); +#endif + break; + case 'K': case 'k': +#ifndef PRISM_ENCODING_EXCLUDE_FULL + ENCODING1("KOI8-R", PM_ENCODING_KOI8_R); + ENCODING1("KOI8-U", PM_ENCODING_KOI8_U); +#endif + break; + case 'M': case 'm': +#ifndef PRISM_ENCODING_EXCLUDE_FULL + ENCODING1("macCentEuro", PM_ENCODING_MAC_CENT_EURO); + ENCODING1("macCroatian", PM_ENCODING_MAC_CROATIAN); + ENCODING1("macCyrillic", PM_ENCODING_MAC_CYRILLIC); + ENCODING1("macGreek", PM_ENCODING_MAC_GREEK); + ENCODING1("macIceland", 
PM_ENCODING_MAC_ICELAND); + ENCODING1("MacJapanese", PM_ENCODING_MAC_JAPANESE); + ENCODING1("MacJapan", PM_ENCODING_MAC_JAPANESE); + ENCODING1("macRoman", PM_ENCODING_MAC_ROMAN); + ENCODING1("macRomania", PM_ENCODING_MAC_ROMANIA); + ENCODING1("macThai", PM_ENCODING_MAC_THAI); + ENCODING1("macTurkish", PM_ENCODING_MAC_TURKISH); + ENCODING1("macUkraine", PM_ENCODING_MAC_UKRAINE); +#endif + break; + case 'P': case 'p': + ENCODING1("PCK", PM_ENCODING_WINDOWS_31J); + break; + case 'S': case 's': + ENCODING1("SJIS", PM_ENCODING_WINDOWS_31J); +#ifndef PRISM_ENCODING_EXCLUDE_FULL + ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS); + ENCODING1("SJIS-DoCoMo", PM_ENCODING_SJIS_DOCOMO); + ENCODING1("SJIS-KDDI", PM_ENCODING_SJIS_KDDI); + ENCODING1("SJIS-SoftBank", PM_ENCODING_SJIS_SOFTBANK); + ENCODING1("stateless-ISO-2022-JP", PM_ENCODING_STATELESS_ISO_2022_JP); + ENCODING1("stateless-ISO-2022-JP-KDDI", PM_ENCODING_STATELESS_ISO_2022_JP_KDDI); +#endif + break; + case 'T': case 't': +#ifndef PRISM_ENCODING_EXCLUDE_FULL + ENCODING1("TIS-620", PM_ENCODING_TIS_620); +#endif + break; + case 'U': case 'u': + ENCODING1("US-ASCII", PM_ENCODING_US_ASCII); +#ifndef PRISM_ENCODING_EXCLUDE_FULL + ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC); + ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO); + ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI); + ENCODING1("UTF8-SoftBank", PM_ENCODING_UTF8_SOFTBANK); +#endif + break; + case 'W': case 'w': + ENCODING1("Windows-31J", PM_ENCODING_WINDOWS_31J); +#ifndef PRISM_ENCODING_EXCLUDE_FULL + ENCODING1("Windows-874", PM_ENCODING_WINDOWS_874); + ENCODING1("Windows-1250", PM_ENCODING_WINDOWS_1250); + ENCODING1("Windows-1251", PM_ENCODING_WINDOWS_1251); + ENCODING1("Windows-1252", PM_ENCODING_WINDOWS_1252); + ENCODING1("Windows-1253", PM_ENCODING_WINDOWS_1253); + ENCODING1("Windows-1254", PM_ENCODING_WINDOWS_1254); + ENCODING1("Windows-1255", PM_ENCODING_WINDOWS_1255); + ENCODING1("Windows-1256", PM_ENCODING_WINDOWS_1256); + 
ENCODING1("Windows-1257", PM_ENCODING_WINDOWS_1257); + ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258); +#endif + break; + case '6': + ENCODING1("646", PM_ENCODING_US_ASCII); + break; + } + } + +#undef ENCODING2 +#undef ENCODING1 + + // If we didn't match any encodings, return NULL. + return NULL; +} diff --git a/prism/encoding.h b/prism/encoding.h new file mode 100644 index 0000000000..5f7724821f --- /dev/null +++ b/prism/encoding.h @@ -0,0 +1,283 @@ +/** + * @file encoding.h + * + * The encoding interface and implementations used by the parser. + */ +#ifndef PRISM_ENCODING_H +#define PRISM_ENCODING_H + +#include "prism/defines.h" +#include "prism/util/pm_strncasecmp.h" + +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +/** + * This struct defines the functions necessary to implement the encoding + * interface so we can determine how many bytes the subsequent character takes. + * Each callback should return the number of bytes, or 0 if the next bytes are + * invalid for the encoding and type. + */ +typedef struct { + /** + * Return the number of bytes that the next character takes if it is valid + * in the encoding. Does not read more than n bytes. It is assumed that n is + * at least 1. + */ + size_t (*char_width)(const uint8_t *b, ptrdiff_t n); + + /** + * Return the number of bytes that the next character takes if it is valid + * in the encoding and is alphabetical. Does not read more than n bytes. It + * is assumed that n is at least 1. + */ + size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n); + + /** + * Return the number of bytes that the next character takes if it is valid + * in the encoding and is alphanumeric. Does not read more than n bytes. It + * is assumed that n is at least 1. + */ + size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n); + + /** + * Return true if the next character is valid in the encoding and is an + * uppercase character. Does not read more than n bytes. 
It is assumed that + * n is at least 1. + */ + bool (*isupper_char)(const uint8_t *b, ptrdiff_t n); + + /** + * The name of the encoding. This should correspond to a value that can be + * passed to Encoding.find in Ruby. + */ + const char *name; + + /** + * Return true if the encoding is a multibyte encoding. + */ + bool multibyte; +} pm_encoding_t; + +/** + * All of the lookup tables use the first bit of each embedded byte to indicate + * whether the codepoint is alphabetical. + */ +#define PRISM_ENCODING_ALPHABETIC_BIT 1 << 0 + +/** + * All of the lookup tables use the second bit of each embedded byte to indicate + * whether the codepoint is alphanumeric. + */ +#define PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1 + +/** + * All of the lookup tables use the third bit of each embedded byte to indicate + * whether the codepoint is uppercase. + */ +#define PRISM_ENCODING_UPPERCASE_BIT 1 << 2 + +/** + * Return the size of the next character in the UTF-8 encoding. + * + * @param b The bytes to read. + * @param n The number of bytes that can be read. + * @returns The number of bytes that the next character takes if it is valid in + * the encoding, or 0 if it is not. + */ +size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n); + +/** + * Return the size of the next character in the UTF-8 encoding if it is an + * alphabetical character. + * + * @param b The bytes to read. + * @param n The number of bytes that can be read. + * @returns The number of bytes that the next character takes if it is valid in + * the encoding, or 0 if it is not. + */ +size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n); + +/** + * Return the size of the next character in the UTF-8 encoding if it is an + * alphanumeric character. + * + * @param b The bytes to read. + * @param n The number of bytes that can be read. + * @returns The number of bytes that the next character takes if it is valid in + * the encoding, or 0 if it is not. 
+ */ +size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n); + +/** + * Return true if the next character in the UTF-8 encoding is an uppercase + * character. + * + * @param b The bytes to read. + * @param n The number of bytes that can be read. + * @returns True if the next character is valid in the encoding and is an + * uppercase character, or false if it is not. + */ +bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n); + +/** + * This lookup table is referenced in both the UTF-8 encoding file and the + * parser directly in order to speed up the default encoding processing. It is + * used to indicate whether a character is alphabetical, alphanumeric, or + * uppercase in unicode mappings. + */ +extern const uint8_t pm_encoding_unicode_table[256]; + +/** + * These are all of the encodings that prism supports. + */ +typedef enum { + PM_ENCODING_UTF_8 = 0, + PM_ENCODING_US_ASCII, + PM_ENCODING_ASCII_8BIT, + PM_ENCODING_EUC_JP, + PM_ENCODING_WINDOWS_31J, + +// We optionally support excluding the full set of encodings to only support the +// minimum necessary to process Ruby code without encoding comments. 
+#ifndef PRISM_ENCODING_EXCLUDE_FULL + PM_ENCODING_BIG5, + PM_ENCODING_BIG5_HKSCS, + PM_ENCODING_BIG5_UAO, + PM_ENCODING_CESU_8, + PM_ENCODING_CP51932, + PM_ENCODING_CP850, + PM_ENCODING_CP852, + PM_ENCODING_CP855, + PM_ENCODING_CP949, + PM_ENCODING_CP950, + PM_ENCODING_CP951, + PM_ENCODING_EMACS_MULE, + PM_ENCODING_EUC_JP_MS, + PM_ENCODING_EUC_JIS_2004, + PM_ENCODING_EUC_KR, + PM_ENCODING_EUC_TW, + PM_ENCODING_GB12345, + PM_ENCODING_GB18030, + PM_ENCODING_GB1988, + PM_ENCODING_GB2312, + PM_ENCODING_GBK, + PM_ENCODING_IBM437, + PM_ENCODING_IBM720, + PM_ENCODING_IBM737, + PM_ENCODING_IBM775, + PM_ENCODING_IBM852, + PM_ENCODING_IBM855, + PM_ENCODING_IBM857, + PM_ENCODING_IBM860, + PM_ENCODING_IBM861, + PM_ENCODING_IBM862, + PM_ENCODING_IBM863, + PM_ENCODING_IBM864, + PM_ENCODING_IBM865, + PM_ENCODING_IBM866, + PM_ENCODING_IBM869, + PM_ENCODING_ISO_8859_1, + PM_ENCODING_ISO_8859_2, + PM_ENCODING_ISO_8859_3, + PM_ENCODING_ISO_8859_4, + PM_ENCODING_ISO_8859_5, + PM_ENCODING_ISO_8859_6, + PM_ENCODING_ISO_8859_7, + PM_ENCODING_ISO_8859_8, + PM_ENCODING_ISO_8859_9, + PM_ENCODING_ISO_8859_10, + PM_ENCODING_ISO_8859_11, + PM_ENCODING_ISO_8859_13, + PM_ENCODING_ISO_8859_14, + PM_ENCODING_ISO_8859_15, + PM_ENCODING_ISO_8859_16, + PM_ENCODING_KOI8_R, + PM_ENCODING_KOI8_U, + PM_ENCODING_MAC_CENT_EURO, + PM_ENCODING_MAC_CROATIAN, + PM_ENCODING_MAC_CYRILLIC, + PM_ENCODING_MAC_GREEK, + PM_ENCODING_MAC_ICELAND, + PM_ENCODING_MAC_JAPANESE, + PM_ENCODING_MAC_ROMAN, + PM_ENCODING_MAC_ROMANIA, + PM_ENCODING_MAC_THAI, + PM_ENCODING_MAC_TURKISH, + PM_ENCODING_MAC_UKRAINE, + PM_ENCODING_SHIFT_JIS, + PM_ENCODING_SJIS_DOCOMO, + PM_ENCODING_SJIS_KDDI, + PM_ENCODING_SJIS_SOFTBANK, + PM_ENCODING_STATELESS_ISO_2022_JP, + PM_ENCODING_STATELESS_ISO_2022_JP_KDDI, + PM_ENCODING_TIS_620, + PM_ENCODING_UTF8_MAC, + PM_ENCODING_UTF8_DOCOMO, + PM_ENCODING_UTF8_KDDI, + PM_ENCODING_UTF8_SOFTBANK, + PM_ENCODING_WINDOWS_1250, + PM_ENCODING_WINDOWS_1251, + PM_ENCODING_WINDOWS_1252, + PM_ENCODING_WINDOWS_1253, 
+ PM_ENCODING_WINDOWS_1254, + PM_ENCODING_WINDOWS_1255, + PM_ENCODING_WINDOWS_1256, + PM_ENCODING_WINDOWS_1257, + PM_ENCODING_WINDOWS_1258, + PM_ENCODING_WINDOWS_874, +#endif + + PM_ENCODING_MAXIMUM +} pm_encoding_type_t; + +/** + * This is the table of all of the encodings that prism supports. + */ +extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM]; + +/** + * This is the default UTF-8 encoding. We need a reference to it to quickly + * create parsers. + */ +#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8]) + +/** + * This is the US-ASCII encoding. We need a reference to it to be able to + * compare against it when a string is being created because it could possibly + * need to fall back to ASCII-8BIT. + */ +#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII]) + +/** + * This is the ASCII-8BIT encoding. We need a reference to it so that pm_strpbrk + * can compare against it because invalid multibyte characters are not a thing + * in this encoding. It is also needed for handling Regexp encoding flags. + */ +#define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT]) + +/** + * This is the EUC-JP encoding. We need a reference to it to quickly process + * regular expression modifiers. + */ +#define PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP]) + +/** + * This is the Windows-31J encoding. We need a reference to it to quickly + * process regular expression modifiers. + */ +#define PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J]) + +/** + * Parse the given name of an encoding and return a pointer to the corresponding + * encoding struct if one can be found, otherwise return NULL. + * + * @param start A pointer to the first byte of the name. + * @param end A pointer to one past the last byte of the name. + * @returns A pointer to the encoding struct if one is found, otherwise NULL. 
+ */ +const pm_encoding_t * pm_encoding_find(const uint8_t *start, const uint8_t *end); + +#endif diff --git a/prism/extension.c b/prism/extension.c new file mode 100644 index 0000000000..71c2d91b98 --- /dev/null +++ b/prism/extension.c @@ -0,0 +1,1427 @@ +#include "prism/extension.h" + +#ifdef _WIN32 +#include <ruby/win32.h> +#endif + +// NOTE: this file should contain only bindings. All non-trivial logic should be +// in libprism so it can be shared its the various callers. + +VALUE rb_cPrism; +VALUE rb_cPrismNode; +VALUE rb_cPrismSource; +VALUE rb_cPrismToken; +VALUE rb_cPrismLocation; + +VALUE rb_cPrismComment; +VALUE rb_cPrismInlineComment; +VALUE rb_cPrismEmbDocComment; +VALUE rb_cPrismMagicComment; +VALUE rb_cPrismParseError; +VALUE rb_cPrismParseWarning; +VALUE rb_cPrismResult; +VALUE rb_cPrismParseResult; +VALUE rb_cPrismLexResult; +VALUE rb_cPrismParseLexResult; +VALUE rb_cPrismStringQuery; +VALUE rb_cPrismScope; +VALUE rb_cPrismCurrentVersionError; + +VALUE rb_cPrismDebugEncoding; + +ID rb_id_option_command_line; +ID rb_id_option_encoding; +ID rb_id_option_filepath; +ID rb_id_option_freeze; +ID rb_id_option_frozen_string_literal; +ID rb_id_option_line; +ID rb_id_option_main_script; +ID rb_id_option_partial_script; +ID rb_id_option_scopes; +ID rb_id_option_version; +ID rb_id_source_for; +ID rb_id_forwarding_positionals; +ID rb_id_forwarding_keywords; +ID rb_id_forwarding_block; +ID rb_id_forwarding_all; + +/******************************************************************************/ +/* IO of Ruby code */ +/******************************************************************************/ + +/** + * Check if the given VALUE is a string. If it's not a string, then raise a + * TypeError. Otherwise return the VALUE as a C string. + */ +static const char * +check_string(VALUE value) { + // Check if the value is a string. If it's not, then raise a type error. 
+ if (!RB_TYPE_P(value, T_STRING)) { + rb_raise(rb_eTypeError, "wrong argument type %" PRIsVALUE " (expected String)", rb_obj_class(value)); + } + + // Otherwise, return the value as a C string. + return RSTRING_PTR(value); +} + +/** + * Load the contents and size of the given string into the given pm_string_t. + */ +static void +input_load_string(pm_string_t *input, VALUE string) { + // Check if the string is a string. If it's not, then raise a type error. + if (!RB_TYPE_P(string, T_STRING)) { + rb_raise(rb_eTypeError, "wrong argument type %" PRIsVALUE " (expected String)", rb_obj_class(string)); + } + + pm_string_constant_init(input, RSTRING_PTR(string), RSTRING_LEN(string)); +} + +/******************************************************************************/ +/* Building C options from Ruby options */ +/******************************************************************************/ + +/** + * Build the scopes associated with the provided Ruby keyword value. + */ +static void +build_options_scopes(pm_options_t *options, VALUE scopes) { + // Check if the value is an array. If it's not, then raise a type error. + if (!RB_TYPE_P(scopes, T_ARRAY)) { + rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected Array)", rb_obj_class(scopes)); + } + + // Initialize the scopes array. + size_t scopes_count = RARRAY_LEN(scopes); + if (!pm_options_scopes_init(options, scopes_count)) { + rb_raise(rb_eNoMemError, "failed to allocate memory"); + } + + // Iterate over the scopes and add them to the options. + for (size_t scope_index = 0; scope_index < scopes_count; scope_index++) { + VALUE scope = rb_ary_entry(scopes, scope_index); + + // The scope can be either an array or it can be a Prism::Scope object. + // Parse out the correct values here from either. 
+ VALUE locals; + uint8_t forwarding = PM_OPTIONS_SCOPE_FORWARDING_NONE; + + if (RB_TYPE_P(scope, T_ARRAY)) { + locals = scope; + } else if (rb_obj_is_kind_of(scope, rb_cPrismScope)) { + locals = rb_ivar_get(scope, rb_intern("@locals")); + if (!RB_TYPE_P(locals, T_ARRAY)) { + rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected Array)", rb_obj_class(locals)); + } + + VALUE names = rb_ivar_get(scope, rb_intern("@forwarding")); + if (!RB_TYPE_P(names, T_ARRAY)) { + rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected Array)", rb_obj_class(names)); + } + + size_t names_count = RARRAY_LEN(names); + for (size_t name_index = 0; name_index < names_count; name_index++) { + VALUE name = rb_ary_entry(names, name_index); + + // Check that the name is a symbol. If it's not, then raise + // a type error. + if (!RB_TYPE_P(name, T_SYMBOL)) { + rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected Symbol)", rb_obj_class(name)); + } + + ID id = SYM2ID(name); + if (id == rb_id_forwarding_positionals) { + forwarding |= PM_OPTIONS_SCOPE_FORWARDING_POSITIONALS; + } else if (id == rb_id_forwarding_keywords) { + forwarding |= PM_OPTIONS_SCOPE_FORWARDING_KEYWORDS; + } else if (id == rb_id_forwarding_block) { + forwarding |= PM_OPTIONS_SCOPE_FORWARDING_BLOCK; + } else if (id == rb_id_forwarding_all) { + forwarding |= PM_OPTIONS_SCOPE_FORWARDING_ALL; + } else { + rb_raise(rb_eArgError, "invalid forwarding value: %" PRIsVALUE, name); + } + } + } else { + rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected Array or Prism::Scope)", rb_obj_class(scope)); + } + + // Initialize the scope array. + size_t locals_count = RARRAY_LEN(locals); + pm_options_scope_t *options_scope = &options->scopes[scope_index]; + if (!pm_options_scope_init(options_scope, locals_count)) { + rb_raise(rb_eNoMemError, "failed to allocate memory"); + } + + // Iterate over the locals and add them to the scope. 
+ for (size_t local_index = 0; local_index < locals_count; local_index++) { + VALUE local = rb_ary_entry(locals, local_index); + + // Check that the local is a symbol. If it's not, then raise a + // type error. + if (!RB_TYPE_P(local, T_SYMBOL)) { + rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected Symbol)", rb_obj_class(local)); + } + + // Add the local to the scope. + pm_string_t *scope_local = &options_scope->locals[local_index]; + const char *name = rb_id2name(SYM2ID(local)); + pm_string_constant_init(scope_local, name, strlen(name)); + } + + // Now set the forwarding options. + pm_options_scope_forwarding_set(options_scope, forwarding); + } +} + +/** + * An iterator function that is called for each key-value in the keywords hash. + */ +static int +build_options_i(VALUE key, VALUE value, VALUE argument) { + pm_options_t *options = (pm_options_t *) argument; + ID key_id = SYM2ID(key); + + if (key_id == rb_id_option_filepath) { + if (!NIL_P(value)) pm_options_filepath_set(options, check_string(value)); + } else if (key_id == rb_id_option_encoding) { + if (!NIL_P(value)) { + if (value == Qfalse) { + pm_options_encoding_locked_set(options, true); + } else { + pm_options_encoding_set(options, rb_enc_name(rb_to_encoding(value))); + } + } + } else if (key_id == rb_id_option_line) { + if (!NIL_P(value)) pm_options_line_set(options, NUM2INT(value)); + } else if (key_id == rb_id_option_frozen_string_literal) { + if (!NIL_P(value)) pm_options_frozen_string_literal_set(options, RTEST(value)); + } else if (key_id == rb_id_option_version) { + if (!NIL_P(value)) { + const char *version = check_string(value); + + if (RSTRING_LEN(value) == 7 && strncmp(version, "current", 7) == 0) { + const char *current_version = RSTRING_PTR(rb_const_get(rb_cObject, rb_intern("RUBY_VERSION"))); + if (!pm_options_version_set(options, current_version, 3)) { + rb_exc_raise(rb_exc_new_cstr(rb_cPrismCurrentVersionError, current_version)); + } + } else if 
(!pm_options_version_set(options, version, RSTRING_LEN(value))) { + rb_raise(rb_eArgError, "invalid version: %" PRIsVALUE, value); + } + } + } else if (key_id == rb_id_option_scopes) { + if (!NIL_P(value)) build_options_scopes(options, value); + } else if (key_id == rb_id_option_command_line) { + if (!NIL_P(value)) { + const char *string = check_string(value); + uint8_t command_line = 0; + + for (size_t index = 0; index < strlen(string); index++) { + switch (string[index]) { + case 'a': command_line |= PM_OPTIONS_COMMAND_LINE_A; break; + case 'e': command_line |= PM_OPTIONS_COMMAND_LINE_E; break; + case 'l': command_line |= PM_OPTIONS_COMMAND_LINE_L; break; + case 'n': command_line |= PM_OPTIONS_COMMAND_LINE_N; break; + case 'p': command_line |= PM_OPTIONS_COMMAND_LINE_P; break; + case 'x': command_line |= PM_OPTIONS_COMMAND_LINE_X; break; + default: rb_raise(rb_eArgError, "invalid command line flag: '%c'", string[index]); break; + } + } + + pm_options_command_line_set(options, command_line); + } + } else if (key_id == rb_id_option_main_script) { + if (!NIL_P(value)) pm_options_main_script_set(options, RTEST(value)); + } else if (key_id == rb_id_option_partial_script) { + if (!NIL_P(value)) pm_options_partial_script_set(options, RTEST(value)); + } else if (key_id == rb_id_option_freeze) { + if (!NIL_P(value)) pm_options_freeze_set(options, RTEST(value)); + } else { + rb_raise(rb_eArgError, "unknown keyword: %" PRIsVALUE, key); + } + + return ST_CONTINUE; +} + +/** + * We need a struct here to pass through rb_protect and it has to be a single + * value. Because the sizeof(VALUE) == sizeof(void *), we're going to pass this + * through as an opaque pointer and cast it on both sides. + */ +struct build_options_data { + pm_options_t *options; + VALUE keywords; +}; + +/** + * Build the set of options from the given keywords. Note that this can raise a + * Ruby error if the options are not valid. 
+ */ +static VALUE +build_options(VALUE argument) { + struct build_options_data *data = (struct build_options_data *) argument; + rb_hash_foreach(data->keywords, build_options_i, (VALUE) data->options); + return Qnil; +} + +/** + * Extract the options from the given keyword arguments. + */ +static void +extract_options(pm_options_t *options, VALUE filepath, VALUE keywords) { + options->line = 1; // default + + if (!NIL_P(keywords)) { + struct build_options_data data = { .options = options, .keywords = keywords }; + struct build_options_data *argument = &data; + + int state = 0; + rb_protect(build_options, (VALUE) argument, &state); + + if (state != 0) { + pm_options_free(options); + rb_jump_tag(state); + } + } + + if (!NIL_P(filepath)) { + if (!RB_TYPE_P(filepath, T_STRING)) { + pm_options_free(options); + rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected String)", rb_obj_class(filepath)); + } + + pm_options_filepath_set(options, RSTRING_PTR(filepath)); + } +} + +/** + * Read options for methods that look like (source, **options). + */ +static void +string_options(int argc, VALUE *argv, pm_string_t *input, pm_options_t *options) { + VALUE string; + VALUE keywords; + rb_scan_args(argc, argv, "1:", &string, &keywords); + + extract_options(options, Qnil, keywords); + input_load_string(input, string); +} + +/** + * Read options for methods that look like (filepath, **options). 
+ */ +static void +file_options(int argc, VALUE *argv, pm_string_t *input, pm_options_t *options, VALUE *encoded_filepath) { + VALUE filepath; + VALUE keywords; + rb_scan_args(argc, argv, "1:", &filepath, &keywords); + + Check_Type(filepath, T_STRING); + *encoded_filepath = rb_str_encode_ospath(filepath); + extract_options(options, *encoded_filepath, keywords); + + const char *source = (const char *) pm_string_source(&options->filepath); + pm_string_init_result_t result; + + switch (result = pm_string_file_init(input, source)) { + case PM_STRING_INIT_SUCCESS: + break; + case PM_STRING_INIT_ERROR_GENERIC: { + pm_options_free(options); + +#ifdef _WIN32 + int e = rb_w32_map_errno(GetLastError()); +#else + int e = errno; +#endif + + rb_syserr_fail(e, source); + break; + } + case PM_STRING_INIT_ERROR_DIRECTORY: + pm_options_free(options); + rb_syserr_fail(EISDIR, source); + break; + default: + pm_options_free(options); + rb_raise(rb_eRuntimeError, "Unknown error (%d) initializing file: %s", result, source); + break; + } +} + +#ifndef PRISM_EXCLUDE_SERIALIZATION + +/******************************************************************************/ +/* Serializing the AST */ +/******************************************************************************/ + +/** + * Dump the AST corresponding to the given input to a string. 
+ */ +static VALUE +dump_input(pm_string_t *input, const pm_options_t *options) { + pm_buffer_t buffer; + if (!pm_buffer_init(&buffer)) { + rb_raise(rb_eNoMemError, "failed to allocate memory"); + } + + pm_parser_t parser; + pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); + + pm_node_t *node = pm_parse(&parser); + pm_serialize(&parser, node, &buffer); + + VALUE result = rb_str_new(pm_buffer_value(&buffer), pm_buffer_length(&buffer)); + pm_node_destroy(&parser, node); + pm_buffer_free(&buffer); + pm_parser_free(&parser); + + return result; +} + +/** + * call-seq: + * Prism::dump(source, **options) -> String + * + * Dump the AST corresponding to the given string to a string. For supported + * options, see Prism::parse. + */ +static VALUE +dump(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + string_options(argc, argv, &input, &options); + +#ifdef PRISM_BUILD_DEBUG + size_t length = pm_string_length(&input); + char* dup = xmalloc(length); + memcpy(dup, pm_string_source(&input), length); + pm_string_constant_init(&input, dup, length); +#endif + + VALUE value = dump_input(&input, &options); + if (options.freeze) rb_obj_freeze(value); + +#ifdef PRISM_BUILD_DEBUG + xfree(dup); +#endif + + pm_string_free(&input); + pm_options_free(&options); + + return value; +} + +/** + * call-seq: + * Prism::dump_file(filepath, **options) -> String + * + * Dump the AST corresponding to the given file to a string. For supported + * options, see Prism::parse. 
+ */ +static VALUE +dump_file(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); + + VALUE value = dump_input(&input, &options); + pm_string_free(&input); + pm_options_free(&options); + + return value; +} + +#endif + +/******************************************************************************/ +/* Extracting values for the parse result */ +/******************************************************************************/ + +/** + * The same as rb_class_new_instance, but accepts an additional boolean to + * indicate whether or not the resulting class instance should be frozen. + */ +static inline VALUE +rb_class_new_instance_freeze(int argc, const VALUE *argv, VALUE klass, bool freeze) { + VALUE value = rb_class_new_instance(argc, argv, klass); + if (freeze) rb_obj_freeze(value); + return value; +} + +/** + * Create a new Location instance from the given parser and bounds. + */ +static inline VALUE +parser_location(const pm_parser_t *parser, VALUE source, bool freeze, const uint8_t *start, size_t length) { + VALUE argv[] = { source, LONG2FIX(start - parser->start), LONG2FIX(length) }; + return rb_class_new_instance_freeze(3, argv, rb_cPrismLocation, freeze); +} + +/** + * Create a new Location instance from the given parser and location. + */ +#define PARSER_LOCATION_LOC(parser, source, freeze, loc) \ + parser_location(parser, source, freeze, loc.start, (size_t) (loc.end - loc.start)) + +/** + * Build a new Comment instance from the given parser and comment. + */ +static inline VALUE +parser_comment(const pm_parser_t *parser, VALUE source, bool freeze, const pm_comment_t *comment) { + VALUE argv[] = { PARSER_LOCATION_LOC(parser, source, freeze, comment->location) }; + VALUE type = (comment->type == PM_COMMENT_EMBDOC) ? 
rb_cPrismEmbDocComment : rb_cPrismInlineComment; + return rb_class_new_instance_freeze(1, argv, type, freeze); +} + +/** + * Extract the comments out of the parser into an array. + */ +static VALUE +parser_comments(const pm_parser_t *parser, VALUE source, bool freeze) { + VALUE comments = rb_ary_new_capa(parser->comment_list.size); + + for ( + const pm_comment_t *comment = (const pm_comment_t *) parser->comment_list.head; + comment != NULL; + comment = (const pm_comment_t *) comment->node.next + ) { + VALUE value = parser_comment(parser, source, freeze, comment); + rb_ary_push(comments, value); + } + + if (freeze) rb_obj_freeze(comments); + return comments; +} + +/** + * Build a new MagicComment instance from the given parser and magic comment. + */ +static inline VALUE +parser_magic_comment(const pm_parser_t *parser, VALUE source, bool freeze, const pm_magic_comment_t *magic_comment) { + VALUE key_loc = parser_location(parser, source, freeze, magic_comment->key_start, magic_comment->key_length); + VALUE value_loc = parser_location(parser, source, freeze, magic_comment->value_start, magic_comment->value_length); + VALUE argv[] = { key_loc, value_loc }; + return rb_class_new_instance_freeze(2, argv, rb_cPrismMagicComment, freeze); +} + +/** + * Extract the magic comments out of the parser into an array. 
+ */ +static VALUE +parser_magic_comments(const pm_parser_t *parser, VALUE source, bool freeze) { + VALUE magic_comments = rb_ary_new_capa(parser->magic_comment_list.size); + + for ( + const pm_magic_comment_t *magic_comment = (const pm_magic_comment_t *) parser->magic_comment_list.head; + magic_comment != NULL; + magic_comment = (const pm_magic_comment_t *) magic_comment->node.next + ) { + VALUE value = parser_magic_comment(parser, source, freeze, magic_comment); + rb_ary_push(magic_comments, value); + } + + if (freeze) rb_obj_freeze(magic_comments); + return magic_comments; +} + +/** + * Extract out the data location from the parser into a Location instance if one + * exists. + */ +static VALUE +parser_data_loc(const pm_parser_t *parser, VALUE source, bool freeze) { + if (parser->data_loc.end == NULL) { + return Qnil; + } else { + return PARSER_LOCATION_LOC(parser, source, freeze, parser->data_loc); + } +} + +/** + * Extract the errors out of the parser into an array. + */ +static VALUE +parser_errors(const pm_parser_t *parser, rb_encoding *encoding, VALUE source, bool freeze) { + VALUE errors = rb_ary_new_capa(parser->error_list.size); + + for ( + const pm_diagnostic_t *error = (const pm_diagnostic_t *) parser->error_list.head; + error != NULL; + error = (const pm_diagnostic_t *) error->node.next + ) { + VALUE type = ID2SYM(rb_intern(pm_diagnostic_id_human(error->diag_id))); + VALUE message = rb_obj_freeze(rb_enc_str_new_cstr(error->message, encoding)); + VALUE location = PARSER_LOCATION_LOC(parser, source, freeze, error->location); + + VALUE level = Qnil; + switch (error->level) { + case PM_ERROR_LEVEL_SYNTAX: + level = ID2SYM(rb_intern("syntax")); + break; + case PM_ERROR_LEVEL_ARGUMENT: + level = ID2SYM(rb_intern("argument")); + break; + case PM_ERROR_LEVEL_LOAD: + level = ID2SYM(rb_intern("load")); + break; + default: + rb_raise(rb_eRuntimeError, "Unknown level: %" PRIu8, error->level); + } + + VALUE argv[] = { type, message, location, level }; + VALUE value 
= rb_class_new_instance_freeze(4, argv, rb_cPrismParseError, freeze); + rb_ary_push(errors, value); + } + + if (freeze) rb_obj_freeze(errors); + return errors; +} + +/** + * Extract the warnings out of the parser into an array. + */ +static VALUE +parser_warnings(const pm_parser_t *parser, rb_encoding *encoding, VALUE source, bool freeze) { + VALUE warnings = rb_ary_new_capa(parser->warning_list.size); + + for ( + const pm_diagnostic_t *warning = (const pm_diagnostic_t *) parser->warning_list.head; + warning != NULL; + warning = (const pm_diagnostic_t *) warning->node.next + ) { + VALUE type = ID2SYM(rb_intern(pm_diagnostic_id_human(warning->diag_id))); + VALUE message = rb_obj_freeze(rb_enc_str_new_cstr(warning->message, encoding)); + VALUE location = PARSER_LOCATION_LOC(parser, source, freeze, warning->location); + + VALUE level = Qnil; + switch (warning->level) { + case PM_WARNING_LEVEL_DEFAULT: + level = ID2SYM(rb_intern("default")); + break; + case PM_WARNING_LEVEL_VERBOSE: + level = ID2SYM(rb_intern("verbose")); + break; + default: + rb_raise(rb_eRuntimeError, "Unknown level: %" PRIu8, warning->level); + } + + VALUE argv[] = { type, message, location, level }; + VALUE value = rb_class_new_instance_freeze(4, argv, rb_cPrismParseWarning, freeze); + rb_ary_push(warnings, value); + } + + if (freeze) rb_obj_freeze(warnings); + return warnings; +} + +/** + * Create a new parse result from the given parser, value, encoding, and source. 
+ */ +static VALUE +parse_result_create(VALUE class, const pm_parser_t *parser, VALUE value, rb_encoding *encoding, VALUE source, bool freeze) { + VALUE result_argv[] = { + value, + parser_comments(parser, source, freeze), + parser_magic_comments(parser, source, freeze), + parser_data_loc(parser, source, freeze), + parser_errors(parser, encoding, source, freeze), + parser_warnings(parser, encoding, source, freeze), + source + }; + + return rb_class_new_instance_freeze(7, result_argv, class, freeze); +} + +/******************************************************************************/ +/* Lexing Ruby code */ +/******************************************************************************/ + +/** + * This struct gets stored in the parser and passed in to the lex callback any + * time a new token is found. We use it to store the necessary information to + * initialize a Token instance. + */ +typedef struct { + VALUE source; + VALUE tokens; + rb_encoding *encoding; + bool freeze; +} parse_lex_data_t; + +/** + * This is passed as a callback to the parser. It gets called every time a new + * token is found. Once found, we initialize a new instance of Token and push it + * onto the tokens array. + */ +static void +parse_lex_token(void *data, pm_parser_t *parser, pm_token_t *token) { + parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data; + + VALUE value = pm_token_new(parser, token, parse_lex_data->encoding, parse_lex_data->source, parse_lex_data->freeze); + VALUE yields = rb_assoc_new(value, INT2FIX(parser->lex_state)); + + if (parse_lex_data->freeze) { + rb_obj_freeze(value); + rb_obj_freeze(yields); + } + + rb_ary_push(parse_lex_data->tokens, yields); +} + +/** + * This is called whenever the encoding changes based on the magic comment at + * the top of the file. We use it to update the encoding that we are using to + * create tokens. 
+ */ +static void +parse_lex_encoding_changed_callback(pm_parser_t *parser) { + parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data; + parse_lex_data->encoding = rb_enc_find(parser->encoding->name); + + // Since the encoding changed, we need to go back and change the encoding of + // the tokens that were already lexed. This is only going to end up being + // one or two tokens, since the encoding can only change at the top of the + // file. + VALUE tokens = parse_lex_data->tokens; + VALUE next_tokens = rb_ary_new(); + + for (long index = 0; index < RARRAY_LEN(tokens); index++) { + VALUE yields = rb_ary_entry(tokens, index); + VALUE token = rb_ary_entry(yields, 0); + + VALUE value = rb_ivar_get(token, rb_intern("@value")); + VALUE next_value = rb_str_dup(value); + + rb_enc_associate(next_value, parse_lex_data->encoding); + if (parse_lex_data->freeze) rb_obj_freeze(next_value); + + VALUE next_token_argv[] = { + parse_lex_data->source, + rb_ivar_get(token, rb_intern("@type")), + next_value, + rb_ivar_get(token, rb_intern("@location")) + }; + + VALUE next_token = rb_class_new_instance(4, next_token_argv, rb_cPrismToken); + VALUE next_yields = rb_assoc_new(next_token, rb_ary_entry(yields, 1)); + + if (parse_lex_data->freeze) { + rb_obj_freeze(next_token); + rb_obj_freeze(next_yields); + } + + rb_ary_push(next_tokens, next_yields); + } + + rb_ary_replace(parse_lex_data->tokens, next_tokens); +} + +/** + * Parse the given input and return a ParseResult containing just the tokens or + * the nodes and tokens. 
+ */ +static VALUE +parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nodes) { + pm_parser_t parser; + pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); + pm_parser_register_encoding_changed_callback(&parser, parse_lex_encoding_changed_callback); + + VALUE source_string = rb_str_new((const char *) pm_string_source(input), pm_string_length(input)); + VALUE offsets = rb_ary_new_capa(parser.newline_list.size); + VALUE source = rb_funcall(rb_cPrismSource, rb_id_source_for, 3, source_string, LONG2NUM(parser.start_line), offsets); + + parse_lex_data_t parse_lex_data = { + .source = source, + .tokens = rb_ary_new(), + .encoding = rb_utf8_encoding(), + .freeze = options->freeze, + }; + + parse_lex_data_t *data = &parse_lex_data; + pm_lex_callback_t lex_callback = (pm_lex_callback_t) { + .data = (void *) data, + .callback = parse_lex_token, + }; + + parser.lex_callback = &lex_callback; + pm_node_t *node = pm_parse(&parser); + + // Here we need to update the Source object to have the correct + // encoding for the source string and the correct newline offsets. + // We do it here because we've already created the Source object and given + // it over to all of the tokens, and both of these are only set after pm_parse(). 
+ rb_encoding *encoding = rb_enc_find(parser.encoding->name); + rb_enc_associate(source_string, encoding); + + for (size_t index = 0; index < parser.newline_list.size; index++) { + rb_ary_push(offsets, ULONG2NUM(parser.newline_list.offsets[index])); + } + + if (options->freeze) { + rb_obj_freeze(source_string); + rb_obj_freeze(offsets); + rb_obj_freeze(source); + rb_obj_freeze(parse_lex_data.tokens); + } + + VALUE result; + if (return_nodes) { + VALUE value = rb_ary_new_capa(2); + rb_ary_push(value, pm_ast_new(&parser, node, parse_lex_data.encoding, source, options->freeze)); + rb_ary_push(value, parse_lex_data.tokens); + if (options->freeze) rb_obj_freeze(value); + result = parse_result_create(rb_cPrismParseLexResult, &parser, value, parse_lex_data.encoding, source, options->freeze); + } else { + result = parse_result_create(rb_cPrismLexResult, &parser, parse_lex_data.tokens, parse_lex_data.encoding, source, options->freeze); + } + + pm_node_destroy(&parser, node); + pm_parser_free(&parser); + + return result; +} + +/** + * call-seq: + * Prism::lex(source, **options) -> LexResult + * + * Return a LexResult instance that contains an array of Token instances + * corresponding to the given string. For supported options, see Prism::parse. + */ +static VALUE +lex(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + string_options(argc, argv, &input, &options); + + VALUE result = parse_lex_input(&input, &options, false); + pm_string_free(&input); + pm_options_free(&options); + + return result; +} + +/** + * call-seq: + * Prism::lex_file(filepath, **options) -> LexResult + * + * Return a LexResult instance that contains an array of Token instances + * corresponding to the given file. For supported options, see Prism::parse. 
+ */ +static VALUE +lex_file(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); + + VALUE value = parse_lex_input(&input, &options, false); + pm_string_free(&input); + pm_options_free(&options); + + return value; +} + +/******************************************************************************/ +/* Parsing Ruby code */ +/******************************************************************************/ + +/** + * Parse the given input and return a ParseResult instance. + */ +static VALUE +parse_input(pm_string_t *input, const pm_options_t *options) { + pm_parser_t parser; + pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); + + pm_node_t *node = pm_parse(&parser); + rb_encoding *encoding = rb_enc_find(parser.encoding->name); + + VALUE source = pm_source_new(&parser, encoding, options->freeze); + VALUE value = pm_ast_new(&parser, node, encoding, source, options->freeze); + VALUE result = parse_result_create(rb_cPrismParseResult, &parser, value, encoding, source, options->freeze); + + if (options->freeze) { + rb_obj_freeze(source); + } + + pm_node_destroy(&parser, node); + pm_parser_free(&parser); + + return result; +} + +/** + * call-seq: + * Prism::parse(source, **options) -> ParseResult + * + * Parse the given string and return a ParseResult instance. The options that + * are supported are: + * + * * `command_line` - either nil or a string of the various options that were + * set on the command line. Valid values are combinations of "a", "l", + * "n", "p", and "x". + * * `encoding` - the encoding of the source being parsed. This should be an + * encoding or nil. + * * `filepath` - the filepath of the source being parsed. This should be a + * string or nil. + * * `freeze` - whether or not to deeply freeze the AST. This should be a + * boolean or nil. 
+ * * `frozen_string_literal` - whether or not the frozen string literal pragma + * has been set. This should be a boolean or nil. + * * `line` - the line number that the parse starts on. This should be an + * integer or nil. Note that this is 1-indexed. + * * `main_script` - a boolean indicating whether or not the source being parsed + * is the main script being run by the interpreter. This controls whether + * or not shebangs are parsed for additional flags and whether or not the + * parser will attempt to find a matching shebang if the first one does + * not contain the word "ruby". + * * `partial_script` - when the file being parsed is considered a "partial" + * script, jumps will not be marked as errors if they are not contained + * within loops/blocks. This is used in the case that you're parsing a + * script that you know will be embedded inside another script later, but + * you do not have that context yet. For example, when parsing an ERB + * template that will be evaluated inside another script. + * * `scopes` - the locals that are in scope surrounding the code that is being + * parsed. This should be an array of arrays of symbols or nil. Scopes are + * ordered from the outermost scope to the innermost one. + * * `version` - the version of Ruby syntax that prism should used to parse Ruby + * code. By default prism assumes you want to parse with the latest + * version of Ruby syntax (which you can trigger with `nil` or + * `"latest"`). You may also restrict the syntax to a specific version of + * Ruby, e.g., with `"3.3.0"`. To parse with the same syntax version that + * the current Ruby is running use `version: "current"`. Raises + * ArgumentError if the version is not currently supported by Prism. 
+ */ +static VALUE +parse(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + string_options(argc, argv, &input, &options); + +#ifdef PRISM_BUILD_DEBUG + size_t length = pm_string_length(&input); + char* dup = xmalloc(length); + memcpy(dup, pm_string_source(&input), length); + pm_string_constant_init(&input, dup, length); +#endif + + VALUE value = parse_input(&input, &options); + +#ifdef PRISM_BUILD_DEBUG + xfree(dup); +#endif + + pm_string_free(&input); + pm_options_free(&options); + return value; +} + +/** + * call-seq: + * Prism::parse_file(filepath, **options) -> ParseResult + * + * Parse the given file and return a ParseResult instance. For supported + * options, see Prism::parse. + */ +static VALUE +parse_file(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); + + VALUE value = parse_input(&input, &options); + pm_string_free(&input); + pm_options_free(&options); + + return value; +} + +/** + * Parse the given input and return nothing. + */ +static void +profile_input(pm_string_t *input, const pm_options_t *options) { + pm_parser_t parser; + pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); + + pm_node_t *node = pm_parse(&parser); + pm_node_destroy(&parser, node); + pm_parser_free(&parser); +} + +/** + * call-seq: + * Prism::profile(source, **options) -> nil + * + * Parse the given string and return nothing. This method is meant to allow + * profilers to avoid the overhead of reifying the AST to Ruby. For supported + * options, see Prism::parse. 
+ */ +static VALUE +profile(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + + string_options(argc, argv, &input, &options); + profile_input(&input, &options); + pm_string_free(&input); + pm_options_free(&options); + + return Qnil; +} + +/** + * call-seq: + * Prism::profile_file(filepath, **options) -> nil + * + * Parse the given file and return nothing. This method is meant to allow + * profilers to avoid the overhead of reifying the AST to Ruby. For supported + * options, see Prism::parse. + */ +static VALUE +profile_file(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); + + profile_input(&input, &options); + pm_string_free(&input); + pm_options_free(&options); + + return Qnil; +} + +/** + * An end-of-file check that is suitable for use with Ruby IO objects. Calls + * `eof?` on the stream and returns 1 if the result is truthy, 0 otherwise. + */ +static int +parse_stream_eof(void *stream) { + // rb_funcall returns a VALUE, so test it with RTEST: nil is a non-zero + // VALUE and must not be mistaken for end-of-file. + VALUE eof = rb_funcall((VALUE) stream, rb_intern("eof?"), 0); + return RTEST(eof) ? 1 : 0; +} + +/** + * An implementation of fgets that is suitable for use with Ruby IO objects. + */ +static char * +parse_stream_fgets(char *string, int size, void *stream) { + RUBY_ASSERT(size > 0); + + VALUE line = rb_funcall((VALUE) stream, rb_intern("gets"), 1, INT2FIX(size - 1)); + if (NIL_P(line)) { + return NULL; + } + + const char *cstr = RSTRING_PTR(line); + long length = RSTRING_LEN(line); + + memcpy(string, cstr, length); + string[length] = '\0'; + + return string; +} + +/** + * call-seq: + * Prism::parse_stream(stream, **options) -> ParseResult + * + * Parse the given object that responds to `gets` and return a ParseResult + * instance. The options that are supported are the same as Prism::parse. 
+ */ +static VALUE +parse_stream(int argc, VALUE *argv, VALUE self) { + VALUE stream; + VALUE keywords; + rb_scan_args(argc, argv, "1:", &stream, &keywords); + + pm_options_t options = { 0 }; + extract_options(&options, Qnil, keywords); + + pm_parser_t parser; + pm_buffer_t buffer; + + pm_node_t *node = pm_parse_stream(&parser, &buffer, (void *) stream, parse_stream_fgets, parse_stream_eof, &options); + rb_encoding *encoding = rb_enc_find(parser.encoding->name); + + VALUE source = pm_source_new(&parser, encoding, options.freeze); + VALUE value = pm_ast_new(&parser, node, encoding, source, options.freeze); + VALUE result = parse_result_create(rb_cPrismParseResult, &parser, value, encoding, source, options.freeze); + + pm_node_destroy(&parser, node); + pm_buffer_free(&buffer); + pm_parser_free(&parser); + + return result; +} + +/** + * Parse the given input and return an array of Comment objects. + */ +static VALUE +parse_input_comments(pm_string_t *input, const pm_options_t *options) { + pm_parser_t parser; + pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); + + pm_node_t *node = pm_parse(&parser); + rb_encoding *encoding = rb_enc_find(parser.encoding->name); + + VALUE source = pm_source_new(&parser, encoding, options->freeze); + VALUE comments = parser_comments(&parser, source, options->freeze); + + pm_node_destroy(&parser, node); + pm_parser_free(&parser); + + return comments; +} + +/** + * call-seq: + * Prism::parse_comments(source, **options) -> Array + * + * Parse the given string and return an array of Comment objects. For supported + * options, see Prism::parse. 
+ */ +static VALUE +parse_comments(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + string_options(argc, argv, &input, &options); + + VALUE result = parse_input_comments(&input, &options); + pm_string_free(&input); + pm_options_free(&options); + + return result; +} + +/** + * call-seq: + * Prism::parse_file_comments(filepath, **options) -> Array + * + * Parse the given file and return an array of Comment objects. For supported + * options, see Prism::parse. + */ +static VALUE +parse_file_comments(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); + + VALUE value = parse_input_comments(&input, &options); + pm_string_free(&input); + pm_options_free(&options); + + return value; +} + +/** + * call-seq: + * Prism::parse_lex(source, **options) -> ParseLexResult + * + * Parse the given string and return a ParseLexResult instance that contains a + * 2-element array, where the first element is the AST and the second element is + * an array of Token instances. + * + * This API is only meant to be used in the case where you need both the AST and + * the tokens. If you only need one or the other, use either Prism::parse or + * Prism::lex. + * + * For supported options, see Prism::parse. + */ +static VALUE +parse_lex(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + string_options(argc, argv, &input, &options); + + VALUE value = parse_lex_input(&input, &options, true); + pm_string_free(&input); + pm_options_free(&options); + + return value; +} + +/** + * call-seq: + * Prism::parse_lex_file(filepath, **options) -> ParseLexResult + * + * Parse the given file and return a ParseLexResult instance that contains a + * 2-element array, where the first element is the AST and the second element is + * an array of Token instances. 
+ * + * This API is only meant to be used in the case where you need both the AST and + * the tokens. If you only need one or the other, use either Prism::parse_file + * or Prism::lex_file. + * + * For supported options, see Prism::parse. + */ +static VALUE +parse_lex_file(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); + + VALUE value = parse_lex_input(&input, &options, true); + pm_string_free(&input); + pm_options_free(&options); + + return value; +} + +/** + * Parse the given input and return true if it parses without errors. + */ +static VALUE +parse_input_success_p(pm_string_t *input, const pm_options_t *options) { + pm_parser_t parser; + pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); + + pm_node_t *node = pm_parse(&parser); + pm_node_destroy(&parser, node); + + VALUE result = parser.error_list.size == 0 ? Qtrue : Qfalse; + pm_parser_free(&parser); + + return result; +} + +/** + * call-seq: + * Prism::parse_success?(source, **options) -> bool + * + * Parse the given string and return true if it parses without errors. For + * supported options, see Prism::parse. + */ +static VALUE +parse_success_p(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + string_options(argc, argv, &input, &options); + + VALUE result = parse_input_success_p(&input, &options); + pm_string_free(&input); + pm_options_free(&options); + + return result; +} + +/** + * call-seq: + * Prism::parse_failure?(source, **options) -> bool + * + * Parse the given string and return true if it parses with errors. For + * supported options, see Prism::parse. + */ +static VALUE +parse_failure_p(int argc, VALUE *argv, VALUE self) { + return RTEST(parse_success_p(argc, argv, self)) ? 
Qfalse : Qtrue; +} + +/** + * call-seq: + * Prism::parse_file_success?(filepath, **options) -> bool + * + * Parse the given file and return true if it parses without errors. For + * supported options, see Prism::parse. + */ +static VALUE +parse_file_success_p(int argc, VALUE *argv, VALUE self) { + pm_string_t input; + pm_options_t options = { 0 }; + + VALUE encoded_filepath; + file_options(argc, argv, &input, &options, &encoded_filepath); + + VALUE result = parse_input_success_p(&input, &options); + pm_string_free(&input); + pm_options_free(&options); + + return result; +} + +/** + * call-seq: + * Prism::parse_file_failure?(filepath, **options) -> bool + * + * Parse the given file and return true if it parses with errors. For + * supported options, see Prism::parse. + */ +static VALUE +parse_file_failure_p(int argc, VALUE *argv, VALUE self) { + return RTEST(parse_file_success_p(argc, argv, self)) ? Qfalse : Qtrue; +} + +/******************************************************************************/ +/* String query methods */ +/******************************************************************************/ + +/** + * Process the result of a call to a string query method and return an + * appropriate value. + */ +static VALUE +string_query(pm_string_query_t result) { + switch (result) { + case PM_STRING_QUERY_ERROR: + rb_raise(rb_eArgError, "Invalid or non ascii-compatible encoding"); + return Qfalse; + case PM_STRING_QUERY_FALSE: + return Qfalse; + case PM_STRING_QUERY_TRUE: + return Qtrue; + } + return Qfalse; +} + +/** + * call-seq: + * Prism::StringQuery::local?(string) -> bool + * + * Returns true if the string constitutes a valid local variable name. Note that + * this means the names that can be set through Binding#local_variable_set, not + * necessarily the ones that can be set through a local variable assignment. 
+ */ +static VALUE +string_query_local_p(VALUE self, VALUE string) { + const uint8_t *source = (const uint8_t *) check_string(string); + return string_query(pm_string_query_local(source, RSTRING_LEN(string), rb_enc_get(string)->name)); +} + +/** + * call-seq: + * Prism::StringQuery::constant?(string) -> bool + * + * Returns true if the string constitutes a valid constant name. Note that this + * means the names that can be set through Module#const_set, not necessarily the + * ones that can be set through a constant assignment. + */ +static VALUE +string_query_constant_p(VALUE self, VALUE string) { + const uint8_t *source = (const uint8_t *) check_string(string); + return string_query(pm_string_query_constant(source, RSTRING_LEN(string), rb_enc_get(string)->name)); +} + +/** + * call-seq: + * Prism::StringQuery::method_name?(string) -> bool + * + * Returns true if the string constitutes a valid method name. + */ +static VALUE +string_query_method_name_p(VALUE self, VALUE string) { + const uint8_t *source = (const uint8_t *) check_string(string); + return string_query(pm_string_query_method_name(source, RSTRING_LEN(string), rb_enc_get(string)->name)); +} + +/******************************************************************************/ +/* Initialization of the extension */ +/******************************************************************************/ + +/** + * The init function that Ruby calls when loading this extension. + */ +RUBY_FUNC_EXPORTED void +Init_prism(void) { + // Make sure that the prism library version matches the expected version. + // Otherwise something was compiled incorrectly. + if (strcmp(pm_version(), EXPECTED_PRISM_VERSION) != 0) { + rb_raise( + rb_eRuntimeError, + "The prism library version (%s) does not match the expected version (%s)", + pm_version(), + EXPECTED_PRISM_VERSION + ); + } + +#ifdef HAVE_RB_EXT_RACTOR_SAFE + // Mark this extension as Ractor-safe. 
+ rb_ext_ractor_safe(true); +#endif + + // Grab up references to all of the constants that we're going to need to + // reference throughout this extension. + rb_cPrism = rb_define_module("Prism"); + rb_cPrismNode = rb_define_class_under(rb_cPrism, "Node", rb_cObject); + rb_cPrismSource = rb_define_class_under(rb_cPrism, "Source", rb_cObject); + rb_cPrismToken = rb_define_class_under(rb_cPrism, "Token", rb_cObject); + rb_cPrismLocation = rb_define_class_under(rb_cPrism, "Location", rb_cObject); + rb_cPrismComment = rb_define_class_under(rb_cPrism, "Comment", rb_cObject); + rb_cPrismInlineComment = rb_define_class_under(rb_cPrism, "InlineComment", rb_cPrismComment); + rb_cPrismEmbDocComment = rb_define_class_under(rb_cPrism, "EmbDocComment", rb_cPrismComment); + rb_cPrismMagicComment = rb_define_class_under(rb_cPrism, "MagicComment", rb_cObject); + rb_cPrismParseError = rb_define_class_under(rb_cPrism, "ParseError", rb_cObject); + rb_cPrismParseWarning = rb_define_class_under(rb_cPrism, "ParseWarning", rb_cObject); + rb_cPrismResult = rb_define_class_under(rb_cPrism, "Result", rb_cObject); + rb_cPrismParseResult = rb_define_class_under(rb_cPrism, "ParseResult", rb_cPrismResult); + rb_cPrismLexResult = rb_define_class_under(rb_cPrism, "LexResult", rb_cPrismResult); + rb_cPrismParseLexResult = rb_define_class_under(rb_cPrism, "ParseLexResult", rb_cPrismResult); + rb_cPrismStringQuery = rb_define_class_under(rb_cPrism, "StringQuery", rb_cObject); + rb_cPrismScope = rb_define_class_under(rb_cPrism, "Scope", rb_cObject); + + rb_cPrismCurrentVersionError = rb_const_get(rb_cPrism, rb_intern("CurrentVersionError")); + + // Intern all of the IDs eagerly that we support so that we don't have to do + // it every time we parse. 
+ rb_id_option_command_line = rb_intern_const("command_line"); + rb_id_option_encoding = rb_intern_const("encoding"); + rb_id_option_filepath = rb_intern_const("filepath"); + rb_id_option_freeze = rb_intern_const("freeze"); + rb_id_option_frozen_string_literal = rb_intern_const("frozen_string_literal"); + rb_id_option_line = rb_intern_const("line"); + rb_id_option_main_script = rb_intern_const("main_script"); + rb_id_option_partial_script = rb_intern_const("partial_script"); + rb_id_option_scopes = rb_intern_const("scopes"); + rb_id_option_version = rb_intern_const("version"); + rb_id_source_for = rb_intern("for"); + rb_id_forwarding_positionals = rb_intern("*"); + rb_id_forwarding_keywords = rb_intern("**"); + rb_id_forwarding_block = rb_intern("&"); + rb_id_forwarding_all = rb_intern("..."); + + /** + * The version of the prism library. + */ + rb_define_const(rb_cPrism, "VERSION", rb_str_freeze(rb_str_new_cstr(EXPECTED_PRISM_VERSION))); + + // First, the functions that have to do with lexing and parsing. 
+ rb_define_singleton_method(rb_cPrism, "lex", lex, -1); + rb_define_singleton_method(rb_cPrism, "lex_file", lex_file, -1); + rb_define_singleton_method(rb_cPrism, "parse", parse, -1); + rb_define_singleton_method(rb_cPrism, "parse_file", parse_file, -1); + rb_define_singleton_method(rb_cPrism, "profile", profile, -1); + rb_define_singleton_method(rb_cPrism, "profile_file", profile_file, -1); + rb_define_singleton_method(rb_cPrism, "parse_stream", parse_stream, -1); + rb_define_singleton_method(rb_cPrism, "parse_comments", parse_comments, -1); + rb_define_singleton_method(rb_cPrism, "parse_file_comments", parse_file_comments, -1); + rb_define_singleton_method(rb_cPrism, "parse_lex", parse_lex, -1); + rb_define_singleton_method(rb_cPrism, "parse_lex_file", parse_lex_file, -1); + rb_define_singleton_method(rb_cPrism, "parse_success?", parse_success_p, -1); + rb_define_singleton_method(rb_cPrism, "parse_failure?", parse_failure_p, -1); + rb_define_singleton_method(rb_cPrism, "parse_file_success?", parse_file_success_p, -1); + rb_define_singleton_method(rb_cPrism, "parse_file_failure?", parse_file_failure_p, -1); + +#ifndef PRISM_EXCLUDE_SERIALIZATION + rb_define_singleton_method(rb_cPrism, "dump", dump, -1); + rb_define_singleton_method(rb_cPrism, "dump_file", dump_file, -1); +#endif + + rb_define_singleton_method(rb_cPrismStringQuery, "local?", string_query_local_p, 1); + rb_define_singleton_method(rb_cPrismStringQuery, "constant?", string_query_constant_p, 1); + rb_define_singleton_method(rb_cPrismStringQuery, "method_name?", string_query_method_name_p, 1); + + // Next, initialize the other APIs. 
+ Init_prism_api_node(); + Init_prism_pack(); +} diff --git a/prism/extension.h b/prism/extension.h new file mode 100644 index 0000000000..510faa48e8 --- /dev/null +++ b/prism/extension.h @@ -0,0 +1,19 @@ +#ifndef PRISM_EXT_NODE_H +#define PRISM_EXT_NODE_H + +#define EXPECTED_PRISM_VERSION "1.8.0" + +#include <ruby.h> +#include <ruby/encoding.h> +#include "prism.h" + +VALUE pm_source_new(const pm_parser_t *parser, rb_encoding *encoding, bool freeze); +VALUE pm_token_new(const pm_parser_t *parser, const pm_token_t *token, rb_encoding *encoding, VALUE source, bool freeze); +VALUE pm_ast_new(const pm_parser_t *parser, const pm_node_t *node, rb_encoding *encoding, VALUE source, bool freeze); +VALUE pm_integer_new(const pm_integer_t *integer); + +void Init_prism_api_node(void); +void Init_prism_pack(void); +RUBY_FUNC_EXPORTED void Init_prism(void); + +#endif diff --git a/prism/node.h b/prism/node.h new file mode 100644 index 0000000000..e8686a327c --- /dev/null +++ b/prism/node.h @@ -0,0 +1,129 @@ +/** + * @file node.h + * + * Functions related to nodes in the AST. + */ +#ifndef PRISM_NODE_H +#define PRISM_NODE_H + +#include "prism/defines.h" +#include "prism/parser.h" +#include "prism/util/pm_buffer.h" + +/** + * Loop through each node in the node list, writing each node to the given + * pm_node_t pointer. + */ +#define PM_NODE_LIST_FOREACH(list, index, node) \ + for (size_t index = 0; index < (list)->size && ((node) = (list)->nodes[index]); index++) + +/** + * Append a new node onto the end of the node list. + * + * @param list The list to append to. + * @param node The node to append. + */ +void pm_node_list_append(pm_node_list_t *list, pm_node_t *node); + +/** + * Prepend a new node onto the beginning of the node list. + * + * @param list The list to prepend to. + * @param node The node to prepend. + */ +void pm_node_list_prepend(pm_node_list_t *list, pm_node_t *node); + +/** + * Concatenate the given node list onto the end of the other node list. 
+ * + * @param list The list to concatenate onto. + * @param other The list to concatenate. + */ +void pm_node_list_concat(pm_node_list_t *list, pm_node_list_t *other); + +/** + * Free the internal memory associated with the given node list. + * + * @param list The list to free. + */ +void pm_node_list_free(pm_node_list_t *list); + +/** + * Deallocate a node and all of its children. + * + * @param parser The parser that owns the node. + * @param node The node to deallocate. + */ +PRISM_EXPORTED_FUNCTION void pm_node_destroy(pm_parser_t *parser, struct pm_node *node); + +/** + * Returns a string representation of the given node type. + * + * @param node_type The node type to convert to a string. + * @return A string representation of the given node type. + */ +PRISM_EXPORTED_FUNCTION const char * pm_node_type_to_str(pm_node_type_t node_type); + +/** + * Visit each of the nodes in this subtree using the given visitor callback. The + * callback function will be called for each node in the subtree. If it returns + * false, then that node's children will not be visited. If it returns true, + * then the children will be visited. The data parameter is treated as an opaque + * pointer and is passed to the visitor callback for consumers to use as they + * see fit. 
+ * + * As an example: + * + * ```c + * #include "prism.h" + * + * bool visit(const pm_node_t *node, void *data) { + * size_t *indent = (size_t *) data; + * for (size_t i = 0; i < *indent * 2; i++) putc(' ', stdout); + * printf("%s\n", pm_node_type_to_str(node->type)); + * + * size_t next_indent = *indent + 1; + * size_t *next_data = &next_indent; + * pm_visit_child_nodes(node, visit, next_data); + * + * return false; + * } + * + * int main(void) { + * const char *source = "1 + 2; 3 + 4"; + * size_t size = strlen(source); + * + * pm_parser_t parser; + * pm_options_t options = { 0 }; + * pm_parser_init(&parser, (const uint8_t *) source, size, &options); + * + * size_t indent = 0; + * pm_node_t *node = pm_parse(&parser); + * + * size_t *data = &indent; + * pm_visit_node(node, visit, data); + * + * pm_node_destroy(&parser, node); + * pm_parser_free(&parser); + * return EXIT_SUCCESS; + * } + * ``` + * + * @param node The root node to start visiting from. + * @param visitor The callback to call for each node in the subtree. + * @param data An opaque pointer that is passed to the visitor callback. + */ +PRISM_EXPORTED_FUNCTION void pm_visit_node(const pm_node_t *node, bool (*visitor)(const pm_node_t *node, void *data), void *data); + +/** + * Visit the children of the given node with the given callback. This is the + * default behavior for walking the tree that is called from pm_visit_node if + * the callback returns true. + * + * @param node The node to visit the children of. + * @param visitor The callback to call for each child node. + * @param data An opaque pointer that is passed to the visitor callback. 
+ */ +PRISM_EXPORTED_FUNCTION void pm_visit_child_nodes(const pm_node_t *node, bool (*visitor)(const pm_node_t *node, void *data), void *data); + +#endif diff --git a/prism/options.c b/prism/options.c new file mode 100644 index 0000000000..09d2a65a6c --- /dev/null +++ b/prism/options.c @@ -0,0 +1,338 @@ +#include "prism/options.h" + +/** + * Set the shebang callback option on the given options struct. + */ +PRISM_EXPORTED_FUNCTION void +pm_options_shebang_callback_set(pm_options_t *options, pm_options_shebang_callback_t shebang_callback, void *shebang_callback_data) { + options->shebang_callback = shebang_callback; + options->shebang_callback_data = shebang_callback_data; +} + +/** + * Set the filepath option on the given options struct. + */ +PRISM_EXPORTED_FUNCTION void +pm_options_filepath_set(pm_options_t *options, const char *filepath) { + pm_string_constant_init(&options->filepath, filepath, strlen(filepath)); +} + +/** + * Set the encoding option on the given options struct. + */ +PRISM_EXPORTED_FUNCTION void +pm_options_encoding_set(pm_options_t *options, const char *encoding) { + pm_string_constant_init(&options->encoding, encoding, strlen(encoding)); +} + +/** + * Set the encoding_locked option on the given options struct. + */ +PRISM_EXPORTED_FUNCTION void +pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked) { + options->encoding_locked = encoding_locked; +} + +/** + * Set the line option on the given options struct. + */ +PRISM_EXPORTED_FUNCTION void +pm_options_line_set(pm_options_t *options, int32_t line) { + options->line = line; +} + +/** + * Set the frozen string literal option on the given options struct. + */ +PRISM_EXPORTED_FUNCTION void +pm_options_frozen_string_literal_set(pm_options_t *options, bool frozen_string_literal) { + options->frozen_string_literal = frozen_string_literal ? 
PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED : PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED; +} + +/** + * Sets the command line option on the given options struct. + */ +PRISM_EXPORTED_FUNCTION void +pm_options_command_line_set(pm_options_t *options, uint8_t command_line) { + options->command_line = command_line; +} + +/** + * Checks if the given slice represents a number. + */ +static inline bool +is_number(const char *string, size_t length) { + return pm_strspn_decimal_digit((const uint8_t *) string, (ptrdiff_t) length) == length; +} + +/** + * Set the version option on the given options struct by parsing the given + * string. If the string contains an invalid option, this returns false. + * Otherwise, it returns true. + */ +PRISM_EXPORTED_FUNCTION bool +pm_options_version_set(pm_options_t *options, const char *version, size_t length) { + if (version == NULL) { + options->version = PM_OPTIONS_VERSION_LATEST; + return true; + } + + if (length == 3) { + if (strncmp(version, "3.3", 3) == 0) { + options->version = PM_OPTIONS_VERSION_CRUBY_3_3; + return true; + } + + if (strncmp(version, "3.4", 3) == 0) { + options->version = PM_OPTIONS_VERSION_CRUBY_3_4; + return true; + } + + if (strncmp(version, "3.5", 3) == 0 || strncmp(version, "4.0", 3) == 0) { + options->version = PM_OPTIONS_VERSION_CRUBY_4_0; + return true; + } + + if (strncmp(version, "4.1", 3) == 0) { + options->version = PM_OPTIONS_VERSION_CRUBY_4_1; + return true; + } + + return false; + } + + if (length >= 4 && is_number(version + 4, length - 4)) { + if (strncmp(version, "3.3.", 4) == 0) { + options->version = PM_OPTIONS_VERSION_CRUBY_3_3; + return true; + } + + if (strncmp(version, "3.4.", 4) == 0) { + options->version = PM_OPTIONS_VERSION_CRUBY_3_4; + return true; + } + + if (strncmp(version, "3.5.", 4) == 0 || strncmp(version, "4.0.", 4) == 0) { + options->version = PM_OPTIONS_VERSION_CRUBY_4_0; + return true; + } + + if (strncmp(version, "4.1.", 4) == 0) { + options->version = PM_OPTIONS_VERSION_CRUBY_4_1; 
+ return true; + } + } + + if (length >= 6) { + if (strncmp(version, "latest", 7) == 0) { // 7 to compare the \0 as well + options->version = PM_OPTIONS_VERSION_LATEST; + return true; + } + } + + return false; +} + +/** + * Set the main script option on the given options struct. + */ +PRISM_EXPORTED_FUNCTION void +pm_options_main_script_set(pm_options_t *options, bool main_script) { + options->main_script = main_script; +} + +/** + * Set the partial script option on the given options struct. + */ +PRISM_EXPORTED_FUNCTION void +pm_options_partial_script_set(pm_options_t *options, bool partial_script) { + options->partial_script = partial_script; +} + +/** + * Set the freeze option on the given options struct. + */ +PRISM_EXPORTED_FUNCTION void +pm_options_freeze_set(pm_options_t *options, bool freeze) { + options->freeze = freeze; +} + +// For some reason, GCC analyzer thinks we're leaking allocated scopes and +// locals here, even though we definitely aren't. This is a false positive. +// Ideally we wouldn't need to suppress this. +#if defined(__GNUC__) && (__GNUC__ >= 10) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wanalyzer-malloc-leak" +#endif + +/** + * Allocate and zero out the scopes array on the given options struct. Returns + * true on success. On allocation failure this returns false and leaves the + * struct with a scopes_count of 0 and a NULL scopes pointer, so that + * pm_options_free never walks a missing array. + */ +PRISM_EXPORTED_FUNCTION bool +pm_options_scopes_init(pm_options_t *options, size_t scopes_count) { + options->scopes = xcalloc(scopes_count, sizeof(pm_options_scope_t)); + + // A zero-sized allocation is allowed to return NULL, so only treat NULL as + // a failure when scopes were actually requested. + if (options->scopes == NULL && scopes_count > 0) { + options->scopes_count = 0; + return false; + } + + options->scopes_count = scopes_count; + return true; +} + +/** + * Return a pointer to the scope at the given index within the given options. + */ +PRISM_EXPORTED_FUNCTION const pm_options_scope_t * +pm_options_scope_get(const pm_options_t *options, size_t index) { + return &options->scopes[index]; +} + +/** + * Create a new options scope struct. This will hold a set of locals that are in + * scope surrounding the code that is being parsed. 
+ */ +PRISM_EXPORTED_FUNCTION bool +pm_options_scope_init(pm_options_scope_t *scope, size_t locals_count) { + scope->forwarding = PM_OPTIONS_SCOPE_FORWARDING_NONE; + scope->locals = xcalloc(locals_count, sizeof(pm_string_t)); + + // A zero-sized allocation is allowed to return NULL, so only treat NULL as + // a failure when locals were actually requested. On failure the count is + // reset so that pm_options_free never indexes a missing array. + if (scope->locals == NULL && locals_count > 0) { + scope->locals_count = 0; + return false; + } + + scope->locals_count = locals_count; + return true; +} + +/** + * Return a pointer to the local at the given index within the given scope. + */ +PRISM_EXPORTED_FUNCTION const pm_string_t * +pm_options_scope_local_get(const pm_options_scope_t *scope, size_t index) { + return &scope->locals[index]; +} + +/** + * Set the forwarding option on the given scope struct. + */ +PRISM_EXPORTED_FUNCTION void +pm_options_scope_forwarding_set(pm_options_scope_t *scope, uint8_t forwarding) { + scope->forwarding = forwarding; +} + +/** + * Free the internal memory associated with the options. The scopes pointer and + * count are reset afterwards so that the scopes are not freed twice if this + * is called again on the same struct (pm_options_read already frees on an + * allocation failure before returning to its caller). + */ +PRISM_EXPORTED_FUNCTION void +pm_options_free(pm_options_t *options) { + pm_string_free(&options->filepath); + pm_string_free(&options->encoding); + + for (size_t scope_index = 0; scope_index < options->scopes_count; scope_index++) { + pm_options_scope_t *scope = &options->scopes[scope_index]; + + for (size_t local_index = 0; local_index < scope->locals_count; local_index++) { + pm_string_free(&scope->locals[local_index]); + } + + xfree(scope->locals); + } + + xfree(options->scopes); + options->scopes = NULL; + options->scopes_count = 0; +} + +/** + * Read a 32-bit unsigned integer from a pointer. This function is used to read + * the options that are passed into the parser from the Ruby implementation. It + * handles aligned and unaligned reads. + */ +static uint32_t +pm_options_read_u32(const char *data) { + if (((uintptr_t) data) % sizeof(uint32_t) == 0) { + return *((uint32_t *) data); + } else { + uint32_t value; + memcpy(&value, data, sizeof(uint32_t)); + return value; + } +} + +/** + * Read a 32-bit signed integer from a pointer. This function is used to read + * the options that are passed into the parser from the Ruby implementation. It + * handles aligned and unaligned reads. 
+ */ +static int32_t +pm_options_read_s32(const char *data) { + if (((uintptr_t) data) % sizeof(int32_t) == 0) { + return *((int32_t *) data); + } else { + int32_t value; + memcpy(&value, data, sizeof(int32_t)); + return value; + } +} + +/** + * Deserialize an options struct from the given binary string. This is used to + * pass options to the parser from an FFI call so that consumers of the library + * from an FFI perspective don't have to worry about the structure of our + * options structs. Since the source of these calls will be from Ruby + * implementation internals we assume it is from a trusted source. + */ +void +pm_options_read(pm_options_t *options, const char *data) { + options->line = 1; // default + if (data == NULL) return; + + uint32_t filepath_length = pm_options_read_u32(data); + data += 4; + + if (filepath_length > 0) { + pm_string_constant_init(&options->filepath, data, filepath_length); + data += filepath_length; + } + + options->line = pm_options_read_s32(data); + data += 4; + + uint32_t encoding_length = pm_options_read_u32(data); + data += 4; + + if (encoding_length > 0) { + pm_string_constant_init(&options->encoding, data, encoding_length); + data += encoding_length; + } + + options->frozen_string_literal = (int8_t) *data++; + options->command_line = (uint8_t) *data++; + options->version = (pm_options_version_t) *data++; + options->encoding_locked = ((uint8_t) *data++) > 0; + options->main_script = ((uint8_t) *data++) > 0; + options->partial_script = ((uint8_t) *data++) > 0; + options->freeze = ((uint8_t) *data++) > 0; + + uint32_t scopes_count = pm_options_read_u32(data); + data += 4; + + if (scopes_count > 0) { + if (!pm_options_scopes_init(options, scopes_count)) return; + + for (size_t scope_index = 0; scope_index < scopes_count; scope_index++) { + uint32_t locals_count = pm_options_read_u32(data); + data += 4; + + pm_options_scope_t *scope = &options->scopes[scope_index]; + if (!pm_options_scope_init(scope, locals_count)) { + 
pm_options_free(options); + return; + } + + uint8_t forwarding = (uint8_t) *data++; + pm_options_scope_forwarding_set(&options->scopes[scope_index], forwarding); + + for (size_t local_index = 0; local_index < locals_count; local_index++) { + uint32_t local_length = pm_options_read_u32(data); + data += 4; + + pm_string_constant_init(&scope->locals[local_index], data, local_length); + data += local_length; + } + } + } +} + +#if defined(__GNUC__) && (__GNUC__ >= 10) +#pragma GCC diagnostic pop +#endif diff --git a/prism/options.h b/prism/options.h new file mode 100644 index 0000000000..c00c7bf755 --- /dev/null +++ b/prism/options.h @@ -0,0 +1,488 @@ +/** + * @file options.h + * + * The options that can be passed to parsing. + */ +#ifndef PRISM_OPTIONS_H +#define PRISM_OPTIONS_H + +#include "prism/defines.h" +#include "prism/util/pm_char.h" +#include "prism/util/pm_string.h" + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +/** + * String literals should be made mutable. + */ +#define PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED ((int8_t) -1) + +/** + * String literals may be frozen or mutable depending on the implementation + * default. + */ +#define PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET ((int8_t) 0) + +/** + * String literals should be made frozen. + */ +#define PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED ((int8_t) 1) + +/** + * A scope of locals surrounding the code that is being parsed. + */ +typedef struct pm_options_scope { + /** The number of locals in the scope. */ + size_t locals_count; + + /** The names of the locals in the scope. */ + pm_string_t *locals; + + /** Flags for the set of forwarding parameters in this scope. */ + uint8_t forwarding; +} pm_options_scope_t; + +/** The default value for parameters. */ +static const uint8_t PM_OPTIONS_SCOPE_FORWARDING_NONE = 0x0; + +/** When the scope is forwarding with the * parameter. 
*/ +static const uint8_t PM_OPTIONS_SCOPE_FORWARDING_POSITIONALS = 0x1; + +/** When the scope is forwarding with the ** parameter. */ +static const uint8_t PM_OPTIONS_SCOPE_FORWARDING_KEYWORDS = 0x2; + +/** When the scope is forwarding with the & parameter. */ +static const uint8_t PM_OPTIONS_SCOPE_FORWARDING_BLOCK = 0x4; + +/** When the scope is forwarding with the ... parameter. */ +static const uint8_t PM_OPTIONS_SCOPE_FORWARDING_ALL = 0x8; + +// Forward declaration needed by the callback typedef. +struct pm_options; + +/** + * The callback called when additional switches are found in a shebang comment + * that need to be processed by the runtime. + * + * @param options The options struct that may be updated by this callback. + * Certain fields will be checked for changes, specifically encoding, + * command_line, and frozen_string_literal. + * @param source The source of the shebang comment. + * @param length The length of the source. + * @param shebang_callback_data Any additional data that should be passed along + * to the callback. + */ +typedef void (*pm_options_shebang_callback_t)(struct pm_options *options, const uint8_t *source, size_t length, void *shebang_callback_data); + +/** + * The version of Ruby syntax that we should be parsing with. This is used to + * allow consumers to specify which behavior they want in case they need to + * parse in the same way as a specific version of CRuby would have. + */ +typedef enum { + /** If an explicit version is not provided, the current version of prism will be used. */ + PM_OPTIONS_VERSION_UNSET = 0, + + /** The vendored version of prism in CRuby 3.3.x. */ + PM_OPTIONS_VERSION_CRUBY_3_3 = 1, + + /** The vendored version of prism in CRuby 3.4.x. */ + PM_OPTIONS_VERSION_CRUBY_3_4 = 2, + + /** The vendored version of prism in CRuby 4.0.x (alias of PM_OPTIONS_VERSION_CRUBY_4_0). */ + PM_OPTIONS_VERSION_CRUBY_3_5 = 3, + + /** The vendored version of prism in CRuby 4.0.x. 
*/ + PM_OPTIONS_VERSION_CRUBY_4_0 = 3, + + /** The vendored version of prism in CRuby 4.1.x. */ + PM_OPTIONS_VERSION_CRUBY_4_1 = 4, + + /** The current version of prism. */ + PM_OPTIONS_VERSION_LATEST = PM_OPTIONS_VERSION_CRUBY_4_1 +} pm_options_version_t; + +/** + * The options that can be passed to the parser. + */ +typedef struct pm_options { + /** + * The callback to call when additional switches are found in a shebang + * comment. + */ + pm_options_shebang_callback_t shebang_callback; + + /** + * Any additional data that should be passed along to the shebang callback + * if one was set. + */ + void *shebang_callback_data; + + /** The name of the file that is currently being parsed. */ + pm_string_t filepath; + + /** + * The line within the file that the parse starts on. This value is + * 1-indexed. + */ + int32_t line; + + /** + * The name of the encoding that the source file is in. Note that this must + * correspond to a name that can be found with Encoding.find in Ruby. + */ + pm_string_t encoding; + + /** + * The number of scopes surrounding the code that is being parsed. + */ + size_t scopes_count; + + /** + * The scopes surrounding the code that is being parsed. For most parses + * this will be NULL, but for evals it will be the locals that are in scope + * surrounding the eval. Scopes are ordered from the outermost scope to the + * innermost one. + */ + pm_options_scope_t *scopes; + + /** + * The version of prism that we should be parsing with. This is used to + * allow consumers to specify which behavior they want in case they need to + * parse exactly as a specific version of CRuby. + */ + pm_options_version_t version; + + /** A bitset of the various options that were set on the command line. */ + uint8_t command_line; + + /** + * Whether or not the frozen string literal option has been set. 
+ * May be: + * - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED + * - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED + * - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET + */ + int8_t frozen_string_literal; + + /** + * Whether or not the encoding magic comments should be respected. This is a + * niche use-case where you want to parse a file with a specific encoding + * but ignore any encoding magic comments at the top of the file. + */ + bool encoding_locked; + + /** + * When the file being parsed is the main script, the shebang will be + * considered for command-line flags (or for implicit -x). The caller needs + * to pass this information to the parser so that it can behave correctly. + */ + bool main_script; + + /** + * When the file being parsed is considered a "partial" script, jumps will + * not be marked as errors if they are not contained within loops/blocks. + * This is used in the case that you're parsing a script that you know will + * be embedded inside another script later, but you do not have that context + * yet. For example, when parsing an ERB template that will be evaluated + * inside another script. + */ + bool partial_script; + + /** + * Whether or not the parser should freeze the nodes that it creates. This + * makes it possible to have a deeply frozen AST that is safe to share + * between concurrency primitives. + */ + bool freeze; +} pm_options_t; + +/** + * A bit representing whether or not the command line -a option was set. -a + * splits the input line $_ into $F. + */ +static const uint8_t PM_OPTIONS_COMMAND_LINE_A = 0x1; + +/** + * A bit representing whether or not the command line -e option was set. -e + * allows the user to specify a script to be executed. This is necessary for + * prism to know because certain warnings are not generated when -e is used. + */ +static const uint8_t PM_OPTIONS_COMMAND_LINE_E = 0x2; + +/** + * A bit representing whether or not the command line -l option was set. -l + * chomps the input line by default. 
+ */ +static const uint8_t PM_OPTIONS_COMMAND_LINE_L = 0x4; + +/** + * A bit representing whether or not the command line -n option was set. -n + * wraps the script in a while gets loop. + */ +static const uint8_t PM_OPTIONS_COMMAND_LINE_N = 0x8; + +/** + * A bit representing whether or not the command line -p option was set. -p + * prints the value of $_ at the end of each loop. + */ +static const uint8_t PM_OPTIONS_COMMAND_LINE_P = 0x10; + +/** + * A bit representing whether or not the command line -x option was set. -x + * searches the input file for a shebang that matches the current Ruby engine. + */ +static const uint8_t PM_OPTIONS_COMMAND_LINE_X = 0x20; + +/** + * Set the shebang callback option on the given options struct. + * + * @param options The options struct to set the shebang callback on. + * @param shebang_callback The shebang callback to set. + * @param shebang_callback_data Any additional data that should be passed along + * to the callback. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION void pm_options_shebang_callback_set(pm_options_t *options, pm_options_shebang_callback_t shebang_callback, void *shebang_callback_data); + +/** + * Set the filepath option on the given options struct. + * + * @param options The options struct to set the filepath on. + * @param filepath The filepath to set. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION void pm_options_filepath_set(pm_options_t *options, const char *filepath); + +/** + * Set the line option on the given options struct. + * + * @param options The options struct to set the line on. + * @param line The line to set. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION void pm_options_line_set(pm_options_t *options, int32_t line); + +/** + * Set the encoding option on the given options struct. + * + * @param options The options struct to set the encoding on. + * @param encoding The encoding to set. 
+ * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION void pm_options_encoding_set(pm_options_t *options, const char *encoding); + +/** + * Set the encoding_locked option on the given options struct. + * + * @param options The options struct to set the encoding_locked value on. + * @param encoding_locked The encoding_locked value to set. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION void pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked); + +/** + * Set the frozen string literal option on the given options struct. + * + * @param options The options struct to set the frozen string literal value on. + * @param frozen_string_literal The frozen string literal value to set. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION void pm_options_frozen_string_literal_set(pm_options_t *options, bool frozen_string_literal); + +/** + * Sets the command line option on the given options struct. + * + * @param options The options struct to set the command line option on. + * @param command_line The command_line value to set. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION void pm_options_command_line_set(pm_options_t *options, uint8_t command_line); + +/** + * Set the version option on the given options struct by parsing the given + * string. If the string contains an invalid option, this returns false. + * Otherwise, it returns true. + * + * @param options The options struct to set the version on. + * @param version The version to set. + * @param length The length of the version string. + * @return Whether or not the version was parsed successfully. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION bool pm_options_version_set(pm_options_t *options, const char *version, size_t length); + +/** + * Set the main script option on the given options struct. + * + * @param options The options struct to set the main script value on. 
+ * @param main_script The main script value to set. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION void pm_options_main_script_set(pm_options_t *options, bool main_script); + +/** + * Set the partial script option on the given options struct. + * + * @param options The options struct to set the partial script value on. + * @param partial_script The partial script value to set. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION void pm_options_partial_script_set(pm_options_t *options, bool partial_script); + +/** + * Set the freeze option on the given options struct. + * + * @param options The options struct to set the freeze value on. + * @param freeze The freeze value to set. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION void pm_options_freeze_set(pm_options_t *options, bool freeze); + +/** + * Allocate and zero out the scopes array on the given options struct. + * + * @param options The options struct to initialize the scopes array on. + * @param scopes_count The number of scopes to allocate. + * @return Whether or not the scopes array was initialized successfully. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION bool pm_options_scopes_init(pm_options_t *options, size_t scopes_count); + +/** + * Return a pointer to the scope at the given index within the given options. + * + * @param options The options struct to get the scope from. + * @param index The index of the scope to get. + * @return A pointer to the scope at the given index. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION const pm_options_scope_t * pm_options_scope_get(const pm_options_t *options, size_t index); + +/** + * Create a new options scope struct. This will hold a set of locals that are in + * scope surrounding the code that is being parsed. + * + * @param scope The scope struct to initialize. + * @param locals_count The number of locals to allocate. 
+ * @return Whether or not the scope was initialized successfully. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION bool pm_options_scope_init(pm_options_scope_t *scope, size_t locals_count); + +/** + * Return a pointer to the local at the given index within the given scope. + * + * @param scope The scope struct to get the local from. + * @param index The index of the local to get. + * @return A pointer to the local at the given index. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION const pm_string_t * pm_options_scope_local_get(const pm_options_scope_t *scope, size_t index); + +/** + * Set the forwarding option on the given scope struct. + * + * @param scope The scope struct to set the forwarding on. + * @param forwarding The forwarding value to set. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION void pm_options_scope_forwarding_set(pm_options_scope_t *scope, uint8_t forwarding); + +/** + * Free the internal memory associated with the options. + * + * @param options The options struct whose internal memory should be freed. + * + * \public \memberof pm_options + */ +PRISM_EXPORTED_FUNCTION void pm_options_free(pm_options_t *options); + +/** + * Deserialize an options struct from the given binary string. This is used to + * pass options to the parser from an FFI call so that consumers of the library + * from an FFI perspective don't have to worry about the structure of our + * options structs. Since the source of these calls will be from Ruby + * implementation internals we assume it is from a trusted source. + * + * `data` is assumed to be a valid pointer pointing to well-formed data. The + * layout of this data should be the same every time, and is described below: + * + * | # bytes | field | + * | ------- | -------------------------- | + * | `4` | the length of the filepath | + * | ... | the filepath bytes | + * | `4` | the line number | + * | `4` | the length the encoding | + * | ... 
| the encoding bytes | + * | `1` | frozen string literal | + * | `1` | -p command line option | + * | `1` | -n command line option | + * | `1` | -l command line option | + * | `1` | -a command line option | + * | `1` | the version | + * | `1` | encoding locked | + * | `1` | main script | + * | `1` | partial script | + * | `1` | freeze | + * | `4` | the number of scopes | + * | ... | the scopes | + * + * The version field is an enum, so it should be one of the following values: + * + * | value | version | + * | ----- | ------------------------- | + * | `0` | use the latest version of prism | + * | `1` | use the version of prism that is vendored in CRuby 3.3.0 | + * + * NOTE(review): pm_options_version_t also defines the values 2, 3 and 4 for + * later CRuby releases; confirm whether they are accepted here and extend + * this table if so. + * + * Each scope is laid out as follows: + * + * | # bytes | field | + * | ------- | -------------------------- | + * | `4` | the number of locals | + * | `1` | the forwarding flags | + * | ... | the locals | + * + * Each local is laid out as follows: + * + * | # bytes | field | + * | ------- | -------------------------- | + * | `4` | the length of the local | + * | ... | the local bytes | + * + * Some additional things to note about this layout: + * + * * The filepath can have a length of 0, in which case we'll consider it an + * empty string. + * * The line number should be 0-indexed. NOTE(review): the `line` field of + * pm_options_t is documented as 1-indexed; confirm which convention the + * serialized value follows. + * * The encoding can have a length of 0, in which case we'll use the default + * encoding (UTF-8). If it's not 0, it should correspond to a name of an + * encoding that can be passed to `Encoding.find` in Ruby. + * * The frozen string literal, encoding locked, main script, and partial script + * fields are booleans, so their values should be either 0 or 1. + * * The number of scopes can be 0. 
+ * + * @param options The options struct to deserialize into. + * @param data The binary string to deserialize from. 
+ */ +void pm_options_read(pm_options_t *options, const char *data); + +#endif diff --git a/prism/pack.c b/prism/pack.c new file mode 100644 index 0000000000..1388ca8a3b --- /dev/null +++ b/prism/pack.c @@ -0,0 +1,509 @@ +#include "prism/pack.h" + +// We optionally support parsing String#pack templates. For systems that don't +// want or need this functionality, it can be turned off with the +// PRISM_EXCLUDE_PACK define. +#ifdef PRISM_EXCLUDE_PACK + +void pm_pack_parse(void) {} + +#else + +#include <stdbool.h> +#include <errno.h> + +static uintmax_t +strtoumaxc(const char **format) { + uintmax_t value = 0; + while (**format >= '0' && **format <= '9') { + if (value > UINTMAX_MAX / 10) { + errno = ERANGE; + } + value = value * 10 + ((uintmax_t) (**format - '0')); + (*format)++; + } + return value; +} + +PRISM_EXPORTED_FUNCTION pm_pack_result +pm_pack_parse( + pm_pack_variant variant, + const char **format, + const char *format_end, + pm_pack_type *type, + pm_pack_signed *signed_type, + pm_pack_endian *endian, + pm_pack_size *size, + pm_pack_length_type *length_type, + uint64_t *length, + pm_pack_encoding *encoding +) { + if (*encoding == PM_PACK_ENCODING_START) { + *encoding = PM_PACK_ENCODING_US_ASCII; + } + + if (*format == format_end) { + *type = PM_PACK_END; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + *length_type = PM_PACK_LENGTH_NA; + return PM_PACK_OK; + } + + *length_type = PM_PACK_LENGTH_FIXED; + *length = 1; + bool length_changed_allowed = true; + + char directive = **format; + (*format)++; + switch (directive) { + case ' ': + case '\t': + case '\n': + case '\v': + case '\f': + case '\r': + *type = PM_PACK_SPACE; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + *length_type = PM_PACK_LENGTH_NA; + *length = 0; + return PM_PACK_OK; + case '#': + while ((*format < format_end) && (**format != '\n')) { + (*format)++; + } + *type = PM_PACK_COMMENT; + *signed_type = 
PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + *length_type = PM_PACK_LENGTH_NA; + *length = 0; + return PM_PACK_OK; + case 'C': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_UNSIGNED; + *endian = PM_PACK_AGNOSTIC_ENDIAN; + *size = PM_PACK_SIZE_8; + break; + case 'S': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_UNSIGNED; + *endian = PM_PACK_NATIVE_ENDIAN; + *size = PM_PACK_SIZE_16; + break; + case 'L': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_UNSIGNED; + *endian = PM_PACK_NATIVE_ENDIAN; + *size = PM_PACK_SIZE_32; + break; + case 'Q': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_UNSIGNED; + *endian = PM_PACK_NATIVE_ENDIAN; + *size = PM_PACK_SIZE_64; + break; + case 'J': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_UNSIGNED; + *endian = PM_PACK_NATIVE_ENDIAN; + *size = PM_PACK_SIZE_P; + break; + case 'c': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_SIGNED; + *endian = PM_PACK_AGNOSTIC_ENDIAN; + *size = PM_PACK_SIZE_8; + break; + case 's': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_SIGNED; + *endian = PM_PACK_NATIVE_ENDIAN; + *size = PM_PACK_SIZE_16; + break; + case 'l': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_SIGNED; + *endian = PM_PACK_NATIVE_ENDIAN; + *size = PM_PACK_SIZE_32; + break; + case 'q': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_SIGNED; + *endian = PM_PACK_NATIVE_ENDIAN; + *size = PM_PACK_SIZE_64; + break; + case 'j': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_SIGNED; + *endian = PM_PACK_NATIVE_ENDIAN; + *size = PM_PACK_SIZE_P; + break; + case 'I': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_UNSIGNED; + *endian = PM_PACK_NATIVE_ENDIAN; + *size = PM_PACK_SIZE_INT; + break; + case 'i': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_SIGNED; + *endian = PM_PACK_NATIVE_ENDIAN; + *size = PM_PACK_SIZE_INT; + break; + case 'n': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_UNSIGNED; + *endian = PM_PACK_BIG_ENDIAN; + *size = 
PM_PACK_SIZE_16; + length_changed_allowed = false; + break; + case 'N': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_UNSIGNED; + *endian = PM_PACK_BIG_ENDIAN; + *size = PM_PACK_SIZE_32; + length_changed_allowed = false; + break; + case 'v': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_UNSIGNED; + *endian = PM_PACK_LITTLE_ENDIAN; + *size = PM_PACK_SIZE_16; + length_changed_allowed = false; + break; + case 'V': + *type = PM_PACK_INTEGER; + *signed_type = PM_PACK_UNSIGNED; + *endian = PM_PACK_LITTLE_ENDIAN; + *size = PM_PACK_SIZE_32; + length_changed_allowed = false; + break; + case 'U': + *type = PM_PACK_UTF8; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'w': + *type = PM_PACK_BER; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'D': + case 'd': + *type = PM_PACK_FLOAT; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_NATIVE_ENDIAN; + *size = PM_PACK_SIZE_64; + break; + case 'F': + case 'f': + *type = PM_PACK_FLOAT; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_NATIVE_ENDIAN; + *size = PM_PACK_SIZE_32; + break; + case 'E': + *type = PM_PACK_FLOAT; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_LITTLE_ENDIAN; + *size = PM_PACK_SIZE_64; + break; + case 'e': + *type = PM_PACK_FLOAT; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_LITTLE_ENDIAN; + *size = PM_PACK_SIZE_32; + break; + case 'G': + *type = PM_PACK_FLOAT; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_BIG_ENDIAN; + *size = PM_PACK_SIZE_64; + break; + case 'g': + *type = PM_PACK_FLOAT; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_BIG_ENDIAN; + *size = PM_PACK_SIZE_32; + break; + case 'A': + *type = PM_PACK_STRING_SPACE_PADDED; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'a': + *type = PM_PACK_STRING_NULL_PADDED; + *signed_type = PM_PACK_SIGNED_NA; + 
*endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'Z': + *type = PM_PACK_STRING_NULL_TERMINATED; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'B': + *type = PM_PACK_STRING_MSB; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'b': + *type = PM_PACK_STRING_LSB; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'H': + *type = PM_PACK_STRING_HEX_HIGH; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'h': + *type = PM_PACK_STRING_HEX_LOW; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'u': + *type = PM_PACK_STRING_UU; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'M': + *type = PM_PACK_STRING_MIME; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'm': + *type = PM_PACK_STRING_BASE64; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'P': + *type = PM_PACK_STRING_FIXED; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'p': + *type = PM_PACK_STRING_POINTER; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case '@': + *type = PM_PACK_MOVE; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'X': + *type = PM_PACK_BACK; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case 'x': + *type = PM_PACK_NULL; + *signed_type = PM_PACK_SIGNED_NA; + *endian = PM_PACK_ENDIAN_NA; + *size = PM_PACK_SIZE_NA; + break; + case '%': + return 
PM_PACK_ERROR_UNSUPPORTED_DIRECTIVE; + default: + return PM_PACK_ERROR_UNKNOWN_DIRECTIVE; + } + + bool explicit_endian = false; + + while (*format < format_end) { + switch (**format) { + case '_': + case '!': + (*format)++; + if (*type != PM_PACK_INTEGER || !length_changed_allowed) { + return PM_PACK_ERROR_BANG_NOT_ALLOWED; + } + switch (*size) { + case PM_PACK_SIZE_SHORT: + case PM_PACK_SIZE_INT: + case PM_PACK_SIZE_LONG: + case PM_PACK_SIZE_LONG_LONG: + break; + case PM_PACK_SIZE_16: + *size = PM_PACK_SIZE_SHORT; + break; + case PM_PACK_SIZE_32: + *size = PM_PACK_SIZE_LONG; + break; + case PM_PACK_SIZE_64: + *size = PM_PACK_SIZE_LONG_LONG; + break; + case PM_PACK_SIZE_P: + break; + default: + return PM_PACK_ERROR_BANG_NOT_ALLOWED; + } + break; + case '<': + (*format)++; + if (explicit_endian) { + return PM_PACK_ERROR_DOUBLE_ENDIAN; + } + *endian = PM_PACK_LITTLE_ENDIAN; + explicit_endian = true; + break; + case '>': + (*format)++; + if (explicit_endian) { + return PM_PACK_ERROR_DOUBLE_ENDIAN; + } + *endian = PM_PACK_BIG_ENDIAN; + explicit_endian = true; + break; + default: + goto exit_modifier_loop; + } + } + +exit_modifier_loop: + + if (variant == PM_PACK_VARIANT_UNPACK && *type == PM_PACK_MOVE) { + *length = 0; + } + + if (*format < format_end) { + if (**format == '*') { + switch (*type) { + case PM_PACK_NULL: + case PM_PACK_BACK: + switch (variant) { + case PM_PACK_VARIANT_PACK: + *length_type = PM_PACK_LENGTH_FIXED; + break; + case PM_PACK_VARIANT_UNPACK: + *length_type = PM_PACK_LENGTH_MAX; + break; + } + *length = 0; + break; + + case PM_PACK_MOVE: + switch (variant) { + case PM_PACK_VARIANT_PACK: + *length_type = PM_PACK_LENGTH_FIXED; + break; + case PM_PACK_VARIANT_UNPACK: + *length_type = PM_PACK_LENGTH_RELATIVE; + break; + } + *length = 0; + break; + + case PM_PACK_STRING_UU: + *length_type = PM_PACK_LENGTH_FIXED; + *length = 0; + break; + + case PM_PACK_STRING_FIXED: + switch (variant) { + case PM_PACK_VARIANT_PACK: + *length_type = 
PM_PACK_LENGTH_FIXED; + *length = 1; + break; + case PM_PACK_VARIANT_UNPACK: + *length_type = PM_PACK_LENGTH_MAX; + *length = 0; + break; + } + break; + + case PM_PACK_STRING_MIME: + case PM_PACK_STRING_BASE64: + *length_type = PM_PACK_LENGTH_FIXED; + *length = 1; + break; + + default: + *length_type = PM_PACK_LENGTH_MAX; + *length = 0; + break; + } + + (*format)++; + } else if (**format >= '0' && **format <= '9') { + errno = 0; + *length_type = PM_PACK_LENGTH_FIXED; + #if UINTMAX_MAX < UINT64_MAX + #error "prism's design assumes uintmax_t is at least as large as uint64_t" + #endif + uintmax_t length_max = strtoumaxc(format); + if (errno || length_max > UINT64_MAX) { + return PM_PACK_ERROR_LENGTH_TOO_BIG; + } + *length = (uint64_t) length_max; + } + } + + switch (*type) { + case PM_PACK_UTF8: + /* if encoding is US-ASCII, upgrade to UTF-8 */ + if (*encoding == PM_PACK_ENCODING_US_ASCII) { + *encoding = PM_PACK_ENCODING_UTF_8; + } + break; + case PM_PACK_STRING_MIME: + case PM_PACK_STRING_BASE64: + case PM_PACK_STRING_UU: + /* keep US-ASCII (do nothing) */ + break; + default: + /* fall back to BINARY */ + *encoding = PM_PACK_ENCODING_ASCII_8BIT; + break; + } + + return PM_PACK_OK; +} + +PRISM_EXPORTED_FUNCTION size_t +pm_size_to_native(pm_pack_size size) { + switch (size) { + case PM_PACK_SIZE_SHORT: + return sizeof(short); + case PM_PACK_SIZE_INT: + return sizeof(int); + case PM_PACK_SIZE_LONG: + return sizeof(long); + case PM_PACK_SIZE_LONG_LONG: + return sizeof(long long); + case PM_PACK_SIZE_8: + return 1; + case PM_PACK_SIZE_16: + return 2; + case PM_PACK_SIZE_32: + return 4; + case PM_PACK_SIZE_64: + return 8; + case PM_PACK_SIZE_P: + return sizeof(void *); + default: + return 0; + } +} + +#endif diff --git a/prism/pack.h b/prism/pack.h new file mode 100644 index 0000000000..0b0b4b19cc --- /dev/null +++ b/prism/pack.h @@ -0,0 +1,163 @@ +/** + * @file pack.h + * + * A pack template string parser. 
+ */ +#ifndef PRISM_PACK_H +#define PRISM_PACK_H + +#include "prism/defines.h" + +// We optionally support parsing String#pack templates. For systems that don't +// want or need this functionality, it can be turned off with the +// PRISM_EXCLUDE_PACK define. +#ifdef PRISM_EXCLUDE_PACK + +void pm_pack_parse(void); + +#else + +#include <stdint.h> +#include <stdlib.h> + +/** The version of the pack template language that we are parsing. */ +typedef enum pm_pack_version { + PM_PACK_VERSION_3_2_0 +} pm_pack_version; + +/** The type of pack template we are parsing. */ +typedef enum pm_pack_variant { + PM_PACK_VARIANT_PACK, + PM_PACK_VARIANT_UNPACK +} pm_pack_variant; + +/** A directive within the pack template. */ +typedef enum pm_pack_type { + PM_PACK_SPACE, + PM_PACK_COMMENT, + PM_PACK_INTEGER, + PM_PACK_UTF8, + PM_PACK_BER, + PM_PACK_FLOAT, + PM_PACK_STRING_SPACE_PADDED, + PM_PACK_STRING_NULL_PADDED, + PM_PACK_STRING_NULL_TERMINATED, + PM_PACK_STRING_MSB, + PM_PACK_STRING_LSB, + PM_PACK_STRING_HEX_HIGH, + PM_PACK_STRING_HEX_LOW, + PM_PACK_STRING_UU, + PM_PACK_STRING_MIME, + PM_PACK_STRING_BASE64, + PM_PACK_STRING_FIXED, + PM_PACK_STRING_POINTER, + PM_PACK_MOVE, + PM_PACK_BACK, + PM_PACK_NULL, + PM_PACK_END +} pm_pack_type; + +/** The signedness of a pack directive. */ +typedef enum pm_pack_signed { + PM_PACK_UNSIGNED, + PM_PACK_SIGNED, + PM_PACK_SIGNED_NA +} pm_pack_signed; + +/** The endianness of a pack directive. */ +typedef enum pm_pack_endian { + PM_PACK_AGNOSTIC_ENDIAN, + PM_PACK_LITTLE_ENDIAN, // aka 'VAX', or 'V' + PM_PACK_BIG_ENDIAN, // aka 'network', or 'N' + PM_PACK_NATIVE_ENDIAN, + PM_PACK_ENDIAN_NA +} pm_pack_endian; + +/** The size of an integer pack directive. 
*/ +typedef enum pm_pack_size { + PM_PACK_SIZE_SHORT, + PM_PACK_SIZE_INT, + PM_PACK_SIZE_LONG, + PM_PACK_SIZE_LONG_LONG, + PM_PACK_SIZE_8, + PM_PACK_SIZE_16, + PM_PACK_SIZE_32, + PM_PACK_SIZE_64, + PM_PACK_SIZE_P, + PM_PACK_SIZE_NA +} pm_pack_size; + +/** The type of length of a pack directive. 
*/ +typedef enum pm_pack_length_type { + PM_PACK_LENGTH_FIXED, + PM_PACK_LENGTH_MAX, + PM_PACK_LENGTH_RELATIVE, // special case for unpack @* + PM_PACK_LENGTH_NA +} pm_pack_length_type; + +/** The type of encoding for a pack template string. */ +typedef enum pm_pack_encoding { + PM_PACK_ENCODING_START, + PM_PACK_ENCODING_ASCII_8BIT, + PM_PACK_ENCODING_US_ASCII, + PM_PACK_ENCODING_UTF_8 +} pm_pack_encoding; + +/** The result of parsing a pack template. */ +typedef enum pm_pack_result { + PM_PACK_OK, + PM_PACK_ERROR_UNSUPPORTED_DIRECTIVE, + PM_PACK_ERROR_UNKNOWN_DIRECTIVE, + PM_PACK_ERROR_LENGTH_TOO_BIG, + PM_PACK_ERROR_BANG_NOT_ALLOWED, + PM_PACK_ERROR_DOUBLE_ENDIAN +} pm_pack_result; + +/** + * Parse a single directive from a pack or unpack format string. + * + * @param variant (in) pack or unpack + * @param format (in, out) the start of the next directive to parse on calling, + * and advanced beyond the parsed directive on return, or as much of it as + * was consumed until an error was encountered + * @param format_end (in) the end of the format string + * @param type (out) the type of the directive + * @param signed_type (out) whether the value is signed + * @param endian (out) the endianness of the value + * @param size (out) the size of the value + * @param length_type (out) what kind of length is specified + * @param length (out) the length of the directive + * @param encoding (in, out) takes the current encoding of the string which + * would result from parsing the whole format string, and returns a possibly + * changed directive - the encoding should be `PM_PACK_ENCODING_START` when + * pm_pack_parse is called for the first directive in a format string + * + * @return `PM_PACK_OK` on success or `PM_PACK_ERROR_*` on error + * @note Consult Ruby documentation for the meaning of directives. 
+ */ +PRISM_EXPORTED_FUNCTION pm_pack_result +pm_pack_parse( + pm_pack_variant variant, + const char **format, + const char *format_end, + pm_pack_type *type, + pm_pack_signed *signed_type, + pm_pack_endian *endian, + pm_pack_size *size, + pm_pack_length_type *length_type, + uint64_t *length, + pm_pack_encoding *encoding +); + +/** + * Prism abstracts sizes away from the native system - this converts an abstract + * size to a native size. + * + * @param size The abstract size to convert. + * @return The native size. + */ +PRISM_EXPORTED_FUNCTION size_t pm_size_to_native(pm_pack_size size); + +#endif + +#endif diff --git a/prism/parser.h b/prism/parser.h new file mode 100644 index 0000000000..95d7aac710 --- /dev/null +++ b/prism/parser.h @@ -0,0 +1,936 @@ +/** + * @file parser.h + * + * The parser used to parse Ruby source. + */ +#ifndef PRISM_PARSER_H +#define PRISM_PARSER_H + +#include "prism/defines.h" +#include "prism/ast.h" +#include "prism/encoding.h" +#include "prism/options.h" +#include "prism/static_literals.h" +#include "prism/util/pm_constant_pool.h" +#include "prism/util/pm_list.h" +#include "prism/util/pm_newline_list.h" +#include "prism/util/pm_string.h" + +#include <stdbool.h> + +/** + * This enum provides various bits that represent different kinds of states that + * the lexer can track. This is used to determine which kind of token to return + * based on the context of the parser. + */ +typedef enum { + PM_LEX_STATE_BIT_BEG, + PM_LEX_STATE_BIT_END, + PM_LEX_STATE_BIT_ENDARG, + PM_LEX_STATE_BIT_ENDFN, + PM_LEX_STATE_BIT_ARG, + PM_LEX_STATE_BIT_CMDARG, + PM_LEX_STATE_BIT_MID, + PM_LEX_STATE_BIT_FNAME, + PM_LEX_STATE_BIT_DOT, + PM_LEX_STATE_BIT_CLASS, + PM_LEX_STATE_BIT_LABEL, + PM_LEX_STATE_BIT_LABELED, + PM_LEX_STATE_BIT_FITEM +} pm_lex_state_bit_t; + +/** + * This enum combines the various bits from the above enum into individual + * values that represent the various states of the lexer. 
+ */ +typedef enum { + PM_LEX_STATE_NONE = 0, + PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG), + PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END), + PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG), + PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN), + PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG), + PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG), + PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID), + PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME), + PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT), + PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS), + PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL), + PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED), + PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM), + PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS, + PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG, + PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN +} pm_lex_state_t; + +/** + * The type of quote that a heredoc uses. + */ +typedef enum { + PM_HEREDOC_QUOTE_NONE, + PM_HEREDOC_QUOTE_SINGLE = '\'', + PM_HEREDOC_QUOTE_DOUBLE = '"', + PM_HEREDOC_QUOTE_BACKTICK = '`', +} pm_heredoc_quote_t; + +/** + * The type of indentation that a heredoc uses. + */ +typedef enum { + PM_HEREDOC_INDENT_NONE, + PM_HEREDOC_INDENT_DASH, + PM_HEREDOC_INDENT_TILDE, +} pm_heredoc_indent_t; + +/** + * All of the information necessary to store to lexing a heredoc. + */ +typedef struct { + /** A pointer to the start of the heredoc identifier. */ + const uint8_t *ident_start; + + /** The length of the heredoc identifier. */ + size_t ident_length; + + /** The type of quote that the heredoc uses. */ + pm_heredoc_quote_t quote; + + /** The type of indentation that the heredoc uses. */ + pm_heredoc_indent_t indent; +} pm_heredoc_lex_mode_t; + +/** + * When lexing Ruby source, the lexer has a small amount of state to tell which + * kind of token it is currently lexing. 
 * For example, when we find the start of
 * a string, the first token that we return is a TOKEN_STRING_BEGIN token. After
 * that the lexer is now in the PM_LEX_STRING mode, and will return tokens that
 * are found as part of a string.
 */
typedef struct pm_lex_mode {
    /** The type of this lex mode. */
    enum {
        /** This state is used when any given token is being lexed. */
        PM_LEX_DEFAULT,

        /**
         * This state is used when we're lexing as normal but inside an embedded
         * expression of a string.
         */
        PM_LEX_EMBEXPR,

        /**
         * This state is used when we're lexing a variable that is embedded
         * directly inside of a string with the # shorthand.
         */
        PM_LEX_EMBVAR,

        /** This state is used when you are inside the content of a heredoc. */
        PM_LEX_HEREDOC,

        /**
         * This state is used when we are lexing a list of tokens, as in a %w
         * word list literal or a %i symbol list literal.
         */
        PM_LEX_LIST,

        /**
         * This state is used when a regular expression has been begun and we
         * are looking for the terminator.
         */
        PM_LEX_REGEXP,

        /**
         * This state is used when we are lexing a string or a string-like
         * token, as in string content with either quote or an xstring.
         */
        PM_LEX_STRING
    } mode;

    /** The data associated with this type of lex mode. */
    union {
        struct {
            /** This keeps track of the nesting level of the list. */
            size_t nesting;

            /** Whether or not interpolation is allowed in this list. */
            bool interpolation;

            /**
             * When lexing a list, it takes into account balancing the
             * terminator if the terminator is one of (), [], {}, or <>.
             */
            uint8_t incrementor;

            /** This is the terminator of the list literal. */
            uint8_t terminator;

            /**
             * This is the character set that should be used to delimit the
             * tokens within the list.
             *
             * The size matches what lex_mode_push_list in prism.c writes: 7
             * fixed bytes (backslash and six whitespace characters), up to 3
             * optional bytes (terminator, '#', incrementor), and a trailing
             * NUL.
             */
            uint8_t breakpoints[11];
        } list;

        struct {
            /**
             * This keeps track of the nesting level of the regular expression.
             */
            size_t nesting;

            /**
             * When lexing a regular expression, it takes into account balancing
             * the terminator if the terminator is one of (), [], {}, or <>.
             */
            uint8_t incrementor;

            /** This is the terminator of the regular expression. */
            uint8_t terminator;

            /**
             * This is the character set that should be used to delimit the
             * tokens within the regular expression.
             *
             * The size matches what lex_mode_push_regexp in prism.c writes: 4
             * fixed bytes, up to 2 optional bytes (terminator, incrementor),
             * and a trailing NUL.
             */
            uint8_t breakpoints[7];
        } regexp;

        struct {
            /** This keeps track of the nesting level of the string. */
            size_t nesting;

            /** Whether or not interpolation is allowed in this string. */
            bool interpolation;

            /**
             * Whether or not at the end of the string we should allow a :,
             * which would indicate this was a dynamic symbol instead of a
             * string.
             */
            bool label_allowed;

            /**
             * When lexing a string, it takes into account balancing the
             * terminator if the terminator is one of (), [], {}, or <>.
             */
            uint8_t incrementor;

            /**
             * This is the terminator of the string. It is typically either a
             * single or double quote.
             */
            uint8_t terminator;

            /**
             * This is the character set that should be used to delimit the
             * tokens within the string.
             *
             * The size matches what lex_mode_push_string in prism.c writes: 3
             * fixed bytes, up to 3 optional bytes (terminator, '#',
             * incrementor), and a trailing NUL.
             */
            uint8_t breakpoints[7];
        } string;

        struct {
            /**
             * All of the data necessary to lex a heredoc.
             */
            pm_heredoc_lex_mode_t base;

            /**
             * This is the pointer to the character where lexing should resume
             * once the heredoc has been completely processed.
             */
            const uint8_t *next_start;

            /**
             * This is used to track the amount of common whitespace on each
             * line so that we know how much to dedent each line in the case of
             * a tilde heredoc.
             */
            size_t *common_whitespace;

            /** True if the previous token ended with a line continuation. */
            bool line_continuation;
        } heredoc;
    } as;

    /** The previous lex state so that it knows how to pop. */
    struct pm_lex_mode *prev;
} pm_lex_mode_t;

/**
 * We pre-allocate a certain number of lex states in order to avoid having to
 * call malloc too many times while parsing. You really shouldn't need more than
 * this because you only really nest deeply when doing string interpolation.
 *
 * This is the length of the lex_modes.stack array embedded in the parser.
 */
#define PM_LEX_STACK_SIZE 4

/**
 * The parser used to parse Ruby source.
 */
typedef struct pm_parser pm_parser_t;

/**
 * While parsing, we keep track of a stack of contexts. This is helpful for
 * error recovery so that we can pop back to a previous context when we hit a
 * token that is understood by a parent context but not by the current context.
 */
typedef enum {
    /** a null context, used for returning a value from a function */
    PM_CONTEXT_NONE = 0,

    /** a begin statement */
    PM_CONTEXT_BEGIN,

    /** an ensure statement with an explicit begin */
    PM_CONTEXT_BEGIN_ENSURE,

    /** a rescue else statement with an explicit begin */
    PM_CONTEXT_BEGIN_ELSE,

    /** a rescue statement with an explicit begin */
    PM_CONTEXT_BEGIN_RESCUE,

    /** expressions in block arguments using braces */
    PM_CONTEXT_BLOCK_BRACES,

    /** expressions in block arguments using do..end */
    PM_CONTEXT_BLOCK_KEYWORDS,

    /** an ensure statement within a do..end block */
    PM_CONTEXT_BLOCK_ENSURE,

    /** a rescue else statement within a do..end block */
    PM_CONTEXT_BLOCK_ELSE,

    /** expressions in block parameters `foo do |...| end ` */
    PM_CONTEXT_BLOCK_PARAMETERS,

    /** a rescue statement within a do..end block */
    PM_CONTEXT_BLOCK_RESCUE,

    /** a case when statement */
    PM_CONTEXT_CASE_WHEN,

    /** a case in statement */
    PM_CONTEXT_CASE_IN,

    /** a class declaration */
    PM_CONTEXT_CLASS,

    /** an ensure statement within a class statement */
    PM_CONTEXT_CLASS_ENSURE,

    /** a rescue else statement within a class statement */
    PM_CONTEXT_CLASS_ELSE,

    /** a rescue statement within a class statement */
    PM_CONTEXT_CLASS_RESCUE,

    /** a method definition */
    PM_CONTEXT_DEF,

    /** an ensure statement within a method definition */
    PM_CONTEXT_DEF_ENSURE,

    /** a rescue else statement within a method definition */
    PM_CONTEXT_DEF_ELSE,

    /** a rescue statement within a method definition */
    PM_CONTEXT_DEF_RESCUE,

    /** a method definition's parameters */
    PM_CONTEXT_DEF_PARAMS,

    /** a defined? expression */
    PM_CONTEXT_DEFINED,

    /** a method definition's default parameter */
    PM_CONTEXT_DEFAULT_PARAMS,

    /** an else clause */
    PM_CONTEXT_ELSE,

    /** an elsif clause */
    PM_CONTEXT_ELSIF,

    /** an interpolated expression */
    PM_CONTEXT_EMBEXPR,

    /** a for loop */
    PM_CONTEXT_FOR,

    /** a for loop's index */
    PM_CONTEXT_FOR_INDEX,

    /** an if statement */
    PM_CONTEXT_IF,

    /** a lambda expression with braces */
    PM_CONTEXT_LAMBDA_BRACES,

    /** a lambda expression with do..end */
    PM_CONTEXT_LAMBDA_DO_END,

    /** an ensure statement within a lambda expression */
    PM_CONTEXT_LAMBDA_ENSURE,

    /** a rescue else statement within a lambda expression */
    PM_CONTEXT_LAMBDA_ELSE,

    /** a rescue statement within a lambda expression */
    PM_CONTEXT_LAMBDA_RESCUE,

    /** the predicate clause of a loop statement */
    PM_CONTEXT_LOOP_PREDICATE,

    /** the top level context */
    PM_CONTEXT_MAIN,

    /** a module declaration */
    PM_CONTEXT_MODULE,

    /** an ensure statement within a module statement */
    PM_CONTEXT_MODULE_ENSURE,

    /** a rescue else statement within a module statement */
    PM_CONTEXT_MODULE_ELSE,

    /** a rescue statement within a module statement */
    PM_CONTEXT_MODULE_RESCUE,

    /** a multiple target expression */
    PM_CONTEXT_MULTI_TARGET,

    /** a parenthesized expression */
    PM_CONTEXT_PARENS,

    /** an END block */
    PM_CONTEXT_POSTEXE,

    /** a predicate inside an if/elsif/unless statement */
    PM_CONTEXT_PREDICATE,

    /** a BEGIN block */
    PM_CONTEXT_PREEXE,

    /** a modifier rescue clause */
    PM_CONTEXT_RESCUE_MODIFIER,

    /** a singleton class definition */
    PM_CONTEXT_SCLASS,

    /** an ensure statement with a singleton class */
    PM_CONTEXT_SCLASS_ENSURE,

    /** a rescue else statement with a singleton class */
    PM_CONTEXT_SCLASS_ELSE,

    /** a rescue statement with a singleton class */
    PM_CONTEXT_SCLASS_RESCUE,

    /** a ternary expression */
    PM_CONTEXT_TERNARY,

    /** an unless statement */
    PM_CONTEXT_UNLESS,

    /** an until statement */
    PM_CONTEXT_UNTIL,

    /** a while statement */
    PM_CONTEXT_WHILE,
} pm_context_t;

/** This is a node in a linked list of contexts. */
typedef struct pm_context_node {
    /** The context that this node represents. */
    pm_context_t context;

    /** A pointer to the previous context in the linked list. */
    struct pm_context_node *prev;
} pm_context_node_t;

/** This is the type of a comment that we've found while parsing. */
typedef enum {
    PM_COMMENT_INLINE,
    PM_COMMENT_EMBDOC
} pm_comment_type_t;

/**
 * This is a node in the linked list of comments that we've found while parsing.
 *
 * @extends pm_list_node_t
 */
typedef struct pm_comment {
    /** The embedded base node. */
    pm_list_node_t node;

    /** The location of the comment in the source. */
    pm_location_t location;

    /** The type of comment that we've found. */
    pm_comment_type_t type;
} pm_comment_t;

/**
 * This is a node in the linked list of magic comments that we've found while
 * parsing.
 *
 * @extends pm_list_node_t
 */
typedef struct {
    /** The embedded base node. */
    pm_list_node_t node;

    /** A pointer to the start of the key in the source. */
    const uint8_t *key_start;

    /** A pointer to the start of the value in the source. */
    const uint8_t *value_start;

    /** The length of the key in the source. */
    uint32_t key_length;

    /** The length of the value in the source. */
    uint32_t value_length;
} pm_magic_comment_t;

/**
 * When the encoding that is being used to parse the source is changed by prism,
 * we provide the ability here to call out to a user-defined function.
 */
typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser);

/**
 * When you are lexing through a file, the lexer needs all of the information
 * that the parser additionally provides (for example, the local table). So if
 * you want to properly lex Ruby, you need to actually lex it in the context of
 * the parser. In order to provide this functionality, we optionally allow a
 * struct to be attached to the parser that calls back out to a user-provided
 * callback when each token is lexed.
 */
typedef struct {
    /**
     * This opaque pointer is used to provide whatever information the user
     * deemed necessary to the callback. In our case we use it to pass the array
     * that the tokens get appended into.
     */
    void *data;

    /**
     * This is the callback that is called when a token is lexed. It is passed
     * the opaque data pointer, the parser, and the token that was lexed.
     */
    void (*callback)(void *data, pm_parser_t *parser, pm_token_t *token);
} pm_lex_callback_t;

/** The type of shareable constant value that can be set. */
typedef uint8_t pm_shareable_constant_value_t;
static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_NONE = 0x0;
static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_LITERAL = PM_SHAREABLE_CONSTANT_NODE_FLAGS_LITERAL;
static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_EVERYTHING = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_EVERYTHING;
static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_COPY = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_COPY;

/**
 * This tracks an individual local variable in a certain lexical context, as
 * well as the number of times it is read.
 */
typedef struct {
    /** The name of the local variable. */
    pm_constant_id_t name;

    /** The location of the local variable in the source. */
    pm_location_t location;

    /** The index of the local variable in the local table. */
    uint32_t index;

    /** The number of times the local variable is read. */
    uint32_t reads;

    /** The hash of the local variable. */
    uint32_t hash;
} pm_local_t;

/**
 * This is a set of local variables in a certain lexical context (method, class,
 * module, etc.). We need to track how many times these variables are read in
 * order to warn if they only get written.
 */
typedef struct pm_locals {
    /** The number of local variables in the set. */
    uint32_t size;

    /** The capacity of the local variables set. */
    uint32_t capacity;

    /** The nullable allocated memory for the local variables in the set. */
    pm_local_t *locals;
} pm_locals_t;

/** The flags about scope parameters that can be set. */
typedef uint8_t pm_scope_parameters_t;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NONE = 0x0;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_POSITIONALS = 0x1;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_KEYWORDS = 0x2;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_BLOCK = 0x4;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_ALL = 0x8;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED = 0x10;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_INNER = 0x20;
static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_FOUND = 0x40;

/**
 * This struct represents a node in a linked list of scopes. Some scopes can see
 * into their parent scopes, while others cannot.
 */
typedef struct pm_scope {
    /** A pointer to the previous scope in the linked list. */
    struct pm_scope *previous;

    /** The IDs of the locals in the given scope. */
    pm_locals_t locals;

    /**
     * This is a list of the implicit parameters contained within the block.
     * These will be processed after the block is parsed to determine the kind
     * of parameters node that should be used and to check if any errors need to
     * be added.
     */
    pm_node_list_t implicit_parameters;

    /**
     * This is a bitfield that indicates the parameters that are being used in
     * this scope. It is a combination of the PM_SCOPE_PARAMETERS_* constants.
     * There are three different kinds of parameters that can be used in a
     * scope:
     *
     * - Ordinary parameters (e.g., def foo(bar); end)
     * - Numbered parameters (e.g., def foo; _1; end)
     * - The it parameter (e.g., def foo; it; end)
     *
     * If ordinary parameters are being used, then certain parameters can be
     * forwarded to another method/structure. Those are indicated by four
     * additional bits in the params field. For example, some combinations of:
     *
     * - def foo(*); end
     * - def foo(**); end
     * - def foo(&); end
     * - def foo(...); end
     */
    pm_scope_parameters_t parameters;

    /**
     * The current state of constant shareability for this scope. This is
     * changed by magic shareable_constant_value comments.
     */
    pm_shareable_constant_value_t shareable_constant;

    /**
     * A boolean indicating whether or not this scope can see into its parent.
     * If closed is true, then the scope cannot see into its parent.
     */
    bool closed;
} pm_scope_t;

/**
 * A stack of boolean values, represented as a 32-bit unsigned integer.
 */
typedef uint32_t pm_state_stack_t;

/**
 * This struct represents the overall parser. It contains a reference to the
 * source file, as well as pointers that indicate where in the source it's
 * currently parsing. It also contains the most recent and current token that
 * it's considering.
 */
struct pm_parser {
    /**
     * The next node identifier that will be assigned. This is a unique
     * identifier used to track nodes such that the syntax tree can be dropped
     * but the node can be found through another parse.
     */
    uint32_t node_id;

    /** The current state of the lexer. */
    pm_lex_state_t lex_state;

    /** Tracks the current nesting of (), [], and {}. */
    int enclosure_nesting;

    /**
     * Used to temporarily track the nesting of enclosures to determine if a {
     * is the beginning of a lambda following the parameters of a lambda.
     */
    int lambda_enclosure_nesting;

    /**
     * Used to track the nesting of braces to ensure we get the correct value
     * when we are interpolating blocks with braces.
     */
    int brace_nesting;

    /**
     * The stack used to determine if a do keyword belongs to the predicate of a
     * while, until, or for loop.
     */
    pm_state_stack_t do_loop_stack;

    /**
     * The stack used to determine if a do keyword belongs to the beginning of a
     * block.
     */
    pm_state_stack_t accepts_block_stack;

    /** A stack of lex modes. */
    struct {
        /** The current mode of the lexer. */
        pm_lex_mode_t *current;

        /** The stack of lexer modes. */
        pm_lex_mode_t stack[PM_LEX_STACK_SIZE];

        /** The current index into the lexer mode stack. */
        size_t index;
    } lex_modes;

    /** The pointer to the start of the source. */
    const uint8_t *start;

    /** The pointer to the end of the source. */
    const uint8_t *end;

    /** The previous token we were considering. */
    pm_token_t previous;

    /** The current token we're considering. */
    pm_token_t current;

    /**
     * This is a special field set on the parser when we need the parser to jump
     * to a specific location when lexing the next token, as opposed to just
     * using the end of the previous token. Normally this is NULL.
     */
    const uint8_t *next_start;

    /**
     * This field indicates the end of a heredoc whose identifier was found on
     * the current line. If another heredoc is found on the same line, then this
     * will be moved forward to the end of that heredoc. If no heredocs are
     * found on a line then this is NULL.
     */
    const uint8_t *heredoc_end;

    /** The list of comments that have been found while parsing. */
    pm_list_t comment_list;

    /** The list of magic comments that have been found while parsing. */
    pm_list_t magic_comment_list;

    /**
     * An optional location that represents the location of the __END__ marker
     * and the rest of the content of the file. This content is loaded into the
     * DATA constant when the file being parsed is the main file being executed.
     */
    pm_location_t data_loc;

    /** The list of warnings that have been found while parsing. */
    pm_list_t warning_list;

    /** The list of errors that have been found while parsing. */
    pm_list_t error_list;

    /** The current local scope. */
    pm_scope_t *current_scope;

    /** The current parsing context. */
    pm_context_node_t *current_context;

    /**
     * The hash keys for the hash that is currently being parsed. This is not
     * usually necessary because it can pass it down the various call chains,
     * but in the event that you're parsing a hash that is being directly
     * pushed into another hash with **, we need to share the hash keys so that
     * we can warn for the nested hash as well.
     */
    pm_static_literals_t *current_hash_keys;

    /**
     * The encoding functions for the current file are attached to the parser as
     * it's parsing so that it can change with a magic comment.
     */
    const pm_encoding_t *encoding;

    /**
     * When the encoding that is being used to parse the source is changed by
     * prism, we provide the ability here to call out to a user-defined
     * function.
     */
    pm_encoding_changed_callback_t encoding_changed_callback;

    /**
     * This pointer indicates where a comment must start if it is to be
     * considered an encoding comment.
     */
    const uint8_t *encoding_comment_start;

    /**
     * This is an optional callback that can be attached to the parser that will
     * be called whenever a new token is lexed by the parser.
     */
    pm_lex_callback_t *lex_callback;

    /**
     * This is the path of the file being parsed. We use the filepath when
     * constructing SourceFileNodes.
     */
    pm_string_t filepath;

    /**
     * This constant pool keeps all of the constants defined throughout the file
     * so that we can reference them later.
     */
    pm_constant_pool_t constant_pool;

    /** This is the list of newline offsets in the source file. */
    pm_newline_list_t newline_list;

    /**
     * We want to add a flag to integer nodes that indicates their base. We only
     * want to parse these once, but we don't have space on the token itself to
     * communicate this information. So we store it here and pass it through
     * when we find tokens that we need it for.
     */
    pm_node_flags_t integer_base;

    /**
     * This string is used to pass information from the lexer to the parser. It
     * is particularly necessary because of escape sequences.
     */
    pm_string_t current_string;

    /**
     * The line number at the start of the parse. This will be used to offset
     * the line numbers of all of the locations.
     */
    int32_t start_line;

    /**
     * When a string-like expression is being lexed, any byte or escape sequence
     * that resolves to a value whose top bit is set (i.e., >= 0x80) will
     * explicitly set the encoding to the same encoding as the source.
     * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that
     * resolves to a value whose top bit is set, then the encoding will be
     * explicitly set to UTF-8.
     *
     * The _next_ time this happens, if the encoding that is about to become the
     * explicitly set encoding does not match the previously set explicit
     * encoding, a mixed encoding error will be emitted.
     *
     * When the expression is finished being lexed, the explicit encoding
     * controls the encoding of the expression. For the most part this means
     * that the expression will either be encoded in the source encoding or
     * UTF-8. This holds for all encodings except US-ASCII. If the source is
     * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the
     * expression will be encoded as ASCII-8BIT.
     *
     * Note that if the expression is a list, different elements within the same
     * list can have different encodings, so this will get reset between each
     * element. Furthermore all of this only applies to lists that support
     * interpolation, because otherwise escapes that could change the encoding
     * are ignored.
     *
     * At first glance, it may make more sense for this to live on the lexer
     * mode, but we need it here to communicate back to the parser for character
     * literals that do not push a new lexer mode.
     */
    const pm_encoding_t *explicit_encoding;

    /**
     * When parsing block exits (e.g., break, next, redo), we need to validate
     * that they are in correct contexts. For the most part we can do this by
     * looking at our parent contexts. However, modifier while and until
     * expressions can change that context to make block exits valid. In these
     * cases, we need to keep track of the block exits and then validate them
     * after the expression has been parsed.
     *
     * We use a pointer here because we don't want to keep a whole list attached
     * since this will only be used in the context of begin/end expressions.
     */
    pm_node_list_t *current_block_exits;

    /** The version of prism that we should use to parse. */
    pm_options_version_t version;

    /** The command line flags given from the options. */
    uint8_t command_line;

    /**
     * Whether or not we have found a frozen_string_literal magic comment with
     * a true or false value.
     * May be:
     *  - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED
     *  - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED
     *  - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET
     */
    int8_t frozen_string_literal;

    /**
     * Whether or not we are parsing an eval string. This impacts whether or not
     * we should evaluate if block exits/yields are valid.
     */
    bool parsing_eval;

    /**
     * Whether or not we are parsing a "partial" script, which is a script that
     * will be evaluated in the context of another script, so we should not
     * check jumps (next/break/etc.) for validity.
     */
    bool partial_script;

    /** Whether or not we're at the beginning of a command. */
    bool command_start;

    /** Whether or not we're currently recovering from a syntax error. */
    bool recovering;

    /**
     * This is very specialized behavior for when you want to parse in a context
     * that does not respect encoding comments. Its main use case is translating
     * into the whitequark/parser AST which re-encodes source files in UTF-8
     * before they are parsed and ignores encoding comments.
     */
    bool encoding_locked;

    /**
     * Whether or not the encoding has been changed by a magic comment. We use
     * this to provide a fast path for the lexer instead of going through the
     * function pointer.
     */
    bool encoding_changed;

    /**
     * This flag indicates that we are currently parsing a pattern matching
     * expression and impacts the calculation of newlines.
     */
    bool pattern_matching_newlines;

    /** This flag indicates that we are currently parsing a keyword argument. */
    bool in_keyword_arg;

    /**
     * Whether or not the parser has seen a token that has semantic meaning
     * (i.e., a token that is not a comment or whitespace).
     */
    bool semantic_token_seen;

    /**
     * True if the current regular expression being lexed contains only ASCII
     * characters.
     */
    bool current_regular_expression_ascii_only;

    /**
     * By default, Ruby always warns about mismatched indentation. This can be
     * toggled with a magic comment.
     */
    bool warn_mismatched_indentation;
};

#endif
diff --git a/prism/prettyprint.h b/prism/prettyprint.h
new file mode 100644
index 0000000000..5a52b2b6b8
--- /dev/null
+++ b/prism/prettyprint.h
@@ -0,0 +1,34 @@
/**
 * @file prettyprint.h
 *
 * An AST node pretty-printer.
 */
#ifndef PRISM_PRETTYPRINT_H
#define PRISM_PRETTYPRINT_H

#include "prism/defines.h"

#ifdef PRISM_EXCLUDE_PRETTYPRINT

void pm_prettyprint(void);

#else

#include <stdio.h>

#include "prism/ast.h"
#include "prism/parser.h"
#include "prism/util/pm_buffer.h"

/**
 * Pretty-prints the AST represented by the given node to the given buffer.
 *
 * @param output_buffer The buffer to write the pretty-printed AST to.
 * @param parser The parser that parsed the AST.
 * @param node The root node of the AST to pretty-print.
 */
PRISM_EXPORTED_FUNCTION void pm_prettyprint(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_node_t *node);

#endif

#endif
diff --git a/prism/prism.c b/prism/prism.c
new file mode 100644
index 0000000000..b158e505b2
--- /dev/null
+++ b/prism/prism.c
@@ -0,0 +1,22679 @@
#include "prism.h"

/**
 * The prism version and the serialization format.
 *
 * @return The PRISM_VERSION string.
 */
const char *
pm_version(void) {
    return PRISM_VERSION;
}

/**
 * In heredocs, tabs automatically complete up to the next 8 spaces. This is
 * defined in CRuby as TAB_WIDTH.
 */
#define PM_TAB_WHITESPACE_SIZE 8

// Macros for min/max.
+#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) + +/******************************************************************************/ +/* Helpful AST-related macros */ +/******************************************************************************/ + +#define FL PM_NODE_FLAGS +#define UP PM_NODE_UPCAST + +#define PM_TOKEN_START(token_) ((token_)->start) +#define PM_TOKEN_END(token_) ((token_)->end) + +#define PM_NODE_START(node_) (UP(node_)->location.start) +#define PM_NODE_END(node_) (UP(node_)->location.end) + +#define PM_LOCATION_NULL_VALUE(parser_) ((pm_location_t) { .start = (parser_)->start, .end = (parser_)->start }) +#define PM_LOCATION_TOKEN_VALUE(token_) ((pm_location_t) { .start = PM_TOKEN_START(token_), .end = PM_TOKEN_END(token_) }) +#define PM_LOCATION_NODE_VALUE(node_) ((pm_location_t) { .start = PM_NODE_START(node_), .end = PM_NODE_END(node_) }) +#define PM_OPTIONAL_LOCATION_TOKEN_VALUE(token) ((token)->type == PM_TOKEN_NOT_PROVIDED ? ((pm_location_t) { 0 }) : PM_LOCATION_TOKEN_VALUE(token)) + +/******************************************************************************/ +/* Lex mode manipulations */ +/******************************************************************************/ + +/** + * Returns the incrementor character that should be used to increment the + * nesting count if one is possible. + */ +static inline uint8_t +lex_mode_incrementor(const uint8_t start) { + switch (start) { + case '(': + case '[': + case '{': + case '<': + return start; + default: + return '\0'; + } +} + +/** + * Returns the matching character that should be used to terminate a list + * beginning with the given character. + */ +static inline uint8_t +lex_mode_terminator(const uint8_t start) { + switch (start) { + case '(': + return ')'; + case '[': + return ']'; + case '{': + return '}'; + case '<': + return '>'; + default: + return start; + } +} + +/** + * Push a new lex state onto the stack. 
If we're still within the pre-allocated + * space of the lex state stack, then we'll just use a new slot. Otherwise we'll + * allocate a new pointer and use that. + */ +static bool +lex_mode_push(pm_parser_t *parser, pm_lex_mode_t lex_mode) { + lex_mode.prev = parser->lex_modes.current; + parser->lex_modes.index++; + + if (parser->lex_modes.index > PM_LEX_STACK_SIZE - 1) { + parser->lex_modes.current = (pm_lex_mode_t *) xmalloc(sizeof(pm_lex_mode_t)); + if (parser->lex_modes.current == NULL) return false; + + *parser->lex_modes.current = lex_mode; + } else { + parser->lex_modes.stack[parser->lex_modes.index] = lex_mode; + parser->lex_modes.current = &parser->lex_modes.stack[parser->lex_modes.index]; + } + + return true; +} + +/** + * Push on a new list lex mode. + */ +static inline bool +lex_mode_push_list(pm_parser_t *parser, bool interpolation, uint8_t delimiter) { + uint8_t incrementor = lex_mode_incrementor(delimiter); + uint8_t terminator = lex_mode_terminator(delimiter); + + pm_lex_mode_t lex_mode = { + .mode = PM_LEX_LIST, + .as.list = { + .nesting = 0, + .interpolation = interpolation, + .incrementor = incrementor, + .terminator = terminator + } + }; + + // These are the places where we need to split up the content of the list. + // We'll use strpbrk to find the first of these characters. + uint8_t *breakpoints = lex_mode.as.list.breakpoints; + memcpy(breakpoints, "\\ \t\f\r\v\n\0\0\0", sizeof(lex_mode.as.list.breakpoints)); + size_t index = 7; + + // Now we'll add the terminator to the list of breakpoints. If the + // terminator is not already a NULL byte, add it to the list. + if (terminator != '\0') { + breakpoints[index++] = terminator; + } + + // If interpolation is allowed, then we're going to check for the # + // character. Otherwise we'll only look for escapes and the terminator. + if (interpolation) { + breakpoints[index++] = '#'; + } + + // If there is an incrementor, then we'll check for that as well. 
+ if (incrementor != '\0') { + breakpoints[index++] = incrementor; + } + + parser->explicit_encoding = NULL; + return lex_mode_push(parser, lex_mode); +} + +/** + * Push on a new list lex mode that is only used for compatibility. This is + * called when we're at the end of the file. We want the parser to be able to + * perform its normal error tolerance. + */ +static inline bool +lex_mode_push_list_eof(pm_parser_t *parser) { + return lex_mode_push_list(parser, false, '\0'); +} + +/** + * Push on a new regexp lex mode. + */ +static inline bool +lex_mode_push_regexp(pm_parser_t *parser, uint8_t incrementor, uint8_t terminator) { + pm_lex_mode_t lex_mode = { + .mode = PM_LEX_REGEXP, + .as.regexp = { + .nesting = 0, + .incrementor = incrementor, + .terminator = terminator + } + }; + + // These are the places where we need to split up the content of the + // regular expression. We'll use strpbrk to find the first of these + // characters. + uint8_t *breakpoints = lex_mode.as.regexp.breakpoints; + memcpy(breakpoints, "\r\n\\#\0\0", sizeof(lex_mode.as.regexp.breakpoints)); + size_t index = 4; + + // First we'll add the terminator. + if (terminator != '\0') { + breakpoints[index++] = terminator; + } + + // Next, if there is an incrementor, then we'll check for that as well. + if (incrementor != '\0') { + breakpoints[index++] = incrementor; + } + + parser->explicit_encoding = NULL; + return lex_mode_push(parser, lex_mode); +} + +/** + * Push on a new string lex mode. + */ +static inline bool +lex_mode_push_string(pm_parser_t *parser, bool interpolation, bool label_allowed, uint8_t incrementor, uint8_t terminator) { + pm_lex_mode_t lex_mode = { + .mode = PM_LEX_STRING, + .as.string = { + .nesting = 0, + .interpolation = interpolation, + .label_allowed = label_allowed, + .incrementor = incrementor, + .terminator = terminator + } + }; + + // These are the places where we need to split up the content of the + // string. We'll use strpbrk to find the first of these characters. 
+ uint8_t *breakpoints = lex_mode.as.string.breakpoints; + memcpy(breakpoints, "\r\n\\\0\0\0", sizeof(lex_mode.as.string.breakpoints)); + size_t index = 3; + + // Now add in the terminator. If the terminator is not already a NULL byte, + // then we'll add it. + if (terminator != '\0') { + breakpoints[index++] = terminator; + } + + // If interpolation is allowed, then we're going to check for the # + // character. Otherwise we'll only look for escapes and the terminator. + if (interpolation) { + breakpoints[index++] = '#'; + } + + // If we have an incrementor, then we'll add that in as a breakpoint as + // well. + if (incrementor != '\0') { + breakpoints[index++] = incrementor; + } + + parser->explicit_encoding = NULL; + return lex_mode_push(parser, lex_mode); +} + +/** + * Push on a new string lex mode that is only used for compatibility. This is + * called when we're at the end of the file. We want the parser to be able to + * perform its normal error tolerance. + */ +static inline bool +lex_mode_push_string_eof(pm_parser_t *parser) { + return lex_mode_push_string(parser, false, false, '\0', '\0'); +} + +/** + * Pop the current lex state off the stack. If we're within the pre-allocated + * space of the lex state stack, then we'll just decrement the index. Otherwise + * we'll free the current pointer and use the previous pointer. If the index is + * already 0, nothing is popped and the current mode is reset to + * PM_LEX_DEFAULT. + */ +static void +lex_mode_pop(pm_parser_t *parser) { + if (parser->lex_modes.index == 0) { + parser->lex_modes.current->mode = PM_LEX_DEFAULT; + } else if (parser->lex_modes.index < PM_LEX_STACK_SIZE) { + parser->lex_modes.index--; + parser->lex_modes.current = &parser->lex_modes.stack[parser->lex_modes.index]; + } else { + parser->lex_modes.index--; + pm_lex_mode_t *prev = parser->lex_modes.current->prev; + xfree(parser->lex_modes.current); + parser->lex_modes.current = prev; + } +} + +/** + * This is the equivalent of IS_lex_state in CRuby. 
+ */ +static inline bool +lex_state_p(const pm_parser_t *parser, pm_lex_state_t state) { + return parser->lex_state & state; +} + +typedef enum { + PM_IGNORED_NEWLINE_NONE = 0, + PM_IGNORED_NEWLINE_ALL, + PM_IGNORED_NEWLINE_PATTERN +} pm_ignored_newline_type_t; + +static inline pm_ignored_newline_type_t +lex_state_ignored_p(pm_parser_t *parser) { + bool ignored = lex_state_p(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_CLASS | PM_LEX_STATE_FNAME | PM_LEX_STATE_DOT) && !lex_state_p(parser, PM_LEX_STATE_LABELED); + + if (ignored) { + return PM_IGNORED_NEWLINE_ALL; + } else if ((parser->lex_state & ~((unsigned int) PM_LEX_STATE_LABEL)) == (PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED)) { + return PM_IGNORED_NEWLINE_PATTERN; + } else { + return PM_IGNORED_NEWLINE_NONE; + } +} + +static inline bool +lex_state_beg_p(pm_parser_t *parser) { + return lex_state_p(parser, PM_LEX_STATE_BEG_ANY) || ((parser->lex_state & (PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED)) == (PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED)); +} + +static inline bool +lex_state_arg_p(pm_parser_t *parser) { + return lex_state_p(parser, PM_LEX_STATE_ARG_ANY); +} + +static inline bool +lex_state_spcarg_p(pm_parser_t *parser, bool space_seen) { + if (parser->current.end >= parser->end) { + return false; + } + return lex_state_arg_p(parser) && space_seen && !pm_char_is_whitespace(*parser->current.end); +} + +static inline bool +lex_state_end_p(pm_parser_t *parser) { + return lex_state_p(parser, PM_LEX_STATE_END_ANY); +} + +/** + * This is the equivalent of IS_AFTER_OPERATOR in CRuby. + */ +static inline bool +lex_state_operator_p(pm_parser_t *parser) { + return lex_state_p(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_DOT); +} + +/** + * Set the state of the lexer. This is defined as a function to be able to put a + * breakpoint in it. 
+ */ +static inline void +lex_state_set(pm_parser_t *parser, pm_lex_state_t state) { + parser->lex_state = state; +} + +#ifndef PM_DEBUG_LOGGING +/** + * Debugging logging will print additional information to stderr whenever the + * lexer state changes. + */ +#define PM_DEBUG_LOGGING 0 +#endif + +#if PM_DEBUG_LOGGING +PRISM_ATTRIBUTE_UNUSED static void +debug_state(pm_parser_t *parser) { + fprintf(stderr, "STATE: "); + bool first = true; + + if (parser->lex_state == PM_LEX_STATE_NONE) { + fprintf(stderr, "NONE\n"); + return; + } + +#define CHECK_STATE(state) \ + if (parser->lex_state & state) { \ + if (!first) fprintf(stderr, "|"); \ + fprintf(stderr, "%s", #state); \ + first = false; \ + } + + CHECK_STATE(PM_LEX_STATE_BEG) + CHECK_STATE(PM_LEX_STATE_END) + CHECK_STATE(PM_LEX_STATE_ENDARG) + CHECK_STATE(PM_LEX_STATE_ENDFN) + CHECK_STATE(PM_LEX_STATE_ARG) + CHECK_STATE(PM_LEX_STATE_CMDARG) + CHECK_STATE(PM_LEX_STATE_MID) + CHECK_STATE(PM_LEX_STATE_FNAME) + CHECK_STATE(PM_LEX_STATE_DOT) + CHECK_STATE(PM_LEX_STATE_CLASS) + CHECK_STATE(PM_LEX_STATE_LABEL) + CHECK_STATE(PM_LEX_STATE_LABELED) + CHECK_STATE(PM_LEX_STATE_FITEM) + +#undef CHECK_STATE + + fprintf(stderr, "\n"); +} + +static void +debug_lex_state_set(pm_parser_t *parser, pm_lex_state_t state, char const * caller_name, int line_number) { + fprintf(stderr, "Caller: %s:%d\nPrevious: ", caller_name, line_number); + debug_state(parser); + lex_state_set(parser, state); + fprintf(stderr, "Now: "); + debug_state(parser); + fprintf(stderr, "\n"); +} + +#define lex_state_set(parser, state) debug_lex_state_set(parser, state, __func__, __LINE__) +#endif + +/******************************************************************************/ +/* Command-line macro helpers */ +/******************************************************************************/ + +/** True if the parser has the given command-line option. 
*/ +#define PM_PARSER_COMMAND_LINE_OPTION(parser, option) ((parser)->command_line & (option)) + +/** True if the -a command line option was given. */ +#define PM_PARSER_COMMAND_LINE_OPTION_A(parser) PM_PARSER_COMMAND_LINE_OPTION(parser, PM_OPTIONS_COMMAND_LINE_A) + +/** True if the -e command line option was given. */ +#define PM_PARSER_COMMAND_LINE_OPTION_E(parser) PM_PARSER_COMMAND_LINE_OPTION(parser, PM_OPTIONS_COMMAND_LINE_E) + +/** True if the -l command line option was given. */ +#define PM_PARSER_COMMAND_LINE_OPTION_L(parser) PM_PARSER_COMMAND_LINE_OPTION(parser, PM_OPTIONS_COMMAND_LINE_L) + +/** True if the -n command line option was given. */ +#define PM_PARSER_COMMAND_LINE_OPTION_N(parser) PM_PARSER_COMMAND_LINE_OPTION(parser, PM_OPTIONS_COMMAND_LINE_N) + +/** True if the -p command line option was given. */ +#define PM_PARSER_COMMAND_LINE_OPTION_P(parser) PM_PARSER_COMMAND_LINE_OPTION(parser, PM_OPTIONS_COMMAND_LINE_P) + +/** True if the -x command line option was given. */ +#define PM_PARSER_COMMAND_LINE_OPTION_X(parser) PM_PARSER_COMMAND_LINE_OPTION(parser, PM_OPTIONS_COMMAND_LINE_X) + +/******************************************************************************/ +/* Diagnostic-related functions */ +/******************************************************************************/ + +/** + * Append an error to the list of errors on the parser. + */ +static inline void +pm_parser_err(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) { + pm_diagnostic_list_append(&parser->error_list, start, end, diag_id); +} + +/** + * Append an error to the list of errors on the parser using a format string. + */ +#define PM_PARSER_ERR_FORMAT(parser, start, end, diag_id, ...) \ + pm_diagnostic_list_append_format(&parser->error_list, start, end, diag_id, __VA_ARGS__) + +/** + * Append an error to the list of errors on the parser using the location of the + * current token. 
+ */ +static inline void +pm_parser_err_current(pm_parser_t *parser, pm_diagnostic_id_t diag_id) { + pm_parser_err(parser, parser->current.start, parser->current.end, diag_id); +} + +/** + * Append an error to the list of errors on the parser using the given location + * using a format string. + */ +#define PM_PARSER_ERR_LOCATION_FORMAT(parser, location, diag_id, ...) \ + PM_PARSER_ERR_FORMAT(parser, (location)->start, (location)->end, diag_id, __VA_ARGS__) + +/** + * Append an error to the list of errors on the parser using the location of the + * given node. + */ +static inline void +pm_parser_err_node(pm_parser_t *parser, const pm_node_t *node, pm_diagnostic_id_t diag_id) { + pm_parser_err(parser, node->location.start, node->location.end, diag_id); +} + +/** + * Append an error to the list of errors on the parser using the location of the + * given node and a format string. + */ +#define PM_PARSER_ERR_NODE_FORMAT(parser, node, diag_id, ...) \ + PM_PARSER_ERR_FORMAT(parser, (node)->location.start, (node)->location.end, diag_id, __VA_ARGS__) + +/** + * Append an error to the list of errors on the parser using the location of the + * given node and a format string, and add on the content of the node. + */ +#define PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, diag_id) \ + PM_PARSER_ERR_NODE_FORMAT(parser, node, diag_id, (int) ((node)->location.end - (node)->location.start), (const char *) (node)->location.start) + +/** + * Append an error to the list of errors on the parser using the location of the + * previous token. + */ +static inline void +pm_parser_err_previous(pm_parser_t *parser, pm_diagnostic_id_t diag_id) { + pm_parser_err(parser, parser->previous.start, parser->previous.end, diag_id); +} + +/** + * Append an error to the list of errors on the parser using the location of the + * given token. 
+ */ +static inline void +pm_parser_err_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_id_t diag_id) { + pm_parser_err(parser, token->start, token->end, diag_id); +} + +/** + * Append an error to the list of errors on the parser using the location of the + * given token and a format string. + */ +#define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) \ + PM_PARSER_ERR_FORMAT(parser, (token).start, (token).end, diag_id, __VA_ARGS__) + +/** + * Append an error to the list of errors on the parser using the location of the + * given token and a format string, and add on the content of the token. + */ +#define PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, token, diag_id) \ + PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, (int) ((token).end - (token).start), (const char *) (token).start) + +/** + * Append a warning to the list of warnings on the parser. + */ +static inline void +pm_parser_warn(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) { + pm_diagnostic_list_append(&parser->warning_list, start, end, diag_id); +} + +/** + * Append a warning to the list of warnings on the parser using the location of + * the given token. + */ +static inline void +pm_parser_warn_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_id_t diag_id) { + pm_parser_warn(parser, token->start, token->end, diag_id); +} + +/** + * Append a warning to the list of warnings on the parser using the location of + * the given node. + */ +static inline void +pm_parser_warn_node(pm_parser_t *parser, const pm_node_t *node, pm_diagnostic_id_t diag_id) { + pm_parser_warn(parser, node->location.start, node->location.end, diag_id); +} + +/** + * Append a warning to the list of warnings on the parser using a format string. + */ +#define PM_PARSER_WARN_FORMAT(parser, start, end, diag_id, ...) 
\ + pm_diagnostic_list_append_format(&parser->warning_list, start, end, diag_id, __VA_ARGS__) + +/** + * Append a warning to the list of warnings on the parser using the location of + * the given token and a format string. + */ +#define PM_PARSER_WARN_TOKEN_FORMAT(parser, token, diag_id, ...) \ + PM_PARSER_WARN_FORMAT(parser, (token).start, (token).end, diag_id, __VA_ARGS__) + +/** + * Append a warning to the list of warnings on the parser using the location of + * the given token and a format string, and add on the content of the token. + */ +#define PM_PARSER_WARN_TOKEN_FORMAT_CONTENT(parser, token, diag_id) \ + PM_PARSER_WARN_TOKEN_FORMAT(parser, token, diag_id, (int) ((token).end - (token).start), (const char *) (token).start) + +/** + * Append a warning to the list of warnings on the parser using the location of + * the given node and a format string. + */ +#define PM_PARSER_WARN_NODE_FORMAT(parser, node, diag_id, ...) \ + PM_PARSER_WARN_FORMAT(parser, (node)->location.start, (node)->location.end, diag_id, __VA_ARGS__) + +/** + * Add an error for an expected heredoc terminator. This is a special function + * only because it grabs its location off of a lex mode instead of a node or a + * token. + */ +static void +pm_parser_err_heredoc_term(pm_parser_t *parser, const uint8_t *ident_start, size_t ident_length) { + PM_PARSER_ERR_FORMAT( + parser, + ident_start, + ident_start + ident_length, + PM_ERR_HEREDOC_TERM, + (int) ident_length, + (const char *) ident_start + ); +} + +/******************************************************************************/ +/* Scope-related functions */ +/******************************************************************************/ + +/** + * Allocate and initialize a new scope. Push it onto the scope stack. 
+ */ +static bool +pm_parser_scope_push(pm_parser_t *parser, bool closed) { + pm_scope_t *scope = (pm_scope_t *) xmalloc(sizeof(pm_scope_t)); + if (scope == NULL) return false; + + *scope = (pm_scope_t) { + .previous = parser->current_scope, + .locals = { 0 }, + .parameters = PM_SCOPE_PARAMETERS_NONE, + .implicit_parameters = { 0 }, + .shareable_constant = parser->current_scope == NULL ? PM_SCOPE_SHAREABLE_CONSTANT_NONE : parser->current_scope->shareable_constant, + .closed = closed + }; + + parser->current_scope = scope; + return true; +} + +/** + * Determine if the current scope is at the top level. This means it is either + * the top-level scope or it is open to the top-level. + */ +static bool +pm_parser_scope_toplevel_p(pm_parser_t *parser) { + pm_scope_t *scope = parser->current_scope; + + do { + if (scope->previous == NULL) return true; + if (scope->closed) return false; + } while ((scope = scope->previous) != NULL); + + assert(false && "unreachable"); + return true; +} + +/** + * Retrieve the scope at the given depth. 
+ */ +static pm_scope_t * +pm_parser_scope_find(pm_parser_t *parser, uint32_t depth) { + pm_scope_t *scope = parser->current_scope; + + while (depth-- > 0) { + assert(scope != NULL); + scope = scope->previous; + } + + return scope; +} + +typedef enum { + PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_PASS, + PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_CONFLICT, + PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_FAIL +} pm_scope_forwarding_param_check_result_t; + +static pm_scope_forwarding_param_check_result_t +pm_parser_scope_forwarding_param_check(pm_parser_t *parser, const uint8_t mask) { + pm_scope_t *scope = parser->current_scope; + bool conflict = false; + + while (scope != NULL) { + if (scope->parameters & mask) { + if (scope->closed) { + if (conflict) { + return PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_CONFLICT; + } else { + return PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_PASS; + } + } + + conflict = true; + } + + if (scope->closed) break; + scope = scope->previous; + } + + return PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_FAIL; +} + +static void +pm_parser_scope_forwarding_block_check(pm_parser_t *parser, const pm_token_t * token) { + switch (pm_parser_scope_forwarding_param_check(parser, PM_SCOPE_PARAMETERS_FORWARDING_BLOCK)) { + case PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_PASS: + // Pass. + break; + case PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_CONFLICT: + pm_parser_err_token(parser, token, PM_ERR_ARGUMENT_CONFLICT_AMPERSAND); + break; + case PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_FAIL: + pm_parser_err_token(parser, token, PM_ERR_ARGUMENT_NO_FORWARDING_AMPERSAND); + break; + } +} + +static void +pm_parser_scope_forwarding_positionals_check(pm_parser_t *parser, const pm_token_t * token) { + switch (pm_parser_scope_forwarding_param_check(parser, PM_SCOPE_PARAMETERS_FORWARDING_POSITIONALS)) { + case PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_PASS: + // Pass. 
+ break; + case PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_CONFLICT: + pm_parser_err_token(parser, token, PM_ERR_ARGUMENT_CONFLICT_STAR); + break; + case PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_FAIL: + pm_parser_err_token(parser, token, PM_ERR_ARGUMENT_NO_FORWARDING_STAR); + break; + } +} + +static void +pm_parser_scope_forwarding_all_check(pm_parser_t *parser, const pm_token_t *token) { + switch (pm_parser_scope_forwarding_param_check(parser, PM_SCOPE_PARAMETERS_FORWARDING_ALL)) { + case PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_PASS: + // Pass. + break; + case PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_CONFLICT: + // This shouldn't happen, because ... is not allowed in the + // declaration of blocks. If we get here, we assume we already have + // an error for this. + break; + case PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_FAIL: + pm_parser_err_token(parser, token, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES); + break; + } +} + +static void +pm_parser_scope_forwarding_keywords_check(pm_parser_t *parser, const pm_token_t * token) { + switch (pm_parser_scope_forwarding_param_check(parser, PM_SCOPE_PARAMETERS_FORWARDING_KEYWORDS)) { + case PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_PASS: + // Pass. + break; + case PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_CONFLICT: + pm_parser_err_token(parser, token, PM_ERR_ARGUMENT_CONFLICT_STAR_STAR); + break; + case PM_SCOPE_FORWARDING_PARAM_CHECK_RESULT_FAIL: + pm_parser_err_token(parser, token, PM_ERR_ARGUMENT_NO_FORWARDING_STAR_STAR); + break; + } +} + +/** + * Get the current state of constant shareability. + */ +static inline pm_shareable_constant_value_t +pm_parser_scope_shareable_constant_get(pm_parser_t *parser) { + return parser->current_scope->shareable_constant; +} + +/** + * Set the current state of constant shareability. We'll set it on all of the + * open scopes so that reads are quick. 
+ */ +static void +pm_parser_scope_shareable_constant_set(pm_parser_t *parser, pm_shareable_constant_value_t shareable_constant) { + pm_scope_t *scope = parser->current_scope; + + do { + scope->shareable_constant = shareable_constant; + } while (!scope->closed && (scope = scope->previous) != NULL); +} + +/******************************************************************************/ +/* Local variable-related functions */ +/******************************************************************************/ + +/** + * The point at which the set of locals switches from being a list to a hash. + */ +#define PM_LOCALS_HASH_THRESHOLD 9 + +static void +pm_locals_free(pm_locals_t *locals) { + if (locals->capacity > 0) { + xfree(locals->locals); + } +} + +/** + * Use as simple and fast a hash function as we can that still properly mixes + * the bits. + */ +static uint32_t +pm_locals_hash(pm_constant_id_t name) { + name = ((name >> 16) ^ name) * 0x45d9f3b; + name = ((name >> 16) ^ name) * 0x45d9f3b; + name = (name >> 16) ^ name; + return name; +} + +/** + * Resize the locals list to be twice its current size. If the next capacity is + * above the threshold for switching to a hash, then we'll switch to a hash. + */ +static void +pm_locals_resize(pm_locals_t *locals) { + uint32_t next_capacity = locals->capacity == 0 ? 4 : (locals->capacity * 2); + assert(next_capacity > locals->capacity); + + pm_local_t *next_locals = xcalloc(next_capacity, sizeof(pm_local_t)); + if (next_locals == NULL) abort(); + + if (next_capacity < PM_LOCALS_HASH_THRESHOLD) { + if (locals->size > 0) { + memcpy(next_locals, locals->locals, locals->size * sizeof(pm_local_t)); + } + } else { + // If we just switched from a list to a hash, then we need to fill in + // the hash values of all of the locals. 
+ bool hash_needed = (locals->capacity <= PM_LOCALS_HASH_THRESHOLD); + uint32_t mask = next_capacity - 1; + + for (uint32_t index = 0; index < locals->capacity; index++) { + pm_local_t *local = &locals->locals[index]; + + if (local->name != PM_CONSTANT_ID_UNSET) { + if (hash_needed) local->hash = pm_locals_hash(local->name); + + uint32_t hash = local->hash; + while (next_locals[hash & mask].name != PM_CONSTANT_ID_UNSET) hash++; + next_locals[hash & mask] = *local; + } + } + } + + pm_locals_free(locals); + locals->locals = next_locals; + locals->capacity = next_capacity; +} + +/** + * Add a new local to the set of locals. This will automatically resize the + * locals (rehashing them once the hash threshold is reached) when the size + * has reached capacity / 4 * 3, computed with integer arithmetic. + * + * @param locals The set of locals to add to. + * @param name The name of the local. + * @param start The source location that represents the start of the local. This + * is used for the location of the warning in case this local is not read. + * @param end The source location that represents the end of the local. This is + * used for the location of the warning in case this local is not read. + * @param reads The initial number of reads for this local. Usually this is set + * to 0, but for some locals (like parameters) we want to initialize it with + * 1 so that we never warn on unused parameters. + * @return True if the local was added, and false if the local already exists. 
+ */ +static bool +pm_locals_write(pm_locals_t *locals, pm_constant_id_t name, const uint8_t *start, const uint8_t *end, uint32_t reads) { + if (locals->size >= (locals->capacity / 4 * 3)) { + pm_locals_resize(locals); + } + + if (locals->capacity < PM_LOCALS_HASH_THRESHOLD) { + for (uint32_t index = 0; index < locals->capacity; index++) { + pm_local_t *local = &locals->locals[index]; + + if (local->name == PM_CONSTANT_ID_UNSET) { + *local = (pm_local_t) { + .name = name, + .location = { .start = start, .end = end }, + .index = locals->size++, + .reads = reads, + .hash = 0 + }; + return true; + } else if (local->name == name) { + return false; + } + } + } else { + uint32_t mask = locals->capacity - 1; + uint32_t hash = pm_locals_hash(name); + uint32_t initial_hash = hash; + + do { + pm_local_t *local = &locals->locals[hash & mask]; + + if (local->name == PM_CONSTANT_ID_UNSET) { + *local = (pm_local_t) { + .name = name, + .location = { .start = start, .end = end }, + .index = locals->size++, + .reads = reads, + .hash = initial_hash + }; + return true; + } else if (local->name == name) { + return false; + } else { + hash++; + } + } while ((hash & mask) != initial_hash); /* NOTE(review): initial_hash is the unmasked hash here, whereas pm_locals_find compares against hash & mask; termination appears to rely on the returns above always firing because the resize check keeps free slots available - confirm. */ + } + + assert(false && "unreachable"); + return true; +} + +/** + * Finds the index of a local variable in the locals set. If it is not found, + * this returns UINT32_MAX. 
+ */ +static uint32_t +pm_locals_find(pm_locals_t *locals, pm_constant_id_t name) { + if (locals->capacity < PM_LOCALS_HASH_THRESHOLD) { + for (uint32_t index = 0; index < locals->size; index++) { + pm_local_t *local = &locals->locals[index]; + if (local->name == name) return index; + } + } else { + uint32_t mask = locals->capacity - 1; + uint32_t hash = pm_locals_hash(name); + uint32_t initial_hash = hash & mask; + + do { + pm_local_t *local = &locals->locals[hash & mask]; + + if (local->name == PM_CONSTANT_ID_UNSET) { + return UINT32_MAX; + } else if (local->name == name) { + return hash & mask; + } else { + hash++; + } + } while ((hash & mask) != initial_hash); + } + + return UINT32_MAX; +} + +/** + * Called when a variable is read in a certain lexical context. Tracks the read + * by adding to the reads count. + */ +static void +pm_locals_read(pm_locals_t *locals, pm_constant_id_t name) { + uint32_t index = pm_locals_find(locals, name); + assert(index != UINT32_MAX); + + pm_local_t *local = &locals->locals[index]; + assert(local->reads < UINT32_MAX); + + local->reads++; +} + +/** + * Called when a variable read is transformed into a variable write, because a + * write operator is found after the variable name. + */ +static void +pm_locals_unread(pm_locals_t *locals, pm_constant_id_t name) { + uint32_t index = pm_locals_find(locals, name); + assert(index != UINT32_MAX); + + pm_local_t *local = &locals->locals[index]; + assert(local->reads > 0); + + local->reads--; +} + +/** + * Returns the current number of reads for a local variable. + */ +static uint32_t +pm_locals_reads(pm_locals_t *locals, pm_constant_id_t name) { + uint32_t index = pm_locals_find(locals, name); + assert(index != UINT32_MAX); + + return locals->locals[index].reads; +} + +/** + * Write out the locals into the given list of constant ids in the correct + * order. This is used to set the list of locals on the nodes in the tree once + * we're sure no additional locals will be added to the set. 
+ * + * This function is also responsible for warning when a local variable has been + * written but not read in certain contexts. + */ +static void +pm_locals_order(PRISM_ATTRIBUTE_UNUSED pm_parser_t *parser, pm_locals_t *locals, pm_constant_id_list_t *list, bool toplevel) { + pm_constant_id_list_init_capacity(list, locals->size); + + // If we're still below the threshold for switching to a hash, then we only + // need to loop over the locals until we hit the size because the locals are + // stored in a list. + uint32_t capacity = locals->capacity < PM_LOCALS_HASH_THRESHOLD ? locals->size : locals->capacity; + + // We will only warn for unused variables if we're not at the top level, or + // if we're parsing a file outside of eval or -e. + bool warn_unused = !toplevel || (!parser->parsing_eval && !PM_PARSER_COMMAND_LINE_OPTION_E(parser)); + + for (uint32_t index = 0; index < capacity; index++) { + pm_local_t *local = &locals->locals[index]; + + if (local->name != PM_CONSTANT_ID_UNSET) { + pm_constant_id_list_insert(list, (size_t) local->index, local->name); + + if (warn_unused && local->reads == 0 && ((parser->start_line >= 0) || (pm_newline_list_line(&parser->newline_list, local->location.start, parser->start_line) >= 0))) { + pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, local->name); + + if (constant->length >= 1 && *constant->start != '_') { + PM_PARSER_WARN_FORMAT( + parser, + local->location.start, + local->location.end, + PM_WARN_UNUSED_LOCAL_VARIABLE, + (int) constant->length, + (const char *) constant->start + ); + } + } + } + } +} + +/******************************************************************************/ +/* Node-related functions */ +/******************************************************************************/ + +/** + * Retrieve the constant pool id for the given location. 
+ */ +static inline pm_constant_id_t +pm_parser_constant_id_location(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { + return pm_constant_pool_insert_shared(&parser->constant_pool, start, (size_t) (end - start)); +} + +/** + * Retrieve the constant pool id for the given string. + */ +static inline pm_constant_id_t +pm_parser_constant_id_owned(pm_parser_t *parser, uint8_t *start, size_t length) { + return pm_constant_pool_insert_owned(&parser->constant_pool, start, length); +} + +/** + * Retrieve the constant pool id for the given static literal C string. + */ +static inline pm_constant_id_t +pm_parser_constant_id_constant(pm_parser_t *parser, const char *start, size_t length) { + return pm_constant_pool_insert_constant(&parser->constant_pool, (const uint8_t *) start, length); +} + +/** + * Retrieve the constant pool id for the given token. + */ +static inline pm_constant_id_t +pm_parser_constant_id_token(pm_parser_t *parser, const pm_token_t *token) { + return pm_parser_constant_id_location(parser, token->start, token->end); +} + +/** + * Retrieve the constant pool id for the given token. If the token is not + * provided, then return 0. + */ +static inline pm_constant_id_t +pm_parser_optional_constant_id_token(pm_parser_t *parser, const pm_token_t *token) { + return token->type == PM_TOKEN_NOT_PROVIDED ? 0 : pm_parser_constant_id_token(parser, token); +} + +/** + * Check whether or not the given node is a value expression. + * If the node is a value expression, this returns NULL. + * If not, this returns a pointer to the node to be reported as a "void expression". + */ +static pm_node_t * +pm_check_value_expression(pm_parser_t *parser, pm_node_t *node) { + pm_node_t *void_node = NULL; + + while (node != NULL) { + switch (PM_NODE_TYPE(node)) { + case PM_RETURN_NODE: + case PM_BREAK_NODE: + case PM_NEXT_NODE: + case PM_REDO_NODE: + case PM_RETRY_NODE: + case PM_MATCH_REQUIRED_NODE: + return void_node != NULL ? 
void_node : node; + case PM_MATCH_PREDICATE_NODE: + return NULL; + case PM_BEGIN_NODE: { + pm_begin_node_t *cast = (pm_begin_node_t *) node; + + if (cast->ensure_clause != NULL) { + if (cast->rescue_clause != NULL) { + pm_node_t *vn = pm_check_value_expression(parser, UP(cast->rescue_clause)); + if (vn != NULL) return vn; + } + + if (cast->statements != NULL) { + pm_node_t *vn = pm_check_value_expression(parser, UP(cast->statements)); + if (vn != NULL) return vn; + } + + node = UP(cast->ensure_clause); + } else if (cast->rescue_clause != NULL) { + if (cast->statements == NULL) return NULL; + + pm_node_t *vn = pm_check_value_expression(parser, UP(cast->statements)); + if (vn == NULL) return NULL; + if (void_node == NULL) void_node = vn; + + for (pm_rescue_node_t *rescue_clause = cast->rescue_clause; rescue_clause != NULL; rescue_clause = rescue_clause->subsequent) { + pm_node_t *vn = pm_check_value_expression(parser, UP(rescue_clause->statements)); + if (vn == NULL) { + void_node = NULL; + break; + } + if (void_node == NULL) { + void_node = vn; + } + } + + if (cast->else_clause != NULL) { + node = UP(cast->else_clause); + } else { + return void_node; + } + } else { + node = UP(cast->statements); + } + + break; + } + case PM_ENSURE_NODE: { + pm_ensure_node_t *cast = (pm_ensure_node_t *) node; + node = UP(cast->statements); + break; + } + case PM_PARENTHESES_NODE: { + pm_parentheses_node_t *cast = (pm_parentheses_node_t *) node; + node = UP(cast->body); + break; + } + case PM_STATEMENTS_NODE: { + pm_statements_node_t *cast = (pm_statements_node_t *) node; + node = cast->body.nodes[cast->body.size - 1]; + break; + } + case PM_IF_NODE: { + pm_if_node_t *cast = (pm_if_node_t *) node; + if (cast->statements == NULL || cast->subsequent == NULL) { + return NULL; + } + pm_node_t *vn = pm_check_value_expression(parser, UP(cast->statements)); + if (vn == NULL) { + return NULL; + } + if (void_node == NULL) { + void_node = vn; + } + node = cast->subsequent; + break; + } + case 
PM_UNLESS_NODE: { + pm_unless_node_t *cast = (pm_unless_node_t *) node; + if (cast->statements == NULL || cast->else_clause == NULL) { + return NULL; + } + pm_node_t *vn = pm_check_value_expression(parser, UP(cast->statements)); + if (vn == NULL) { + return NULL; + } + if (void_node == NULL) { + void_node = vn; + } + node = UP(cast->else_clause); + break; + } + case PM_ELSE_NODE: { + pm_else_node_t *cast = (pm_else_node_t *) node; + node = UP(cast->statements); + break; + } + case PM_AND_NODE: { + pm_and_node_t *cast = (pm_and_node_t *) node; + node = cast->left; + break; + } + case PM_OR_NODE: { + pm_or_node_t *cast = (pm_or_node_t *) node; + node = cast->left; + break; + } + case PM_LOCAL_VARIABLE_WRITE_NODE: { + pm_local_variable_write_node_t *cast = (pm_local_variable_write_node_t *) node; + + pm_scope_t *scope = parser->current_scope; + for (uint32_t depth = 0; depth < cast->depth; depth++) scope = scope->previous; + + pm_locals_read(&scope->locals, cast->name); + return NULL; + } + default: + return NULL; + } + } + + return NULL; +} + +static inline void +pm_assert_value_expression(pm_parser_t *parser, pm_node_t *node) { + pm_node_t *void_node = pm_check_value_expression(parser, node); + if (void_node != NULL) { + pm_parser_err_node(parser, void_node, PM_ERR_VOID_EXPRESSION); + } +} + +/** + * Warn if the given node is a "void" statement. 
+ */ +static void +pm_void_statement_check(pm_parser_t *parser, const pm_node_t *node) { + const char *type = NULL; + int length = 0; + + switch (PM_NODE_TYPE(node)) { + case PM_BACK_REFERENCE_READ_NODE: + case PM_CLASS_VARIABLE_READ_NODE: + case PM_GLOBAL_VARIABLE_READ_NODE: + case PM_INSTANCE_VARIABLE_READ_NODE: + case PM_LOCAL_VARIABLE_READ_NODE: + case PM_NUMBERED_REFERENCE_READ_NODE: + type = "a variable"; + length = 10; + break; + case PM_CALL_NODE: { + const pm_call_node_t *cast = (const pm_call_node_t *) node; + if (cast->call_operator_loc.start != NULL || cast->message_loc.start == NULL) break; + + const pm_constant_t *message = pm_constant_pool_id_to_constant(&parser->constant_pool, cast->name); + switch (message->length) { + case 1: + switch (message->start[0]) { + case '+': + case '-': + case '*': + case '/': + case '%': + case '|': + case '^': + case '&': + case '>': + case '<': + type = (const char *) message->start; + length = 1; + break; + } + break; + case 2: + switch (message->start[1]) { + case '=': + if (message->start[0] == '<' || message->start[0] == '>' || message->start[0] == '!' 
|| message->start[0] == '=') { + type = (const char *) message->start; + length = 2; + } + break; + case '@': + if (message->start[0] == '+' || message->start[0] == '-') { + type = (const char *) message->start; + length = 2; + } + break; + case '*': + if (message->start[0] == '*') { + type = (const char *) message->start; + length = 2; + } + break; + } + break; + case 3: + if (memcmp(message->start, "<=>", 3) == 0) { + type = "<=>"; + length = 3; + } + break; + } + + break; + } + case PM_CONSTANT_PATH_NODE: + type = "::"; + length = 2; + break; + case PM_CONSTANT_READ_NODE: + type = "a constant"; + length = 10; + break; + case PM_DEFINED_NODE: + type = "defined?"; + length = 8; + break; + case PM_FALSE_NODE: + type = "false"; + length = 5; + break; + case PM_FLOAT_NODE: + case PM_IMAGINARY_NODE: + case PM_INTEGER_NODE: + case PM_INTERPOLATED_REGULAR_EXPRESSION_NODE: + case PM_INTERPOLATED_STRING_NODE: + case PM_RATIONAL_NODE: + case PM_REGULAR_EXPRESSION_NODE: + case PM_SOURCE_ENCODING_NODE: + case PM_SOURCE_FILE_NODE: + case PM_SOURCE_LINE_NODE: + case PM_STRING_NODE: + case PM_SYMBOL_NODE: + type = "a literal"; + length = 9; + break; + case PM_NIL_NODE: + type = "nil"; + length = 3; + break; + case PM_RANGE_NODE: { + const pm_range_node_t *cast = (const pm_range_node_t *) node; + + if (PM_NODE_FLAG_P(cast, PM_RANGE_FLAGS_EXCLUDE_END)) { + type = "..."; + length = 3; + } else { + type = ".."; + length = 2; + } + + break; + } + case PM_SELF_NODE: + type = "self"; + length = 4; + break; + case PM_TRUE_NODE: + type = "true"; + length = 4; + break; + default: + break; + } + + if (type != NULL) { + PM_PARSER_WARN_NODE_FORMAT(parser, node, PM_WARN_VOID_STATEMENT, length, type); + } +} + +/** + * Warn if any of the statements that are not the last statement in the list are + * a "void" statement. 
+ */ +static void +pm_void_statements_check(pm_parser_t *parser, const pm_statements_node_t *node, bool last_value) { + assert(node->body.size > 0); + const size_t size = node->body.size - (last_value ? 1 : 0); + for (size_t index = 0; index < size; index++) { + pm_void_statement_check(parser, node->body.nodes[index]); + } +} + +/** + * When we're handling the predicate of a conditional, we need to know our + * context in order to determine the kind of warning we should deliver to the + * user. + */ +typedef enum { + PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL, + PM_CONDITIONAL_PREDICATE_TYPE_FLIP_FLOP, + PM_CONDITIONAL_PREDICATE_TYPE_NOT +} pm_conditional_predicate_type_t; + +/** + * Add a warning to the parser if the predicate of a conditional is a literal. + */ +static void +pm_parser_warn_conditional_predicate_literal(pm_parser_t *parser, pm_node_t *node, pm_conditional_predicate_type_t type, pm_diagnostic_id_t diag_id, const char *prefix) { + switch (type) { + case PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL: + PM_PARSER_WARN_NODE_FORMAT(parser, node, diag_id, prefix, "condition"); + break; + case PM_CONDITIONAL_PREDICATE_TYPE_FLIP_FLOP: + PM_PARSER_WARN_NODE_FORMAT(parser, node, diag_id, prefix, "flip-flop"); + break; + case PM_CONDITIONAL_PREDICATE_TYPE_NOT: + break; + } +} + +/** + * Return true if the value being written within the predicate of a conditional + * is a literal value. 
+ */ +static bool +pm_conditional_predicate_warn_write_literal_p(const pm_node_t *node) { + switch (PM_NODE_TYPE(node)) { + case PM_ARRAY_NODE: { + if (PM_NODE_FLAG_P(node, PM_NODE_FLAG_STATIC_LITERAL)) return true; + + const pm_array_node_t *cast = (const pm_array_node_t *) node; + for (size_t index = 0; index < cast->elements.size; index++) { + if (!pm_conditional_predicate_warn_write_literal_p(cast->elements.nodes[index])) return false; + } + + return true; + } + case PM_HASH_NODE: { + if (PM_NODE_FLAG_P(node, PM_NODE_FLAG_STATIC_LITERAL)) return true; + + const pm_hash_node_t *cast = (const pm_hash_node_t *) node; + for (size_t index = 0; index < cast->elements.size; index++) { + const pm_node_t *element = cast->elements.nodes[index]; + if (!PM_NODE_TYPE_P(element, PM_ASSOC_NODE)) return false; + + const pm_assoc_node_t *assoc = (const pm_assoc_node_t *) element; + if (!pm_conditional_predicate_warn_write_literal_p(assoc->key) || !pm_conditional_predicate_warn_write_literal_p(assoc->value)) return false; + } + + return true; + } + case PM_FALSE_NODE: + case PM_FLOAT_NODE: + case PM_IMAGINARY_NODE: + case PM_INTEGER_NODE: + case PM_NIL_NODE: + case PM_RATIONAL_NODE: + case PM_REGULAR_EXPRESSION_NODE: + case PM_SOURCE_ENCODING_NODE: + case PM_SOURCE_FILE_NODE: + case PM_SOURCE_LINE_NODE: + case PM_STRING_NODE: + case PM_SYMBOL_NODE: + case PM_TRUE_NODE: + return true; + default: + return false; + } +} + +/** + * Add a warning to the parser if the value that is being written inside of a + * predicate to a conditional is a literal. + */ +static inline void +pm_conditional_predicate_warn_write_literal(pm_parser_t *parser, const pm_node_t *node) { + if (pm_conditional_predicate_warn_write_literal_p(node)) { + pm_parser_warn_node(parser, node, parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? 
PM_WARN_EQUAL_IN_CONDITIONAL_3_3 : PM_WARN_EQUAL_IN_CONDITIONAL); + } +} + +/** + * The predicate of conditional nodes can change what would otherwise be regular + * nodes into specialized nodes. For example: + * + * if foo .. bar => RangeNode becomes FlipFlopNode + * if foo and bar .. baz => RangeNode becomes FlipFlopNode + * if /foo/ => RegularExpressionNode becomes MatchLastLineNode + * if /foo #{bar}/ => InterpolatedRegularExpressionNode becomes InterpolatedMatchLastLineNode + * + * We also want to warn the user if they're using a static literal as a + * predicate or writing a static literal as the predicate. + */ +static void +pm_conditional_predicate(pm_parser_t *parser, pm_node_t *node, pm_conditional_predicate_type_t type) { + switch (PM_NODE_TYPE(node)) { + case PM_AND_NODE: { + pm_and_node_t *cast = (pm_and_node_t *) node; + pm_conditional_predicate(parser, cast->left, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); + pm_conditional_predicate(parser, cast->right, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); + break; + } + case PM_OR_NODE: { + pm_or_node_t *cast = (pm_or_node_t *) node; + pm_conditional_predicate(parser, cast->left, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); + pm_conditional_predicate(parser, cast->right, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); + break; + } + case PM_PARENTHESES_NODE: { + pm_parentheses_node_t *cast = (pm_parentheses_node_t *) node; + + if ((cast->body != NULL) && PM_NODE_TYPE_P(cast->body, PM_STATEMENTS_NODE)) { + pm_statements_node_t *statements = (pm_statements_node_t *) cast->body; + if (statements->body.size == 1) pm_conditional_predicate(parser, statements->body.nodes[0], type); + } + + break; + } + case PM_BEGIN_NODE: { + pm_begin_node_t *cast = (pm_begin_node_t *) node; + if (cast->statements != NULL) { + pm_statements_node_t *statements = cast->statements; + if (statements->body.size == 1) pm_conditional_predicate(parser, statements->body.nodes[0], type); + } + break; + } + case PM_RANGE_NODE: { + pm_range_node_t 
*cast = (pm_range_node_t *) node; + + if (cast->left != NULL) pm_conditional_predicate(parser, cast->left, PM_CONDITIONAL_PREDICATE_TYPE_FLIP_FLOP); + if (cast->right != NULL) pm_conditional_predicate(parser, cast->right, PM_CONDITIONAL_PREDICATE_TYPE_FLIP_FLOP); + + // Here we change the range node into a flip flop node. We can do + // this since the nodes are exactly the same except for the type. + // We're only asserting against the size when we should probably + // assert against the entire layout, but we'll assume tests will + // catch this. + assert(sizeof(pm_range_node_t) == sizeof(pm_flip_flop_node_t)); + node->type = PM_FLIP_FLOP_NODE; + + break; + } + case PM_REGULAR_EXPRESSION_NODE: + // Here we change the regular expression node into a match last line + // node. We can do this since the nodes are exactly the same except + // for the type. + assert(sizeof(pm_regular_expression_node_t) == sizeof(pm_match_last_line_node_t)); + node->type = PM_MATCH_LAST_LINE_NODE; + + if (!PM_PARSER_COMMAND_LINE_OPTION_E(parser)) { + pm_parser_warn_conditional_predicate_literal(parser, node, type, PM_WARN_LITERAL_IN_CONDITION_DEFAULT, "regex "); + } + + break; + case PM_INTERPOLATED_REGULAR_EXPRESSION_NODE: + // Here we change the interpolated regular expression node into an + // interpolated match last line node. We can do this since the nodes + // are exactly the same except for the type. 
+ assert(sizeof(pm_interpolated_regular_expression_node_t) == sizeof(pm_interpolated_match_last_line_node_t)); + node->type = PM_INTERPOLATED_MATCH_LAST_LINE_NODE; + + if (!PM_PARSER_COMMAND_LINE_OPTION_E(parser)) { + pm_parser_warn_conditional_predicate_literal(parser, node, type, PM_WARN_LITERAL_IN_CONDITION_VERBOSE, "regex "); + } + + break; + case PM_INTEGER_NODE: + if (type == PM_CONDITIONAL_PREDICATE_TYPE_FLIP_FLOP) { + if (!PM_PARSER_COMMAND_LINE_OPTION_E(parser)) { + pm_parser_warn_node(parser, node, PM_WARN_INTEGER_IN_FLIP_FLOP); + } + } else { + pm_parser_warn_conditional_predicate_literal(parser, node, type, PM_WARN_LITERAL_IN_CONDITION_VERBOSE, ""); + } + break; + case PM_STRING_NODE: + case PM_SOURCE_FILE_NODE: + case PM_INTERPOLATED_STRING_NODE: + pm_parser_warn_conditional_predicate_literal(parser, node, type, PM_WARN_LITERAL_IN_CONDITION_DEFAULT, "string "); + break; + case PM_SYMBOL_NODE: + case PM_INTERPOLATED_SYMBOL_NODE: + pm_parser_warn_conditional_predicate_literal(parser, node, type, PM_WARN_LITERAL_IN_CONDITION_VERBOSE, "symbol "); + break; + case PM_SOURCE_LINE_NODE: + case PM_SOURCE_ENCODING_NODE: + case PM_FLOAT_NODE: + case PM_RATIONAL_NODE: + case PM_IMAGINARY_NODE: + pm_parser_warn_conditional_predicate_literal(parser, node, type, PM_WARN_LITERAL_IN_CONDITION_VERBOSE, ""); + break; + case PM_CLASS_VARIABLE_WRITE_NODE: + pm_conditional_predicate_warn_write_literal(parser, ((pm_class_variable_write_node_t *) node)->value); + break; + case PM_CONSTANT_WRITE_NODE: + pm_conditional_predicate_warn_write_literal(parser, ((pm_constant_write_node_t *) node)->value); + break; + case PM_GLOBAL_VARIABLE_WRITE_NODE: + pm_conditional_predicate_warn_write_literal(parser, ((pm_global_variable_write_node_t *) node)->value); + break; + case PM_INSTANCE_VARIABLE_WRITE_NODE: + pm_conditional_predicate_warn_write_literal(parser, ((pm_instance_variable_write_node_t *) node)->value); + break; + case PM_LOCAL_VARIABLE_WRITE_NODE: + 
pm_conditional_predicate_warn_write_literal(parser, ((pm_local_variable_write_node_t *) node)->value); + break; + case PM_MULTI_WRITE_NODE: + pm_conditional_predicate_warn_write_literal(parser, ((pm_multi_write_node_t *) node)->value); + break; + default: + break; + } +} + +/** + * In a lot of places in the tree you can have tokens that are not provided but + * that do not cause an error. For example, this happens in a method call + * without parentheses. In these cases we set the token to the "not provided" type. + * For example: + * + * pm_token_t token = not_provided(parser); + */ +static inline pm_token_t +not_provided(pm_parser_t *parser) { + return (pm_token_t) { .type = PM_TOKEN_NOT_PROVIDED, .start = parser->start, .end = parser->start }; +} + +/** + * This is a special out parameter to the parse_arguments_list function that + * includes opening and closing parentheses in addition to the arguments since + * it's so common. It is handy to use when passing argument information to one + * of the call node creation functions. + */ +typedef struct { + /** The optional location of the opening parenthesis or bracket. */ + pm_location_t opening_loc; + + /** The lazily-allocated optional arguments node. */ + pm_arguments_node_t *arguments; + + /** The optional location of the closing parenthesis or bracket. */ + pm_location_t closing_loc; + + /** The optional block attached to the call. */ + pm_node_t *block; + + /** The flag indicating whether this arguments list has forwarding argument. */ + bool has_forwarding; +} pm_arguments_t; + +/** + * Retrieve the end location of a `pm_arguments_t` object. 
+ */ +static inline const uint8_t * +pm_arguments_end(pm_arguments_t *arguments) { + if (arguments->block != NULL) { + const uint8_t *end = arguments->block->location.end; + if (arguments->closing_loc.start != NULL && arguments->closing_loc.end > end) { + end = arguments->closing_loc.end; + } + return end; + } + if (arguments->closing_loc.start != NULL) { + return arguments->closing_loc.end; + } + if (arguments->arguments != NULL) { + return arguments->arguments->base.location.end; + } + return arguments->closing_loc.end; +} + +/** + * Check that we're not about to attempt to attach a brace block to a call that + * has arguments without parentheses. + */ +static void +pm_arguments_validate_block(pm_parser_t *parser, pm_arguments_t *arguments, pm_block_node_t *block) { + // First, check that we have arguments and that we don't have a closing + // location for them. + if (arguments->arguments == NULL || arguments->closing_loc.start != NULL) { + return; + } + + // Next, check that we don't have a single parentheses argument. This would + // look like: + // + // foo (1) {} + // + // In this case, it's actually okay for the block to be attached to the + // call, even though it looks like it's attached to the argument. + if (arguments->arguments->arguments.size == 1 && PM_NODE_TYPE_P(arguments->arguments->arguments.nodes[0], PM_PARENTHESES_NODE)) { + return; + } + + // If we didn't hit a case before this check, then at this point we need to + // add a syntax error. + pm_parser_err_node(parser, UP(block), PM_ERR_ARGUMENT_UNEXPECTED_BLOCK); +} + +/******************************************************************************/ +/* Basic character checks */ +/******************************************************************************/ + +/** + * This function is used extremely frequently to lex all of the identifiers in a + * source file, so it's important that it be as fast as possible. 
For this + * reason we have the encoding_changed boolean to check if we need to go through + * the function pointer or can just directly use the UTF-8 functions. + */ +static inline size_t +char_is_identifier_start(const pm_parser_t *parser, const uint8_t *b, ptrdiff_t n) { + if (n <= 0) return 0; + + if (parser->encoding_changed) { + size_t width; + + if ((width = parser->encoding->alpha_char(b, n)) != 0) { + return width; + } else if (*b == '_') { + return 1; + } else if (*b >= 0x80) { + return parser->encoding->char_width(b, n); + } else { + return 0; + } + } else if (*b < 0x80) { + return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_'); + } else { + return pm_encoding_utf_8_char_width(b, n); + } +} + +/** + * Similar to char_is_identifier but this function assumes that the encoding + * has not been changed. + */ +static inline size_t +char_is_identifier_utf8(const uint8_t *b, ptrdiff_t n) { + if (n <= 0) { + return 0; + } else if (*b < 0x80) { + return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0); + } else { + return pm_encoding_utf_8_char_width(b, n); + } +} + +/** + * Like the above, this function is also used extremely frequently to lex all of + * the identifiers in a source file once the first character has been found. So + * it's important that it be as fast as possible. + */ +static inline size_t +char_is_identifier(const pm_parser_t *parser, const uint8_t *b, ptrdiff_t n) { + if (n <= 0) { + return 0; + } else if (parser->encoding_changed) { + size_t width; + + if ((width = parser->encoding->alnum_char(b, n)) != 0) { + return width; + } else if (*b == '_') { + return 1; + } else if (*b >= 0x80) { + return parser->encoding->char_width(b, n); + } else { + return 0; + } + } else { + return char_is_identifier_utf8(b, n); + } +} + +// Here we're defining a perfect hash for the characters that are allowed in +// global names. 
This is used to quickly check the next character after a $ to +// see if it's a valid character for a global name. +#define BIT(c, idx) (((c) / 32 - 1 == idx) ? (1U << ((c) % 32)) : 0) +#define PUNCT(idx) ( \ + BIT('~', idx) | BIT('*', idx) | BIT('$', idx) | BIT('?', idx) | \ + BIT('!', idx) | BIT('@', idx) | BIT('/', idx) | BIT('\\', idx) | \ + BIT(';', idx) | BIT(',', idx) | BIT('.', idx) | BIT('=', idx) | \ + BIT(':', idx) | BIT('<', idx) | BIT('>', idx) | BIT('\"', idx) | \ + BIT('&', idx) | BIT('`', idx) | BIT('\'', idx) | BIT('+', idx) | \ + BIT('0', idx)) + +const unsigned int pm_global_name_punctuation_hash[(0x7e - 0x20 + 31) / 32] = { PUNCT(0), PUNCT(1), PUNCT(2) }; + +#undef BIT +#undef PUNCT + +static inline bool +char_is_global_name_punctuation(const uint8_t b) { + const unsigned int i = (const unsigned int) b; + if (i <= 0x20 || 0x7e < i) return false; + + return (pm_global_name_punctuation_hash[(i - 0x20) / 32] >> (i % 32)) & 1; +} + +static inline bool +token_is_setter_name(pm_token_t *token) { + return ( + (token->type == PM_TOKEN_BRACKET_LEFT_RIGHT_EQUAL) || + ((token->type == PM_TOKEN_IDENTIFIER) && + (token->end - token->start >= 2) && + (token->end[-1] == '=')) + ); +} + +/** + * Returns true if the given local variable is a keyword. 
+ */ +static bool +pm_local_is_keyword(const char *source, size_t length) { +#define KEYWORD(name) if (memcmp(source, name, length) == 0) return true + + switch (length) { + case 2: + switch (source[0]) { + case 'd': KEYWORD("do"); return false; + case 'i': KEYWORD("if"); KEYWORD("in"); return false; + case 'o': KEYWORD("or"); return false; + default: return false; + } + case 3: + switch (source[0]) { + case 'a': KEYWORD("and"); return false; + case 'd': KEYWORD("def"); return false; + case 'e': KEYWORD("end"); return false; + case 'f': KEYWORD("for"); return false; + case 'n': KEYWORD("nil"); KEYWORD("not"); return false; + default: return false; + } + case 4: + switch (source[0]) { + case 'c': KEYWORD("case"); return false; + case 'e': KEYWORD("else"); return false; + case 'n': KEYWORD("next"); return false; + case 'r': KEYWORD("redo"); return false; + case 's': KEYWORD("self"); return false; + case 't': KEYWORD("then"); KEYWORD("true"); return false; + case 'w': KEYWORD("when"); return false; + default: return false; + } + case 5: + switch (source[0]) { + case 'a': KEYWORD("alias"); return false; + case 'b': KEYWORD("begin"); KEYWORD("break"); return false; + case 'c': KEYWORD("class"); return false; + case 'e': KEYWORD("elsif"); return false; + case 'f': KEYWORD("false"); return false; + case 'r': KEYWORD("retry"); return false; + case 's': KEYWORD("super"); return false; + case 'u': KEYWORD("undef"); KEYWORD("until"); return false; + case 'w': KEYWORD("while"); return false; + case 'y': KEYWORD("yield"); return false; + default: return false; + } + case 6: + switch (source[0]) { + case 'e': KEYWORD("ensure"); return false; + case 'm': KEYWORD("module"); return false; + case 'r': KEYWORD("rescue"); KEYWORD("return"); return false; + case 'u': KEYWORD("unless"); return false; + default: return false; + } + case 8: + KEYWORD("__LINE__"); + KEYWORD("__FILE__"); + return false; + case 12: + KEYWORD("__ENCODING__"); + return false; + default: + return false; + } + 
+#undef KEYWORD +} + +/******************************************************************************/ +/* Node flag handling functions */ +/******************************************************************************/ + +/** + * Set the given flag on the given node. + */ +static inline void +pm_node_flag_set(pm_node_t *node, pm_node_flags_t flag) { + node->flags |= flag; +} + +/** + * Remove the given flag from the given node. + */ +static inline void +pm_node_flag_unset(pm_node_t *node, pm_node_flags_t flag) { + node->flags &= (pm_node_flags_t) ~flag; +} + +/** + * Set the repeated parameter flag on the given node. + */ +static inline void +pm_node_flag_set_repeated_parameter(pm_node_t *node) { + assert(PM_NODE_TYPE(node) == PM_BLOCK_LOCAL_VARIABLE_NODE || + PM_NODE_TYPE(node) == PM_BLOCK_PARAMETER_NODE || + PM_NODE_TYPE(node) == PM_KEYWORD_REST_PARAMETER_NODE || + PM_NODE_TYPE(node) == PM_OPTIONAL_KEYWORD_PARAMETER_NODE || + PM_NODE_TYPE(node) == PM_OPTIONAL_PARAMETER_NODE || + PM_NODE_TYPE(node) == PM_REQUIRED_KEYWORD_PARAMETER_NODE || + PM_NODE_TYPE(node) == PM_REQUIRED_PARAMETER_NODE || + PM_NODE_TYPE(node) == PM_REST_PARAMETER_NODE); + + pm_node_flag_set(node, PM_PARAMETER_FLAGS_REPEATED_PARAMETER); +} + +/******************************************************************************/ +/* Node creation functions */ +/******************************************************************************/ + +/** + * When you have an encoding flag on a regular expression, it takes precedence + * over all of the previously set encoding flags. So we need to mask off any + * previously set encoding flags before setting the new one. + */ +#define PM_REGULAR_EXPRESSION_ENCODING_MASK ~(PM_REGULAR_EXPRESSION_FLAGS_EUC_JP | PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT | PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J | PM_REGULAR_EXPRESSION_FLAGS_UTF_8) + +/** + * Parse out the options for a regular expression. 
+ */ +static inline pm_node_flags_t +pm_regular_expression_flags_create(pm_parser_t *parser, const pm_token_t *closing) { + pm_node_flags_t flags = 0; + + if (closing->type == PM_TOKEN_REGEXP_END) { + pm_buffer_t unknown_flags = { 0 }; + + for (const uint8_t *flag = closing->start + 1; flag < closing->end; flag++) { + switch (*flag) { + case 'i': flags |= PM_REGULAR_EXPRESSION_FLAGS_IGNORE_CASE; break; + case 'm': flags |= PM_REGULAR_EXPRESSION_FLAGS_MULTI_LINE; break; + case 'x': flags |= PM_REGULAR_EXPRESSION_FLAGS_EXTENDED; break; + case 'o': flags |= PM_REGULAR_EXPRESSION_FLAGS_ONCE; break; + + case 'e': flags = (pm_node_flags_t) (((pm_node_flags_t) (flags & PM_REGULAR_EXPRESSION_ENCODING_MASK)) | PM_REGULAR_EXPRESSION_FLAGS_EUC_JP); break; + case 'n': flags = (pm_node_flags_t) (((pm_node_flags_t) (flags & PM_REGULAR_EXPRESSION_ENCODING_MASK)) | PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT); break; + case 's': flags = (pm_node_flags_t) (((pm_node_flags_t) (flags & PM_REGULAR_EXPRESSION_ENCODING_MASK)) | PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J); break; + case 'u': flags = (pm_node_flags_t) (((pm_node_flags_t) (flags & PM_REGULAR_EXPRESSION_ENCODING_MASK)) | PM_REGULAR_EXPRESSION_FLAGS_UTF_8); break; + + default: pm_buffer_append_byte(&unknown_flags, *flag); + } + } + + size_t unknown_flags_length = pm_buffer_length(&unknown_flags); + if (unknown_flags_length != 0) { + const char *word = unknown_flags_length >= 2 ? 
"options" : "option"; + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_REGEXP_UNKNOWN_OPTIONS, word, unknown_flags_length, pm_buffer_value(&unknown_flags)); + } + pm_buffer_free(&unknown_flags); + } + + return flags; +} + +#undef PM_REGULAR_EXPRESSION_ENCODING_MASK + +static pm_statements_node_t * +pm_statements_node_create(pm_parser_t *parser); + +static void +pm_statements_node_body_append(pm_parser_t *parser, pm_statements_node_t *node, pm_node_t *statement, bool newline); + +static size_t +pm_statements_node_body_length(pm_statements_node_t *node); + +/** + * This function is here to allow us a place to extend in the future when we + * implement our own arena allocation. + */ +static inline void * +pm_node_alloc(PRISM_ATTRIBUTE_UNUSED pm_parser_t *parser, size_t size) { + void *memory = xcalloc(1, size); + if (memory == NULL) { + fprintf(stderr, "Failed to allocate %d bytes\n", (int) size); + abort(); + } + return memory; +} + +#define PM_NODE_ALLOC(parser_, type_) (type_ *) pm_node_alloc(parser_, sizeof(type_)) +#define PM_NODE_INIT(parser_, type_, flags_, start_, end_) (pm_node_t) { \ + .type = (type_), \ + .flags = (flags_), \ + .node_id = ++(parser_)->node_id, \ + .location = { .start = (start_), .end = (end_) } \ +} + +#define PM_NODE_INIT_UNSET(parser_, type_, flags_) PM_NODE_INIT(parser_, type_, flags_, NULL, NULL) +#define PM_NODE_INIT_BASE(parser_, type_, flags_) PM_NODE_INIT(parser_, type_, flags_, (parser_)->start, (parser_)->start) +#define PM_NODE_INIT_TOKEN(parser_, type_, flags_, token_) PM_NODE_INIT(parser_, type_, flags_, PM_TOKEN_START(token_), PM_TOKEN_END(token_)) +#define PM_NODE_INIT_NODE(parser_, type_, flags_, node_) PM_NODE_INIT(parser_, type_, flags_, PM_NODE_START(node_), PM_NODE_END(node_)) + +#define PM_NODE_INIT_TOKENS(parser_, type_, flags_, left_, right_) PM_NODE_INIT(parser_, type_, flags_, PM_TOKEN_START(left_), PM_TOKEN_END(right_)) +#define PM_NODE_INIT_NODES(parser_, type_, flags_, left_, right_) 
PM_NODE_INIT(parser_, type_, flags_, PM_NODE_START(left_), PM_NODE_END(right_)) +#define PM_NODE_INIT_TOKEN_NODE(parser_, type_, flags_, token_, node_) PM_NODE_INIT(parser_, type_, flags_, PM_TOKEN_START(token_), PM_NODE_END(node_)) +#define PM_NODE_INIT_NODE_TOKEN(parser_, type_, flags_, node_, token_) PM_NODE_INIT(parser_, type_, flags_, PM_NODE_START(node_), PM_TOKEN_END(token_)) + +/** + * Allocate a new MissingNode node. + */ +static pm_missing_node_t * +pm_missing_node_create(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { + pm_missing_node_t *node = PM_NODE_ALLOC(parser, pm_missing_node_t); + + *node = (pm_missing_node_t) { + .base = PM_NODE_INIT(parser, PM_MISSING_NODE, 0, start, end) + }; + + return node; +} + +/** + * Allocate and initialize a new AliasGlobalVariableNode node. + */ +static pm_alias_global_variable_node_t * +pm_alias_global_variable_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_node_t *new_name, pm_node_t *old_name) { + assert(keyword->type == PM_TOKEN_KEYWORD_ALIAS); + pm_alias_global_variable_node_t *node = PM_NODE_ALLOC(parser, pm_alias_global_variable_node_t); + + *node = (pm_alias_global_variable_node_t) { + .base = PM_NODE_INIT_TOKEN_NODE(parser, PM_ALIAS_GLOBAL_VARIABLE_NODE, 0, keyword, old_name), + .new_name = new_name, + .old_name = old_name, + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword) + }; + + return node; +} + +/** + * Allocate and initialize a new AliasMethodNode node. 
+ */ +static pm_alias_method_node_t * +pm_alias_method_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_node_t *new_name, pm_node_t *old_name) { + assert(keyword->type == PM_TOKEN_KEYWORD_ALIAS); + pm_alias_method_node_t *node = PM_NODE_ALLOC(parser, pm_alias_method_node_t); + + *node = (pm_alias_method_node_t) { + .base = PM_NODE_INIT_TOKEN_NODE(parser, PM_ALIAS_METHOD_NODE, 0, keyword, old_name), + .new_name = new_name, + .old_name = old_name, + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword) + }; + + return node; +} + +/** + * Allocate a new AlternationPatternNode node. + */ +static pm_alternation_pattern_node_t * +pm_alternation_pattern_node_create(pm_parser_t *parser, pm_node_t *left, pm_node_t *right, const pm_token_t *operator) { + pm_alternation_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_alternation_pattern_node_t); + + *node = (pm_alternation_pattern_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_ALTERNATION_PATTERN_NODE, 0, left, right), + .left = left, + .right = right, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) + }; + + return node; +} + +/** + * Allocate and initialize a new and node. + */ +static pm_and_node_t * +pm_and_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *operator, pm_node_t *right) { + pm_assert_value_expression(parser, left); + + pm_and_node_t *node = PM_NODE_ALLOC(parser, pm_and_node_t); + + *node = (pm_and_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_AND_NODE, 0, left, right), + .left = left, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .right = right + }; + + return node; +} + +/** + * Allocate an initialize a new arguments node. 
+ */ +static pm_arguments_node_t * +pm_arguments_node_create(pm_parser_t *parser) { + pm_arguments_node_t *node = PM_NODE_ALLOC(parser, pm_arguments_node_t); + + *node = (pm_arguments_node_t) { + .base = PM_NODE_INIT_BASE(parser, PM_ARGUMENTS_NODE, 0), + .arguments = { 0 } + }; + + return node; +} + +/** + * Return the size of the given arguments node. + */ +static size_t +pm_arguments_node_size(pm_arguments_node_t *node) { + return node->arguments.size; +} + +/** + * Append an argument to an arguments node. + */ +static void +pm_arguments_node_arguments_append(pm_arguments_node_t *node, pm_node_t *argument) { + if (pm_arguments_node_size(node) == 0) { + node->base.location.start = argument->location.start; + } + + if (node->base.location.end < argument->location.end) { + node->base.location.end = argument->location.end; + } + + pm_node_list_append(&node->arguments, argument); + + if (PM_NODE_TYPE_P(argument, PM_SPLAT_NODE)) { + if (PM_NODE_FLAG_P(node, PM_ARGUMENTS_NODE_FLAGS_CONTAINS_SPLAT)) { + pm_node_flag_set(UP(node), PM_ARGUMENTS_NODE_FLAGS_CONTAINS_MULTIPLE_SPLATS); + } else { + pm_node_flag_set(UP(node), PM_ARGUMENTS_NODE_FLAGS_CONTAINS_SPLAT); + } + } +} + +/** + * Allocate and initialize a new ArrayNode node. + */ +static pm_array_node_t * +pm_array_node_create(pm_parser_t *parser, const pm_token_t *opening) { + pm_array_node_t *node = PM_NODE_ALLOC(parser, pm_array_node_t); + + *node = (pm_array_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_ARRAY_NODE, PM_NODE_FLAG_STATIC_LITERAL, opening), + .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), + .elements = { 0 } + }; + + return node; +} + +/** + * Append an argument to an array node. 
+ */ +static inline void +pm_array_node_elements_append(pm_array_node_t *node, pm_node_t *element) { + if (!node->elements.size && !node->opening_loc.start) { + node->base.location.start = element->location.start; + } + + pm_node_list_append(&node->elements, element); + node->base.location.end = element->location.end; + + // If the element is not a static literal, then the array is not a static + // literal. Turn that flag off. + if (PM_NODE_TYPE_P(element, PM_ARRAY_NODE) || PM_NODE_TYPE_P(element, PM_HASH_NODE) || PM_NODE_TYPE_P(element, PM_RANGE_NODE) || !PM_NODE_FLAG_P(element, PM_NODE_FLAG_STATIC_LITERAL)) { + pm_node_flag_unset(UP(node), PM_NODE_FLAG_STATIC_LITERAL); + } + + if (PM_NODE_TYPE_P(element, PM_SPLAT_NODE)) { + pm_node_flag_set(UP(node), PM_ARRAY_NODE_FLAGS_CONTAINS_SPLAT); + } +} + +/** + * Set the closing token and end location of an array node. + */ +static void +pm_array_node_close_set(pm_array_node_t *node, const pm_token_t *closing) { + assert(closing->type == PM_TOKEN_BRACKET_RIGHT || closing->type == PM_TOKEN_STRING_END || closing->type == PM_TOKEN_MISSING || closing->type == PM_TOKEN_NOT_PROVIDED); + node->base.location.end = closing->end; + node->closing_loc = PM_LOCATION_TOKEN_VALUE(closing); +} + +/** + * Allocate and initialize a new array pattern node. The node list given in the + * nodes parameter is guaranteed to have at least two nodes. + */ +static pm_array_pattern_node_t * +pm_array_pattern_node_node_list_create(pm_parser_t *parser, pm_node_list_t *nodes) { + pm_array_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_array_pattern_node_t); + + *node = (pm_array_pattern_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_ARRAY_PATTERN_NODE, 0, nodes->nodes[0], nodes->nodes[nodes->size - 1]), + .constant = NULL, + .rest = NULL, + .requireds = { 0 }, + .posts = { 0 }, + .opening_loc = { 0 }, + .closing_loc = { 0 } + }; + + // For now we're going to just copy over each pointer manually. 
This could be + // much more efficient, as we could instead resize the node list. + bool found_rest = false; + pm_node_t *child; + + PM_NODE_LIST_FOREACH(nodes, index, child) { + if (!found_rest && (PM_NODE_TYPE_P(child, PM_SPLAT_NODE) || PM_NODE_TYPE_P(child, PM_IMPLICIT_REST_NODE))) { + node->rest = child; + found_rest = true; + } else if (found_rest) { + pm_node_list_append(&node->posts, child); + } else { + pm_node_list_append(&node->requireds, child); + } + } + + return node; +} + +/** + * Allocate and initialize a new array pattern node from a single rest node. + */ +static pm_array_pattern_node_t * +pm_array_pattern_node_rest_create(pm_parser_t *parser, pm_node_t *rest) { + pm_array_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_array_pattern_node_t); + + *node = (pm_array_pattern_node_t) { + .base = PM_NODE_INIT_NODE(parser, PM_ARRAY_PATTERN_NODE, 0, rest), + .constant = NULL, + .rest = rest, + .requireds = { 0 }, + .posts = { 0 }, + .opening_loc = { 0 }, + .closing_loc = { 0 } + }; + + return node; +} + +/** + * Allocate and initialize a new array pattern node from a constant and opening + * and closing tokens. + */ +static pm_array_pattern_node_t * +pm_array_pattern_node_constant_create(pm_parser_t *parser, pm_node_t *constant, const pm_token_t *opening, const pm_token_t *closing) { + pm_array_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_array_pattern_node_t); + + *node = (pm_array_pattern_node_t) { + .base = PM_NODE_INIT_NODE_TOKEN(parser, PM_ARRAY_PATTERN_NODE, 0, constant, closing), + .constant = constant, + .rest = NULL, + .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), + .requireds = { 0 }, + .posts = { 0 } + }; + + return node; +} + +/** + * Allocate and initialize a new array pattern node from an opening and closing + * token. 
+ */ +static pm_array_pattern_node_t * +pm_array_pattern_node_empty_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *closing) { + pm_array_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_array_pattern_node_t); + + *node = (pm_array_pattern_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_ARRAY_PATTERN_NODE, 0, opening, closing), + .constant = NULL, + .rest = NULL, + .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), + .requireds = { 0 }, + .posts = { 0 } + }; + + return node; +} + +static inline void +pm_array_pattern_node_requireds_append(pm_array_pattern_node_t *node, pm_node_t *inner) { + pm_node_list_append(&node->requireds, inner); +} + +/** + * Allocate and initialize a new assoc node. + */ +static pm_assoc_node_t * +pm_assoc_node_create(pm_parser_t *parser, pm_node_t *key, const pm_token_t *operator, pm_node_t *value) { + pm_assoc_node_t *node = PM_NODE_ALLOC(parser, pm_assoc_node_t); + const uint8_t *end; + + if (value != NULL && value->location.end > key->location.end) { + end = value->location.end; + } else if (operator->type != PM_TOKEN_NOT_PROVIDED) { + end = operator->end; + } else { + end = key->location.end; + } + + // Hash string keys will be frozen, so we can mark them as frozen here so + // that the compiler picks them up and also when we check for static literal + // on the keys it gets factored in. + if (PM_NODE_TYPE_P(key, PM_STRING_NODE)) { + key->flags |= PM_STRING_FLAGS_FROZEN | PM_NODE_FLAG_STATIC_LITERAL; + } + + // If the key and value of this assoc node are both static literals, then + // we can mark this node as a static literal. 
+ pm_node_flags_t flags = 0; + if ( + !PM_NODE_TYPE_P(key, PM_ARRAY_NODE) && !PM_NODE_TYPE_P(key, PM_HASH_NODE) && !PM_NODE_TYPE_P(key, PM_RANGE_NODE) && + value && !PM_NODE_TYPE_P(value, PM_ARRAY_NODE) && !PM_NODE_TYPE_P(value, PM_HASH_NODE) && !PM_NODE_TYPE_P(value, PM_RANGE_NODE) + ) { + flags = key->flags & value->flags & PM_NODE_FLAG_STATIC_LITERAL; + } + + *node = (pm_assoc_node_t) { + .base = PM_NODE_INIT(parser, PM_ASSOC_NODE, flags, key->location.start, end), + .key = key, + .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new assoc splat node. + */ +static pm_assoc_splat_node_t * +pm_assoc_splat_node_create(pm_parser_t *parser, pm_node_t *value, const pm_token_t *operator) { + assert(operator->type == PM_TOKEN_USTAR_STAR); + pm_assoc_splat_node_t *node = PM_NODE_ALLOC(parser, pm_assoc_splat_node_t); + + *node = (pm_assoc_splat_node_t) { + .base = ( + (value == NULL) + ? PM_NODE_INIT_TOKEN(parser, PM_ASSOC_SPLAT_NODE, 0, operator) + : PM_NODE_INIT_TOKEN_NODE(parser, PM_ASSOC_SPLAT_NODE, 0, operator, value) + ), + .value = value, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) + }; + + return node; +} + +/** + * Allocate a new BackReferenceReadNode node. + */ +static pm_back_reference_read_node_t * +pm_back_reference_read_node_create(pm_parser_t *parser, const pm_token_t *name) { + assert(name->type == PM_TOKEN_BACK_REFERENCE); + pm_back_reference_read_node_t *node = PM_NODE_ALLOC(parser, pm_back_reference_read_node_t); + + *node = (pm_back_reference_read_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_BACK_REFERENCE_READ_NODE, 0, name), + .name = pm_parser_constant_id_token(parser, name) + }; + + return node; +} + +/** + * Allocate and initialize new a begin node. 
+ */ +static pm_begin_node_t * +pm_begin_node_create(pm_parser_t *parser, const pm_token_t *begin_keyword, pm_statements_node_t *statements) { + pm_begin_node_t *node = PM_NODE_ALLOC(parser, pm_begin_node_t); + + *node = (pm_begin_node_t) { + .base = ( + (statements == NULL) + ? PM_NODE_INIT_TOKEN(parser, PM_BEGIN_NODE, 0, begin_keyword) + : PM_NODE_INIT_TOKEN_NODE(parser, PM_BEGIN_NODE, 0, begin_keyword, statements) + ), + .begin_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(begin_keyword), + .statements = statements, + .end_keyword_loc = { 0 } + }; + + return node; +} + +/** + * Set the rescue clause, optionally start, and end location of a begin node. + */ +static void +pm_begin_node_rescue_clause_set(pm_begin_node_t *node, pm_rescue_node_t *rescue_clause) { + // If the begin keyword doesn't exist, we set the start on the begin_node + if (!node->begin_keyword_loc.start) { + node->base.location.start = rescue_clause->base.location.start; + } + node->base.location.end = rescue_clause->base.location.end; + node->rescue_clause = rescue_clause; +} + +/** + * Set the else clause and end location of a begin node. + */ +static void +pm_begin_node_else_clause_set(pm_begin_node_t *node, pm_else_node_t *else_clause) { + node->base.location.end = else_clause->base.location.end; + node->else_clause = else_clause; +} + +/** + * Set the ensure clause and end location of a begin node. + */ +static void +pm_begin_node_ensure_clause_set(pm_begin_node_t *node, pm_ensure_node_t *ensure_clause) { + node->base.location.end = ensure_clause->base.location.end; + node->ensure_clause = ensure_clause; +} + +/** + * Set the end keyword and end location of a begin node. 
+ */ +static void +pm_begin_node_end_keyword_set(pm_begin_node_t *node, const pm_token_t *end_keyword) { + assert(end_keyword->type == PM_TOKEN_KEYWORD_END || end_keyword->type == PM_TOKEN_MISSING); + + node->base.location.end = end_keyword->end; + node->end_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(end_keyword); +} + +/** + * Allocate and initialize a new BlockArgumentNode node. + */ +static pm_block_argument_node_t * +pm_block_argument_node_create(pm_parser_t *parser, const pm_token_t *operator, pm_node_t *expression) { + pm_block_argument_node_t *node = PM_NODE_ALLOC(parser, pm_block_argument_node_t); + + *node = (pm_block_argument_node_t) { + .base = ( + (expression == NULL) + ? PM_NODE_INIT_TOKEN(parser, PM_BLOCK_ARGUMENT_NODE, 0, operator) + : PM_NODE_INIT_TOKEN_NODE(parser, PM_BLOCK_ARGUMENT_NODE, 0, operator, expression) + ), + .expression = expression, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) + }; + + return node; +} + +/** + * Allocate and initialize a new BlockNode node. + */ +static pm_block_node_t * +pm_block_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, const pm_token_t *opening, pm_node_t *parameters, pm_node_t *body, const pm_token_t *closing) { + pm_block_node_t *node = PM_NODE_ALLOC(parser, pm_block_node_t); + + *node = (pm_block_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_BLOCK_NODE, 0, opening, closing), + .locals = *locals, + .parameters = parameters, + .body = body, + .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_LOCATION_TOKEN_VALUE(closing) + }; + + return node; +} + +/** + * Allocate and initialize a new BlockParameterNode node. 
+ */ +static pm_block_parameter_node_t * +pm_block_parameter_node_create(pm_parser_t *parser, const pm_token_t *name, const pm_token_t *operator) { + assert(operator->type == PM_TOKEN_NOT_PROVIDED || operator->type == PM_TOKEN_UAMPERSAND || operator->type == PM_TOKEN_AMPERSAND); + pm_block_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_block_parameter_node_t); + + *node = (pm_block_parameter_node_t) { + .base = ( + (name->type == PM_TOKEN_NOT_PROVIDED) + ? PM_NODE_INIT_TOKEN(parser, PM_BLOCK_PARAMETER_NODE, 0, operator) + : PM_NODE_INIT_TOKENS(parser, PM_BLOCK_PARAMETER_NODE, 0, operator, name) + ), + .name = pm_parser_optional_constant_id_token(parser, name), + .name_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(name), + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) + }; + + return node; +} + +/** + * Allocate and initialize a new BlockParametersNode node. + */ +static pm_block_parameters_node_t * +pm_block_parameters_node_create(pm_parser_t *parser, pm_parameters_node_t *parameters, const pm_token_t *opening) { + pm_block_parameters_node_t *node = PM_NODE_ALLOC(parser, pm_block_parameters_node_t); + + const uint8_t *start; + if (opening->type != PM_TOKEN_NOT_PROVIDED) { + start = opening->start; + } else if (parameters != NULL) { + start = parameters->base.location.start; + } else { + start = NULL; + } + + const uint8_t *end; + if (parameters != NULL) { + end = parameters->base.location.end; + } else if (opening->type != PM_TOKEN_NOT_PROVIDED) { + end = opening->end; + } else { + end = NULL; + } + + *node = (pm_block_parameters_node_t) { + .base = PM_NODE_INIT(parser, PM_BLOCK_PARAMETERS_NODE, 0, start, end), + .parameters = parameters, + .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), + .closing_loc = { 0 }, + .locals = { 0 } + }; + + return node; +} + +/** + * Set the closing location of a BlockParametersNode node. 
+ */ +static void +pm_block_parameters_node_closing_set(pm_block_parameters_node_t *node, const pm_token_t *closing) { + assert(closing->type == PM_TOKEN_PIPE || closing->type == PM_TOKEN_PARENTHESIS_RIGHT || closing->type == PM_TOKEN_MISSING); + + node->base.location.end = closing->end; + node->closing_loc = PM_LOCATION_TOKEN_VALUE(closing); +} + +/** + * Allocate and initialize a new BlockLocalVariableNode node. + */ +static pm_block_local_variable_node_t * +pm_block_local_variable_node_create(pm_parser_t *parser, const pm_token_t *name) { + pm_block_local_variable_node_t *node = PM_NODE_ALLOC(parser, pm_block_local_variable_node_t); + + *node = (pm_block_local_variable_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_BLOCK_LOCAL_VARIABLE_NODE, 0, name), + .name = pm_parser_constant_id_token(parser, name) + }; + + return node; +} + +/** + * Append a new block-local variable to a BlockParametersNode node. + */ +static void +pm_block_parameters_node_append_local(pm_block_parameters_node_t *node, const pm_block_local_variable_node_t *local) { + pm_node_list_append(&node->locals, UP(local)); + + if (node->base.location.start == NULL) node->base.location.start = local->base.location.start; + node->base.location.end = local->base.location.end; +} + +/** + * Allocate and initialize a new BreakNode node. + */ +static pm_break_node_t * +pm_break_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_arguments_node_t *arguments) { + assert(keyword->type == PM_TOKEN_KEYWORD_BREAK); + pm_break_node_t *node = PM_NODE_ALLOC(parser, pm_break_node_t); + + *node = (pm_break_node_t) { + .base = ( + (arguments == NULL) + ? 
PM_NODE_INIT_TOKEN(parser, PM_BREAK_NODE, 0, keyword) + : PM_NODE_INIT_TOKEN_NODE(parser, PM_BREAK_NODE, 0, keyword, arguments) + ), + .arguments = arguments, + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword) + }; + + return node; +} + +// There are certain flags that we want to use internally but don't want to +// expose because they are not relevant beyond parsing. Therefore we'll define +// them here and not define them in config.yml/a header file. +static const pm_node_flags_t PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY = (1 << 2); + +static const pm_node_flags_t PM_CALL_NODE_FLAGS_IMPLICIT_ARRAY = ((PM_CALL_NODE_FLAGS_LAST - 1) << 1); +static const pm_node_flags_t PM_CALL_NODE_FLAGS_COMPARISON = ((PM_CALL_NODE_FLAGS_LAST - 1) << 2); +static const pm_node_flags_t PM_CALL_NODE_FLAGS_INDEX = ((PM_CALL_NODE_FLAGS_LAST - 1) << 3); + +/** + * Allocate and initialize a new CallNode node. This sets everything to NULL or + * PM_TOKEN_NOT_PROVIDED as appropriate such that its values can be overridden + * in the various specializations of this function. + */ +static pm_call_node_t * +pm_call_node_create(pm_parser_t *parser, pm_node_flags_t flags) { + pm_call_node_t *node = PM_NODE_ALLOC(parser, pm_call_node_t); + + *node = (pm_call_node_t) { + .base = PM_NODE_INIT_BASE(parser, PM_CALL_NODE, flags), + .receiver = NULL, + .call_operator_loc = { 0 }, + .message_loc = { 0 }, + .opening_loc = { 0 }, + .arguments = NULL, + .closing_loc = { 0 }, + .equal_loc = { 0 }, + .block = NULL, + .name = 0 + }; + + return node; +} + +/** + * Returns the value that the ignore visibility flag should be set to for the + * given receiver. + */ +static inline pm_node_flags_t +pm_call_node_ignore_visibility_flag(const pm_node_t *receiver) { + return PM_NODE_TYPE_P(receiver, PM_SELF_NODE) ? PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY : 0; +} + +/** + * Allocate and initialize a new CallNode node from an aref or an aset + * expression. 
+ */ +static pm_call_node_t * +pm_call_node_aref_create(pm_parser_t *parser, pm_node_t *receiver, pm_arguments_t *arguments) { + pm_assert_value_expression(parser, receiver); + + pm_node_flags_t flags = pm_call_node_ignore_visibility_flag(receiver); + if (arguments->block == NULL || PM_NODE_TYPE_P(arguments->block, PM_BLOCK_ARGUMENT_NODE)) { + flags |= PM_CALL_NODE_FLAGS_INDEX; + } + + pm_call_node_t *node = pm_call_node_create(parser, flags); + + node->base.location.start = receiver->location.start; + node->base.location.end = pm_arguments_end(arguments); + + node->receiver = receiver; + node->message_loc.start = arguments->opening_loc.start; + node->message_loc.end = arguments->closing_loc.end; + + node->opening_loc = arguments->opening_loc; + node->arguments = arguments->arguments; + node->closing_loc = arguments->closing_loc; + node->block = arguments->block; + + node->name = pm_parser_constant_id_constant(parser, "[]", 2); + return node; +} + +/** + * Allocate and initialize a new CallNode node from a binary expression. 
+ */ +static pm_call_node_t * +pm_call_node_binary_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *operator, pm_node_t *argument, pm_node_flags_t flags) { + pm_assert_value_expression(parser, receiver); + pm_assert_value_expression(parser, argument); + + pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver) | flags); + + node->base.location.start = MIN(receiver->location.start, argument->location.start); + node->base.location.end = MAX(receiver->location.end, argument->location.end); + + node->receiver = receiver; + node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator); + + pm_arguments_node_t *arguments = pm_arguments_node_create(parser); + pm_arguments_node_arguments_append(arguments, argument); + node->arguments = arguments; + + node->name = pm_parser_constant_id_token(parser, operator); + return node; +} + +static const uint8_t * parse_operator_symbol_name(const pm_token_t *); + +/** + * Allocate and initialize a new CallNode node from a call expression. 
+ */ +static pm_call_node_t * +pm_call_node_call_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *operator, pm_token_t *message, pm_arguments_t *arguments) { + pm_assert_value_expression(parser, receiver); + + pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver)); + + node->base.location.start = receiver->location.start; + const uint8_t *end = pm_arguments_end(arguments); + if (end == NULL) { + end = message->end; + } + node->base.location.end = end; + + node->receiver = receiver; + node->call_operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator); + node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(message); + node->opening_loc = arguments->opening_loc; + node->arguments = arguments->arguments; + node->closing_loc = arguments->closing_loc; + node->block = arguments->block; + + if (operator->type == PM_TOKEN_AMPERSAND_DOT) { + pm_node_flag_set(UP(node), PM_CALL_NODE_FLAGS_SAFE_NAVIGATION); + } + + /** + * If the final character is `@` as is the case for `foo.~@`, + * we should ignore the @ in the same way we do for symbols. + */ + node->name = pm_parser_constant_id_location(parser, message->start, parse_operator_symbol_name(message)); + return node; +} + +/** + * Allocate and initialize a new synthesized CallNode node from a call expression. 
+ */ +static pm_call_node_t * +pm_call_node_call_synthesized_create(pm_parser_t *parser, pm_node_t *receiver, const char *message, pm_arguments_node_t *arguments) { + pm_call_node_t *node = pm_call_node_create(parser, 0); + node->base.location.start = parser->start; + node->base.location.end = parser->end; + + node->receiver = receiver; + node->call_operator_loc = (pm_location_t) { .start = NULL, .end = NULL }; + node->message_loc = (pm_location_t) { .start = NULL, .end = NULL }; + node->arguments = arguments; + + node->name = pm_parser_constant_id_constant(parser, message, strlen(message)); + return node; +} + +/** + * Allocate and initialize a new CallNode node from a call to a method name + * without a receiver that could not have been a local variable read. + */ +static pm_call_node_t * +pm_call_node_fcall_create(pm_parser_t *parser, pm_token_t *message, pm_arguments_t *arguments) { + pm_call_node_t *node = pm_call_node_create(parser, PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY); + + node->base.location.start = message->start; + node->base.location.end = pm_arguments_end(arguments); + + node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(message); + node->opening_loc = arguments->opening_loc; + node->arguments = arguments->arguments; + node->closing_loc = arguments->closing_loc; + node->block = arguments->block; + + node->name = pm_parser_constant_id_token(parser, message); + return node; +} + +/** + * Allocate and initialize a new CallNode node from a synthesized call to a + * method name with the given arguments. + */ +static pm_call_node_t * +pm_call_node_fcall_synthesized_create(pm_parser_t *parser, pm_arguments_node_t *arguments, pm_constant_id_t name) { + pm_call_node_t *node = pm_call_node_create(parser, PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY); + + node->base.location = PM_LOCATION_NULL_VALUE(parser); + node->arguments = arguments; + + node->name = name; + return node; +} + +/** + * Allocate and initialize a new CallNode node from a not expression. 
+ */ +static pm_call_node_t * +pm_call_node_not_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *message, pm_arguments_t *arguments) { + pm_assert_value_expression(parser, receiver); + if (receiver != NULL) pm_conditional_predicate(parser, receiver, PM_CONDITIONAL_PREDICATE_TYPE_NOT); + + pm_call_node_t *node = pm_call_node_create(parser, receiver == NULL ? 0 : pm_call_node_ignore_visibility_flag(receiver)); + + node->base.location.start = message->start; + if (arguments->closing_loc.start != NULL) { + node->base.location.end = arguments->closing_loc.end; + } else { + assert(receiver != NULL); + node->base.location.end = receiver->location.end; + } + + node->receiver = receiver; + node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(message); + node->opening_loc = arguments->opening_loc; + node->arguments = arguments->arguments; + node->closing_loc = arguments->closing_loc; + + node->name = pm_parser_constant_id_constant(parser, "!", 1); + return node; +} + +/** + * Allocate and initialize a new CallNode node from a call shorthand expression. 
+ */ +static pm_call_node_t * +pm_call_node_shorthand_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *operator, pm_arguments_t *arguments) { + pm_assert_value_expression(parser, receiver); + + pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver)); + + node->base.location.start = receiver->location.start; + node->base.location.end = pm_arguments_end(arguments); + + node->receiver = receiver; + node->call_operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator); + node->opening_loc = arguments->opening_loc; + node->arguments = arguments->arguments; + node->closing_loc = arguments->closing_loc; + node->block = arguments->block; + + if (operator->type == PM_TOKEN_AMPERSAND_DOT) { + pm_node_flag_set(UP(node), PM_CALL_NODE_FLAGS_SAFE_NAVIGATION); + } + + node->name = pm_parser_constant_id_constant(parser, "call", 4); + return node; +} + +/** + * Allocate and initialize a new CallNode node from a unary operator expression. + */ +static pm_call_node_t * +pm_call_node_unary_create(pm_parser_t *parser, pm_token_t *operator, pm_node_t *receiver, const char *name) { + pm_assert_value_expression(parser, receiver); + + pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver)); + + node->base.location.start = operator->start; + node->base.location.end = receiver->location.end; + + node->receiver = receiver; + node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator); + + node->name = pm_parser_constant_id_constant(parser, name, strlen(name)); + return node; +} + +/** + * Allocate and initialize a new CallNode node from a call to a method name + * without a receiver that could also have been a local variable read. 
+ */ +static pm_call_node_t * +pm_call_node_variable_call_create(pm_parser_t *parser, pm_token_t *message) { + pm_call_node_t *node = pm_call_node_create(parser, PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY); + + node->base.location = PM_LOCATION_TOKEN_VALUE(message); + node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(message); + + node->name = pm_parser_constant_id_token(parser, message); + return node; +} + +/** + * Returns whether or not this call can be used on the left-hand side of an + * operator assignment. + */ +static inline bool +pm_call_node_writable_p(const pm_parser_t *parser, const pm_call_node_t *node) { + return ( + (node->message_loc.start != NULL) && + (node->message_loc.end[-1] != '!') && + (node->message_loc.end[-1] != '?') && + char_is_identifier_start(parser, node->message_loc.start, parser->end - node->message_loc.start) && + (node->opening_loc.start == NULL) && + (node->arguments == NULL) && + (node->block == NULL) + ); +} + +/** + * Initialize the read name by reading the write name and chopping off the '='. + */ +static void +pm_call_write_read_name_init(pm_parser_t *parser, pm_constant_id_t *read_name, pm_constant_id_t *write_name) { + pm_constant_t *write_constant = pm_constant_pool_id_to_constant(&parser->constant_pool, *write_name); + + if (write_constant->length > 0) { + size_t length = write_constant->length - 1; + + void *memory = xmalloc(length); + memcpy(memory, write_constant->start, length); + + *read_name = pm_constant_pool_insert_owned(&parser->constant_pool, (uint8_t *) memory, length); + } else { + // We can get here if the message was missing because of a syntax error. + *read_name = pm_parser_constant_id_constant(parser, "", 0); + } +} + +/** + * Allocate and initialize a new CallAndWriteNode node. 
+ */ +static pm_call_and_write_node_t * +pm_call_and_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(target->block == NULL); + assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); + pm_call_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_call_and_write_node_t); + + *node = (pm_call_and_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CALL_AND_WRITE_NODE, FL(target), target, value), + .receiver = target->receiver, + .call_operator_loc = target->call_operator_loc, + .message_loc = target->message_loc, + .read_name = 0, + .write_name = target->name, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + pm_call_write_read_name_init(parser, &node->read_name, &node->write_name); + + // Here we're going to free the target, since it is no longer necessary. + // However, we don't want to call `pm_node_destroy` because we want to keep + // around all of its children since we just reused them. + xfree(target); + + return node; +} + +/** + * Validate that index expressions do not have keywords or blocks if we are + * parsing as Ruby 3.4+. + */ +static void +pm_index_arguments_check(pm_parser_t *parser, const pm_arguments_node_t *arguments, const pm_node_t *block) { + if (parser->version >= PM_OPTIONS_VERSION_CRUBY_3_4) { + if (arguments != NULL && PM_NODE_FLAG_P(arguments, PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS)) { + pm_node_t *node; + PM_NODE_LIST_FOREACH(&arguments->arguments, index, node) { + if (PM_NODE_TYPE_P(node, PM_KEYWORD_HASH_NODE)) { + pm_parser_err_node(parser, node, PM_ERR_UNEXPECTED_INDEX_KEYWORDS); + break; + } + } + } + + if (block != NULL) { + pm_parser_err_node(parser, block, PM_ERR_UNEXPECTED_INDEX_BLOCK); + } + } +} + +/** + * Allocate and initialize a new IndexAndWriteNode node. 
+ */ +static pm_index_and_write_node_t * +pm_index_and_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); + pm_index_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_index_and_write_node_t); + + pm_index_arguments_check(parser, target->arguments, target->block); + + assert(!target->block || PM_NODE_TYPE_P(target->block, PM_BLOCK_ARGUMENT_NODE)); + *node = (pm_index_and_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_INDEX_AND_WRITE_NODE, FL(target), target, value), + .receiver = target->receiver, + .call_operator_loc = target->call_operator_loc, + .opening_loc = target->opening_loc, + .arguments = target->arguments, + .closing_loc = target->closing_loc, + .block = (pm_block_argument_node_t *) target->block, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + // Here we're going to free the target, since it is no longer necessary. + // However, we don't want to call `pm_node_destroy` because we want to keep + // around all of its children since we just reused them. + xfree(target); + + return node; +} + +/** + * Allocate a new CallOperatorWriteNode node. 
+ */ +static pm_call_operator_write_node_t * +pm_call_operator_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(target->block == NULL); + pm_call_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_call_operator_write_node_t); + + *node = (pm_call_operator_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CALL_OPERATOR_WRITE_NODE, FL(target), target, value), + .receiver = target->receiver, + .call_operator_loc = target->call_operator_loc, + .message_loc = target->message_loc, + .read_name = 0, + .write_name = target->name, + .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1), + .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + pm_call_write_read_name_init(parser, &node->read_name, &node->write_name); + + // Here we're going to free the target, since it is no longer necessary. + // However, we don't want to call `pm_node_destroy` because we want to keep + // around all of its children since we just reused them. + xfree(target); + + return node; +} + +/** + * Allocate a new IndexOperatorWriteNode node. 
+ */ +static pm_index_operator_write_node_t * +pm_index_operator_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const pm_token_t *operator, pm_node_t *value) { + pm_index_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_index_operator_write_node_t); + + pm_index_arguments_check(parser, target->arguments, target->block); + + assert(!target->block || PM_NODE_TYPE_P(target->block, PM_BLOCK_ARGUMENT_NODE)); + *node = (pm_index_operator_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_INDEX_OPERATOR_WRITE_NODE, FL(target), target, value), + .receiver = target->receiver, + .call_operator_loc = target->call_operator_loc, + .opening_loc = target->opening_loc, + .arguments = target->arguments, + .closing_loc = target->closing_loc, + .block = (pm_block_argument_node_t *) target->block, + .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1), + .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + // Here we're going to free the target, since it is no longer necessary. + // However, we don't want to call `pm_node_destroy` because we want to keep + // around all of its children since we just reused them. + xfree(target); + + return node; +} + +/** + * Allocate and initialize a new CallOrWriteNode node. 
+ */ +static pm_call_or_write_node_t * +pm_call_or_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(target->block == NULL); + assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); + pm_call_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_call_or_write_node_t); + + *node = (pm_call_or_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CALL_OR_WRITE_NODE, FL(target), target, value), + .receiver = target->receiver, + .call_operator_loc = target->call_operator_loc, + .message_loc = target->message_loc, + .read_name = 0, + .write_name = target->name, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + pm_call_write_read_name_init(parser, &node->read_name, &node->write_name); + + // Here we're going to free the target, since it is no longer necessary. + // However, we don't want to call `pm_node_destroy` because we want to keep + // around all of its children since we just reused them. + xfree(target); + + return node; +} + +/** + * Allocate and initialize a new IndexOrWriteNode node. 
+ */ +static pm_index_or_write_node_t * +pm_index_or_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); + pm_index_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_index_or_write_node_t); + + pm_index_arguments_check(parser, target->arguments, target->block); + + assert(!target->block || PM_NODE_TYPE_P(target->block, PM_BLOCK_ARGUMENT_NODE)); + *node = (pm_index_or_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_INDEX_OR_WRITE_NODE, FL(target), target, value), + .receiver = target->receiver, + .call_operator_loc = target->call_operator_loc, + .opening_loc = target->opening_loc, + .arguments = target->arguments, + .closing_loc = target->closing_loc, + .block = (pm_block_argument_node_t *) target->block, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + // Here we're going to free the target, since it is no longer necessary. + // However, we don't want to call `pm_node_destroy` because we want to keep + // around all of its children since we just reused them. + xfree(target); + + return node; +} + +/** + * Allocate and initialize a new CallTargetNode node from an existing call + * node. + */ +static pm_call_target_node_t * +pm_call_target_node_create(pm_parser_t *parser, pm_call_node_t *target) { + pm_call_target_node_t *node = PM_NODE_ALLOC(parser, pm_call_target_node_t); + + *node = (pm_call_target_node_t) { + .base = PM_NODE_INIT_NODE(parser, PM_CALL_TARGET_NODE, FL(target), target), + .receiver = target->receiver, + .call_operator_loc = target->call_operator_loc, + .name = target->name, + .message_loc = target->message_loc + }; + + /* It is possible to get here where we have parsed an invalid syntax tree + * where the call operator was not present. In that case we will have a + * problem because it is a required location. 
In this case we need to fill + * it in with a fake location so that the syntax tree remains valid. */ + if (node->call_operator_loc.start == NULL) { + node->call_operator_loc = (pm_location_t) { + .start = target->base.location.start, + .end = target->base.location.start + }; + } + + // Here we're going to free the target, since it is no longer necessary. + // However, we don't want to call `pm_node_destroy` because we want to keep + // around all of its children since we just reused them. + xfree(target); + + return node; +} + +/** + * Allocate and initialize a new IndexTargetNode node from an existing call + * node. + */ +static pm_index_target_node_t * +pm_index_target_node_create(pm_parser_t *parser, pm_call_node_t *target) { + pm_index_target_node_t *node = PM_NODE_ALLOC(parser, pm_index_target_node_t); + + pm_index_arguments_check(parser, target->arguments, target->block); + assert(!target->block || PM_NODE_TYPE_P(target->block, PM_BLOCK_ARGUMENT_NODE)); + + *node = (pm_index_target_node_t) { + .base = PM_NODE_INIT_NODE(parser, PM_INDEX_TARGET_NODE, FL(target) | PM_CALL_NODE_FLAGS_ATTRIBUTE_WRITE, target), + .receiver = target->receiver, + .opening_loc = target->opening_loc, + .arguments = target->arguments, + .closing_loc = target->closing_loc, + .block = (pm_block_argument_node_t *) target->block, + }; + + // Here we're going to free the target, since it is no longer necessary. + // However, we don't want to call `pm_node_destroy` because we want to keep + // around all of its children since we just reused them. + xfree(target); + + return node; +} + +/** + * Allocate and initialize a new CapturePatternNode node. 
+ */ +static pm_capture_pattern_node_t * +pm_capture_pattern_node_create(pm_parser_t *parser, pm_node_t *value, pm_local_variable_target_node_t *target, const pm_token_t *operator) { + pm_capture_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_capture_pattern_node_t); + + *node = (pm_capture_pattern_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CAPTURE_PATTERN_NODE, 0, value, target), + .value = value, + .target = target, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) + }; + + return node; +} + +/** + * Allocate and initialize a new CaseNode node. + */ +static pm_case_node_t * +pm_case_node_create(pm_parser_t *parser, const pm_token_t *case_keyword, pm_node_t *predicate, const pm_token_t *end_keyword) { + pm_case_node_t *node = PM_NODE_ALLOC(parser, pm_case_node_t); + + *node = (pm_case_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_CASE_NODE, 0, case_keyword, end_keyword), + .predicate = predicate, + .else_clause = NULL, + .case_keyword_loc = PM_LOCATION_TOKEN_VALUE(case_keyword), + .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword), + .conditions = { 0 } + }; + + return node; +} + +/** + * Append a new condition to a CaseNode node. + */ +static void +pm_case_node_condition_append(pm_case_node_t *node, pm_node_t *condition) { + assert(PM_NODE_TYPE_P(condition, PM_WHEN_NODE)); + + pm_node_list_append(&node->conditions, condition); + node->base.location.end = condition->location.end; +} + +/** + * Set the else clause of a CaseNode node. + */ +static void +pm_case_node_else_clause_set(pm_case_node_t *node, pm_else_node_t *else_clause) { + node->else_clause = else_clause; + node->base.location.end = else_clause->base.location.end; +} + +/** + * Set the end location for a CaseNode node. 
+ */ +static void +pm_case_node_end_keyword_loc_set(pm_case_node_t *node, const pm_token_t *end_keyword) { + node->base.location.end = end_keyword->end; + node->end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword); +} + +/** + * Allocate and initialize a new CaseMatchNode node. + */ +static pm_case_match_node_t * +pm_case_match_node_create(pm_parser_t *parser, const pm_token_t *case_keyword, pm_node_t *predicate, const pm_token_t *end_keyword) { + pm_case_match_node_t *node = PM_NODE_ALLOC(parser, pm_case_match_node_t); + + *node = (pm_case_match_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_CASE_MATCH_NODE, 0, case_keyword, end_keyword), + .predicate = predicate, + .else_clause = NULL, + .case_keyword_loc = PM_LOCATION_TOKEN_VALUE(case_keyword), + .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword), + .conditions = { 0 } + }; + + return node; +} + +/** + * Append a new condition to a CaseMatchNode node. + */ +static void +pm_case_match_node_condition_append(pm_case_match_node_t *node, pm_node_t *condition) { + assert(PM_NODE_TYPE_P(condition, PM_IN_NODE)); + + pm_node_list_append(&node->conditions, condition); + node->base.location.end = condition->location.end; +} + +/** + * Set the else clause of a CaseMatchNode node. + */ +static void +pm_case_match_node_else_clause_set(pm_case_match_node_t *node, pm_else_node_t *else_clause) { + node->else_clause = else_clause; + node->base.location.end = else_clause->base.location.end; +} + +/** + * Set the end location for a CaseMatchNode node. + */ +static void +pm_case_match_node_end_keyword_loc_set(pm_case_match_node_t *node, const pm_token_t *end_keyword) { + node->base.location.end = end_keyword->end; + node->end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword); +} + +/** + * Allocate a new ClassNode node. 
+ */ +static pm_class_node_t * +pm_class_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, const pm_token_t *class_keyword, pm_node_t *constant_path, const pm_token_t *name, const pm_token_t *inheritance_operator, pm_node_t *superclass, pm_node_t *body, const pm_token_t *end_keyword) { + pm_class_node_t *node = PM_NODE_ALLOC(parser, pm_class_node_t); + + *node = (pm_class_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_CLASS_NODE, 0, class_keyword, end_keyword), + .locals = *locals, + .class_keyword_loc = PM_LOCATION_TOKEN_VALUE(class_keyword), + .constant_path = constant_path, + .inheritance_operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(inheritance_operator), + .superclass = superclass, + .body = body, + .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword), + .name = pm_parser_constant_id_token(parser, name) + }; + + return node; +} + +/** + * Allocate and initialize a new ClassVariableAndWriteNode node. + */ +static pm_class_variable_and_write_node_t * +pm_class_variable_and_write_node_create(pm_parser_t *parser, pm_class_variable_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); + pm_class_variable_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_class_variable_and_write_node_t); + + *node = (pm_class_variable_and_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CLASS_VARIABLE_AND_WRITE_NODE, 0, target, value), + .name = target->name, + .name_loc = target->base.location, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new ClassVariableOperatorWriteNode node. 
+ */ +static pm_class_variable_operator_write_node_t * +pm_class_variable_operator_write_node_create(pm_parser_t *parser, pm_class_variable_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { + pm_class_variable_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_class_variable_operator_write_node_t); + + *node = (pm_class_variable_operator_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CLASS_VARIABLE_OPERATOR_WRITE_NODE, 0, target, value), + .name = target->name, + .name_loc = target->base.location, + .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value, + .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1) + }; + + return node; +} + +/** + * Allocate and initialize a new ClassVariableOrWriteNode node. + */ +static pm_class_variable_or_write_node_t * +pm_class_variable_or_write_node_create(pm_parser_t *parser, pm_class_variable_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); + pm_class_variable_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_class_variable_or_write_node_t); + + *node = (pm_class_variable_or_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CLASS_VARIABLE_OR_WRITE_NODE, 0, target, value), + .name = target->name, + .name_loc = target->base.location, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new ClassVariableReadNode node. 
+ */ +static pm_class_variable_read_node_t * +pm_class_variable_read_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_CLASS_VARIABLE); + pm_class_variable_read_node_t *node = PM_NODE_ALLOC(parser, pm_class_variable_read_node_t); + + *node = (pm_class_variable_read_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_CLASS_VARIABLE_READ_NODE, 0, token), + .name = pm_parser_constant_id_token(parser, token) + }; + + return node; +} + +/** + * True if the given node is an implicit array node on a write, as in: + * + * a = *b + * a = 1, 2, 3 + */ +static inline pm_node_flags_t +pm_implicit_array_write_flags(const pm_node_t *node, pm_node_flags_t flags) { + if (PM_NODE_TYPE_P(node, PM_ARRAY_NODE) && ((const pm_array_node_t *) node)->opening_loc.start == NULL) { + return flags; + } + return 0; +} + +/** + * Initialize a new ClassVariableWriteNode node from a ClassVariableRead node. + */ +static pm_class_variable_write_node_t * +pm_class_variable_write_node_create(pm_parser_t *parser, pm_class_variable_read_node_t *read_node, pm_token_t *operator, pm_node_t *value) { + pm_class_variable_write_node_t *node = PM_NODE_ALLOC(parser, pm_class_variable_write_node_t); + pm_node_flags_t flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY); + + *node = (pm_class_variable_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CLASS_VARIABLE_WRITE_NODE, flags, read_node, value), + .name = read_node->name, + .name_loc = PM_LOCATION_NODE_VALUE(UP(read_node)), + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new ConstantPathAndWriteNode node. 
+ */ +static pm_constant_path_and_write_node_t * +pm_constant_path_and_write_node_create(pm_parser_t *parser, pm_constant_path_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); + pm_constant_path_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_path_and_write_node_t); + + *node = (pm_constant_path_and_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CONSTANT_PATH_AND_WRITE_NODE, 0, target, value), + .target = target, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new ConstantPathOperatorWriteNode node. + */ +static pm_constant_path_operator_write_node_t * +pm_constant_path_operator_write_node_create(pm_parser_t *parser, pm_constant_path_node_t *target, const pm_token_t *operator, pm_node_t *value) { + pm_constant_path_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_path_operator_write_node_t); + + *node = (pm_constant_path_operator_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CONSTANT_PATH_OPERATOR_WRITE_NODE, 0, target, value), + .target = target, + .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value, + .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1) + }; + + return node; +} + +/** + * Allocate and initialize a new ConstantPathOrWriteNode node. 
+ */ +static pm_constant_path_or_write_node_t * +pm_constant_path_or_write_node_create(pm_parser_t *parser, pm_constant_path_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); + pm_constant_path_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_path_or_write_node_t); + + *node = (pm_constant_path_or_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CONSTANT_PATH_OR_WRITE_NODE, 0, target, value), + .target = target, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new ConstantPathNode node. + */ +static pm_constant_path_node_t * +pm_constant_path_node_create(pm_parser_t *parser, pm_node_t *parent, const pm_token_t *delimiter, const pm_token_t *name_token) { + pm_assert_value_expression(parser, parent); + pm_constant_path_node_t *node = PM_NODE_ALLOC(parser, pm_constant_path_node_t); + + pm_constant_id_t name = PM_CONSTANT_ID_UNSET; + if (name_token->type == PM_TOKEN_CONSTANT) { + name = pm_parser_constant_id_token(parser, name_token); + } + + if (parent == NULL) { + *node = (pm_constant_path_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_CONSTANT_PATH_NODE, 0, delimiter, name_token), + .parent = parent, + .name = name, + .delimiter_loc = PM_LOCATION_TOKEN_VALUE(delimiter), + .name_loc = PM_LOCATION_TOKEN_VALUE(name_token) + }; + } else { + *node = (pm_constant_path_node_t) { + .base = PM_NODE_INIT_NODE_TOKEN(parser, PM_CONSTANT_PATH_NODE, 0, parent, name_token), + .parent = parent, + .name = name, + .delimiter_loc = PM_LOCATION_TOKEN_VALUE(delimiter), + .name_loc = PM_LOCATION_TOKEN_VALUE(name_token) + }; + } + + return node; +} + +/** + * Allocate a new ConstantPathWriteNode node. 
+ */ +static pm_constant_path_write_node_t * +pm_constant_path_write_node_create(pm_parser_t *parser, pm_constant_path_node_t *target, const pm_token_t *operator, pm_node_t *value) { + pm_constant_path_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_path_write_node_t); + pm_node_flags_t flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY); + + *node = (pm_constant_path_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CONSTANT_PATH_WRITE_NODE, flags, target, value), + .target = target, + .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new ConstantAndWriteNode node. + */ +static pm_constant_and_write_node_t * +pm_constant_and_write_node_create(pm_parser_t *parser, pm_constant_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); + pm_constant_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_and_write_node_t); + + *node = (pm_constant_and_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CONSTANT_AND_WRITE_NODE, 0, target, value), + .name = target->name, + .name_loc = target->base.location, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new ConstantOperatorWriteNode node. 
+ */ +static pm_constant_operator_write_node_t * +pm_constant_operator_write_node_create(pm_parser_t *parser, pm_constant_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { + pm_constant_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_operator_write_node_t); + + *node = (pm_constant_operator_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CONSTANT_OPERATOR_WRITE_NODE, 0, target, value), + .name = target->name, + .name_loc = target->base.location, + .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value, + .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1) + }; + + return node; +} + +/** + * Allocate and initialize a new ConstantOrWriteNode node. + */ +static pm_constant_or_write_node_t * +pm_constant_or_write_node_create(pm_parser_t *parser, pm_constant_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); + pm_constant_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_or_write_node_t); + + *node = (pm_constant_or_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CONSTANT_OR_WRITE_NODE, 0, target, value), + .name = target->name, + .name_loc = target->base.location, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new ConstantReadNode node. + */ +static pm_constant_read_node_t * +pm_constant_read_node_create(pm_parser_t *parser, const pm_token_t *name) { + assert(name->type == PM_TOKEN_CONSTANT || name->type == PM_TOKEN_MISSING); + pm_constant_read_node_t *node = PM_NODE_ALLOC(parser, pm_constant_read_node_t); + + *node = (pm_constant_read_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_CONSTANT_READ_NODE, 0, name), + .name = pm_parser_constant_id_token(parser, name) + }; + + return node; +} + +/** + * Allocate a new ConstantWriteNode node. 
+ */ +static pm_constant_write_node_t * +pm_constant_write_node_create(pm_parser_t *parser, pm_constant_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { + pm_constant_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_write_node_t); + pm_node_flags_t flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY); + + *node = (pm_constant_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_CONSTANT_WRITE_NODE, flags, target, value), + .name = target->name, + .name_loc = target->base.location, + .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Check if the receiver of a `def` node is allowed. + */ +static void +pm_def_node_receiver_check(pm_parser_t *parser, const pm_node_t *node) { + switch (PM_NODE_TYPE(node)) { + case PM_BEGIN_NODE: { + const pm_begin_node_t *cast = (pm_begin_node_t *) node; + if (cast->statements != NULL) pm_def_node_receiver_check(parser, UP(cast->statements)); + break; + } + case PM_PARENTHESES_NODE: { + const pm_parentheses_node_t *cast = (const pm_parentheses_node_t *) node; + if (cast->body != NULL) pm_def_node_receiver_check(parser, cast->body); + break; + } + case PM_STATEMENTS_NODE: { + const pm_statements_node_t *cast = (const pm_statements_node_t *) node; + pm_def_node_receiver_check(parser, cast->body.nodes[cast->body.size - 1]); + break; + } + case PM_ARRAY_NODE: + case PM_FLOAT_NODE: + case PM_IMAGINARY_NODE: + case PM_INTEGER_NODE: + case PM_INTERPOLATED_REGULAR_EXPRESSION_NODE: + case PM_INTERPOLATED_STRING_NODE: + case PM_INTERPOLATED_SYMBOL_NODE: + case PM_INTERPOLATED_X_STRING_NODE: + case PM_RATIONAL_NODE: + case PM_REGULAR_EXPRESSION_NODE: + case PM_SOURCE_ENCODING_NODE: + case PM_SOURCE_FILE_NODE: + case PM_SOURCE_LINE_NODE: + case PM_STRING_NODE: + case PM_SYMBOL_NODE: + case PM_X_STRING_NODE: + pm_parser_err_node(parser, node, PM_ERR_SINGLETON_FOR_LITERALS); + break; + default: + break; + } +} + +/** + * Allocate 
and initialize a new DefNode node. + */ +static pm_def_node_t * +pm_def_node_create( + pm_parser_t *parser, + pm_constant_id_t name, + const pm_token_t *name_loc, + pm_node_t *receiver, + pm_parameters_node_t *parameters, + pm_node_t *body, + pm_constant_id_list_t *locals, + const pm_token_t *def_keyword, + const pm_token_t *operator, + const pm_token_t *lparen, + const pm_token_t *rparen, + const pm_token_t *equal, + const pm_token_t *end_keyword +) { + pm_def_node_t *node = PM_NODE_ALLOC(parser, pm_def_node_t); + + if (receiver != NULL) { + pm_def_node_receiver_check(parser, receiver); + } + + *node = (pm_def_node_t) { + .base = ( + (end_keyword->type == PM_TOKEN_NOT_PROVIDED) + ? PM_NODE_INIT_TOKEN_NODE(parser, PM_DEF_NODE, 0, def_keyword, body) + : PM_NODE_INIT_TOKENS(parser, PM_DEF_NODE, 0, def_keyword, end_keyword) + ), + .name = name, + .name_loc = PM_LOCATION_TOKEN_VALUE(name_loc), + .receiver = receiver, + .parameters = parameters, + .body = body, + .locals = *locals, + .def_keyword_loc = PM_LOCATION_TOKEN_VALUE(def_keyword), + .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator), + .lparen_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(lparen), + .rparen_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(rparen), + .equal_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(equal), + .end_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(end_keyword) + }; + + return node; +} + +/** + * Allocate a new DefinedNode node. + */ +static pm_defined_node_t * +pm_defined_node_create(pm_parser_t *parser, const pm_token_t *lparen, pm_node_t *value, const pm_token_t *rparen, const pm_token_t *keyword) { + pm_defined_node_t *node = PM_NODE_ALLOC(parser, pm_defined_node_t); + + *node = (pm_defined_node_t) { + .base = ( + (rparen->type == PM_TOKEN_NOT_PROVIDED) + ? 
PM_NODE_INIT_TOKEN_NODE(parser, PM_DEFINED_NODE, 0, keyword, value) + : PM_NODE_INIT_TOKENS(parser, PM_DEFINED_NODE, 0, keyword, rparen) + ), + .lparen_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(lparen), + .value = value, + .rparen_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(rparen), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword) + }; + + return node; +} + +/** + * Allocate and initialize a new ElseNode node. + */ +static pm_else_node_t * +pm_else_node_create(pm_parser_t *parser, const pm_token_t *else_keyword, pm_statements_node_t *statements, const pm_token_t *end_keyword) { + pm_else_node_t *node = PM_NODE_ALLOC(parser, pm_else_node_t); + + *node = (pm_else_node_t) { + .base = ( + ((end_keyword->type == PM_TOKEN_NOT_PROVIDED) && (statements != NULL)) + ? PM_NODE_INIT_TOKEN_NODE(parser, PM_ELSE_NODE, 0, else_keyword, statements) + : PM_NODE_INIT_TOKENS(parser, PM_ELSE_NODE, 0, else_keyword, end_keyword) + ), + .else_keyword_loc = PM_LOCATION_TOKEN_VALUE(else_keyword), + .statements = statements, + .end_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(end_keyword) + }; + + return node; +} + +/** + * Allocate and initialize a new EmbeddedStatementsNode node. + */ +static pm_embedded_statements_node_t * +pm_embedded_statements_node_create(pm_parser_t *parser, const pm_token_t *opening, pm_statements_node_t *statements, const pm_token_t *closing) { + pm_embedded_statements_node_t *node = PM_NODE_ALLOC(parser, pm_embedded_statements_node_t); + + *node = (pm_embedded_statements_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_EMBEDDED_STATEMENTS_NODE, 0, opening, closing), + .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), + .statements = statements, + .closing_loc = PM_LOCATION_TOKEN_VALUE(closing) + }; + + return node; +} + +/** + * Allocate and initialize a new EmbeddedVariableNode node. 
+ */ +static pm_embedded_variable_node_t * +pm_embedded_variable_node_create(pm_parser_t *parser, const pm_token_t *operator, pm_node_t *variable) { + pm_embedded_variable_node_t *node = PM_NODE_ALLOC(parser, pm_embedded_variable_node_t); + + *node = (pm_embedded_variable_node_t) { + .base = PM_NODE_INIT_TOKEN_NODE(parser, PM_EMBEDDED_VARIABLE_NODE, 0, operator, variable), + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .variable = variable + }; + + return node; +} + +/** + * Allocate a new EnsureNode node. + */ +static pm_ensure_node_t * +pm_ensure_node_create(pm_parser_t *parser, const pm_token_t *ensure_keyword, pm_statements_node_t *statements, const pm_token_t *end_keyword) { + pm_ensure_node_t *node = PM_NODE_ALLOC(parser, pm_ensure_node_t); + + *node = (pm_ensure_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_ENSURE_NODE, 0, ensure_keyword, end_keyword), + .ensure_keyword_loc = PM_LOCATION_TOKEN_VALUE(ensure_keyword), + .statements = statements, + .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword) + }; + + return node; +} + +/** + * Allocate and initialize a new FalseNode node. + */ +static pm_false_node_t * +pm_false_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_KEYWORD_FALSE); + pm_false_node_t *node = PM_NODE_ALLOC(parser, pm_false_node_t); + + *node = (pm_false_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_FALSE_NODE, PM_NODE_FLAG_STATIC_LITERAL, token) + }; + + return node; +} + +/** + * Allocate and initialize a new find pattern node. The node list given in the + * nodes parameter is guaranteed to have at least two nodes. 
+ */ +static pm_find_pattern_node_t * +pm_find_pattern_node_create(pm_parser_t *parser, pm_node_list_t *nodes) { + pm_find_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_find_pattern_node_t); + + pm_node_t *left = nodes->nodes[0]; + assert(PM_NODE_TYPE_P(left, PM_SPLAT_NODE)); + pm_splat_node_t *left_splat_node = (pm_splat_node_t *) left; + + pm_node_t *right; + + if (nodes->size == 1) { + right = UP(pm_missing_node_create(parser, left->location.end, left->location.end)); + } else { + right = nodes->nodes[nodes->size - 1]; + assert(PM_NODE_TYPE_P(right, PM_SPLAT_NODE)); + } + +#if PRISM_SERIALIZE_ONLY_SEMANTICS_FIELDS + // FindPatternNode#right is typed as SplatNode in this case, so replace the potential MissingNode with a SplatNode. + // The resulting AST will anyway be ignored, but this file still needs to compile. + pm_splat_node_t *right_splat_node = PM_NODE_TYPE_P(right, PM_SPLAT_NODE) ? (pm_splat_node_t *) right : left_splat_node; +#else + pm_node_t *right_splat_node = right; +#endif + *node = (pm_find_pattern_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_FIND_PATTERN_NODE, 0, left, right), + .constant = NULL, + .left = left_splat_node, + .right = right_splat_node, + .requireds = { 0 }, + .opening_loc = { 0 }, + .closing_loc = { 0 } + }; + + // For now we're going to just copy over each pointer manually. This could be + // much more efficient, as we could instead resize the node list to only point + // to 1...-1. + for (size_t index = 1; index < nodes->size - 1; index++) { + pm_node_list_append(&node->requireds, nodes->nodes[index]); + } + + return node; +} + +/** + * Parse the value of a double, add appropriate errors if there is an issue, and + * return the value that should be saved on the PM_FLOAT_NODE node. + */ +static double +pm_double_parse(pm_parser_t *parser, const pm_token_t *token) { + ptrdiff_t diff = token->end - token->start; + if (diff <= 0) return 0.0; + + // First, get a buffer of the content. 
+ size_t length = (size_t) diff; + char *buffer = xmalloc(sizeof(char) * (length + 1)); + memcpy((void *) buffer, token->start, length); + + // Next, determine if we need to replace the decimal point because of + // locale-specific options, and then normalize them if we have to. + char decimal_point = *localeconv()->decimal_point; + if (decimal_point != '.') { + for (size_t index = 0; index < length; index++) { + if (buffer[index] == '.') buffer[index] = decimal_point; + } + } + + // Next, handle underscores by removing them from the buffer. + for (size_t index = 0; index < length; index++) { + if (buffer[index] == '_') { + memmove((void *) (buffer + index), (void *) (buffer + index + 1), length - index); + length--; + } + } + + // Null-terminate the buffer so that strtod cannot read off the end. + buffer[length] = '\0'; + + // Now, call strtod to parse the value. Note that CRuby has their own + // version of strtod which avoids locales. We're okay using the locale-aware + // version because we've already validated through the parser that the token + // is in a valid format. + errno = 0; + char *eptr; + double value = strtod(buffer, &eptr); + + // This should never happen, because we've already checked that the token + // is in a valid format. However it's good to be safe. + if ((eptr != buffer + length) || (errno != 0 && errno != ERANGE)) { + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, (*token), PM_ERR_FLOAT_PARSE); + xfree((void *) buffer); + return 0.0; + } + + // If errno is set, then it should only be ERANGE. At this point we need to + // check if it's infinity (it should be). 
+ if (errno == ERANGE && PRISM_ISINF(value)) { + int warn_width; + const char *ellipsis; + + if (length > 20) { + warn_width = 20; + ellipsis = "..."; + } else { + warn_width = (int) length; + ellipsis = ""; + } + + pm_diagnostic_list_append_format(&parser->warning_list, token->start, token->end, PM_WARN_FLOAT_OUT_OF_RANGE, warn_width, (const char *) token->start, ellipsis); + value = (value < 0.0) ? -HUGE_VAL : HUGE_VAL; + } + + // Finally we can free the buffer and return the value. + xfree((void *) buffer); + return value; +} + +/** + * Allocate and initialize a new FloatNode node. + */ +static pm_float_node_t * +pm_float_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_FLOAT); + pm_float_node_t *node = PM_NODE_ALLOC(parser, pm_float_node_t); + + *node = (pm_float_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_FLOAT_NODE, PM_NODE_FLAG_STATIC_LITERAL, token), + .value = pm_double_parse(parser, token) + }; + + return node; +} + +/** + * Allocate and initialize a new FloatNode node from a FLOAT_IMAGINARY token. + */ +static pm_imaginary_node_t * +pm_float_node_imaginary_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_FLOAT_IMAGINARY); + + pm_imaginary_node_t *node = PM_NODE_ALLOC(parser, pm_imaginary_node_t); + *node = (pm_imaginary_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_IMAGINARY_NODE, PM_NODE_FLAG_STATIC_LITERAL, token), + .numeric = UP(pm_float_node_create(parser, &((pm_token_t) { + .type = PM_TOKEN_FLOAT, + .start = token->start, + .end = token->end - 1 + }))) + }; + + return node; +} + +/** + * Allocate and initialize a new RationalNode node from a FLOAT_RATIONAL token. 
+ */ +static pm_rational_node_t * +pm_float_node_rational_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_FLOAT_RATIONAL); + + pm_rational_node_t *node = PM_NODE_ALLOC(parser, pm_rational_node_t); + *node = (pm_rational_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_RATIONAL_NODE, PM_INTEGER_BASE_FLAGS_DECIMAL | PM_NODE_FLAG_STATIC_LITERAL, token), + .numerator = { 0 }, + .denominator = { 0 } + }; + + const uint8_t *start = token->start; + const uint8_t *end = token->end - 1; // r + + while (start < end && *start == '0') start++; // 0.1 -> .1 + while (end > start && end[-1] == '0') end--; // 1.0 -> 1. + + size_t length = (size_t) (end - start); + if (length == 1) { + node->denominator.value = 1; + return node; + } + + const uint8_t *point = memchr(start, '.', length); + assert(point && "should have a decimal point"); + + uint8_t *digits = xmalloc(length); + if (digits == NULL) { + fputs("[pm_float_node_rational_create] Failed to allocate memory", stderr); + abort(); + } + + memcpy(digits, start, (unsigned long) (point - start)); + memcpy(digits + (point - start), point + 1, (unsigned long) (end - point - 1)); + pm_integer_parse(&node->numerator, PM_INTEGER_BASE_DEFAULT, digits, digits + length - 1); + + size_t fract_length = 0; + for (const uint8_t *fract = point; fract < end; ++fract) { + if (*fract != '_') ++fract_length; + } + digits[0] = '1'; + if (fract_length > 1) memset(digits + 1, '0', fract_length - 1); + pm_integer_parse(&node->denominator, PM_INTEGER_BASE_DEFAULT, digits, digits + fract_length); + xfree(digits); + + pm_integers_reduce(&node->numerator, &node->denominator); + return node; +} + +/** + * Allocate and initialize a new FloatNode node from a FLOAT_RATIONAL_IMAGINARY + * token. 
+ */ +static pm_imaginary_node_t * +pm_float_node_rational_imaginary_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_FLOAT_RATIONAL_IMAGINARY); + + pm_imaginary_node_t *node = PM_NODE_ALLOC(parser, pm_imaginary_node_t); + *node = (pm_imaginary_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_IMAGINARY_NODE, PM_NODE_FLAG_STATIC_LITERAL, token), + .numeric = UP(pm_float_node_rational_create(parser, &((pm_token_t) { + .type = PM_TOKEN_FLOAT_RATIONAL, + .start = token->start, + .end = token->end - 1 + }))) + }; + + return node; +} + +/** + * Allocate and initialize a new ForNode node. + */ +static pm_for_node_t * +pm_for_node_create( + pm_parser_t *parser, + pm_node_t *index, + pm_node_t *collection, + pm_statements_node_t *statements, + const pm_token_t *for_keyword, + const pm_token_t *in_keyword, + const pm_token_t *do_keyword, + const pm_token_t *end_keyword +) { + pm_for_node_t *node = PM_NODE_ALLOC(parser, pm_for_node_t); + + *node = (pm_for_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_FOR_NODE, 0, for_keyword, end_keyword), + .index = index, + .collection = collection, + .statements = statements, + .for_keyword_loc = PM_LOCATION_TOKEN_VALUE(for_keyword), + .in_keyword_loc = PM_LOCATION_TOKEN_VALUE(in_keyword), + .do_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(do_keyword), + .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword) + }; + + return node; +} + +/** + * Allocate and initialize a new ForwardingArgumentsNode node. + */ +static pm_forwarding_arguments_node_t * +pm_forwarding_arguments_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_UDOT_DOT_DOT); + pm_forwarding_arguments_node_t *node = PM_NODE_ALLOC(parser, pm_forwarding_arguments_node_t); + + *node = (pm_forwarding_arguments_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_FORWARDING_ARGUMENTS_NODE, 0, token) + }; + + return node; +} + +/** + * Allocate and initialize a new ForwardingParameterNode node. 
+ */ +static pm_forwarding_parameter_node_t * +pm_forwarding_parameter_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_UDOT_DOT_DOT); + pm_forwarding_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_forwarding_parameter_node_t); + + *node = (pm_forwarding_parameter_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_FORWARDING_PARAMETER_NODE, 0, token) + }; + + return node; +} + +/** + * Allocate and initialize a new ForwardingSuper node. + */ +static pm_forwarding_super_node_t * +pm_forwarding_super_node_create(pm_parser_t *parser, const pm_token_t *token, pm_arguments_t *arguments) { + assert(arguments->block == NULL || PM_NODE_TYPE_P(arguments->block, PM_BLOCK_NODE)); + assert(token->type == PM_TOKEN_KEYWORD_SUPER); + pm_forwarding_super_node_t *node = PM_NODE_ALLOC(parser, pm_forwarding_super_node_t); + + pm_block_node_t *block = NULL; + if (arguments->block != NULL) { + block = (pm_block_node_t *) arguments->block; + } + + *node = (pm_forwarding_super_node_t) { + .base = ( + (block == NULL) + ? PM_NODE_INIT_TOKEN(parser, PM_FORWARDING_SUPER_NODE, 0, token) + : PM_NODE_INIT_TOKEN_NODE(parser, PM_FORWARDING_SUPER_NODE, 0, token, block) + ), + .block = block + }; + + return node; +} + +/** + * Allocate and initialize a new hash pattern node from an opening and closing + * token. + */ +static pm_hash_pattern_node_t * +pm_hash_pattern_node_empty_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *closing) { + pm_hash_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_hash_pattern_node_t); + + *node = (pm_hash_pattern_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_HASH_PATTERN_NODE, 0, opening, closing), + .constant = NULL, + .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), + .elements = { 0 }, + .rest = NULL + }; + + return node; +} + +/** + * Allocate and initialize a new hash pattern node. 
+ */ +static pm_hash_pattern_node_t * +pm_hash_pattern_node_node_list_create(pm_parser_t *parser, pm_node_list_t *elements, pm_node_t *rest) { + pm_hash_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_hash_pattern_node_t); + + const uint8_t *start; + const uint8_t *end; + + if (elements->size > 0) { + if (rest) { + start = MIN(rest->location.start, elements->nodes[0]->location.start); + end = MAX(rest->location.end, elements->nodes[elements->size - 1]->location.end); + } else { + start = elements->nodes[0]->location.start; + end = elements->nodes[elements->size - 1]->location.end; + } + } else { + assert(rest != NULL); + start = rest->location.start; + end = rest->location.end; + } + + *node = (pm_hash_pattern_node_t) { + .base = PM_NODE_INIT(parser, PM_HASH_PATTERN_NODE, 0, start, end), + .constant = NULL, + .elements = { 0 }, + .rest = rest, + .opening_loc = { 0 }, + .closing_loc = { 0 } + }; + + pm_node_list_concat(&node->elements, elements); + return node; +} + +/** + * Retrieve the name from a node that will become a global variable write node. + */ +static pm_constant_id_t +pm_global_variable_write_name(pm_parser_t *parser, const pm_node_t *target) { + switch (PM_NODE_TYPE(target)) { + case PM_GLOBAL_VARIABLE_READ_NODE: + return ((pm_global_variable_read_node_t *) target)->name; + case PM_BACK_REFERENCE_READ_NODE: + return ((pm_back_reference_read_node_t *) target)->name; + case PM_NUMBERED_REFERENCE_READ_NODE: + // This will only ever happen in the event of a syntax error, but we + // still need to provide something for the node. + return pm_parser_constant_id_location(parser, target->location.start, target->location.end); + default: + assert(false && "unreachable"); + return (pm_constant_id_t) -1; + } +} + +/** + * Allocate and initialize a new GlobalVariableAndWriteNode node. 
+ */ +static pm_global_variable_and_write_node_t * +pm_global_variable_and_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); + pm_global_variable_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_and_write_node_t); + + *node = (pm_global_variable_and_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_GLOBAL_VARIABLE_AND_WRITE_NODE, 0, target, value), + .name = pm_global_variable_write_name(parser, target), + .name_loc = target->location, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new GlobalVariableOperatorWriteNode node. + */ +static pm_global_variable_operator_write_node_t * +pm_global_variable_operator_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value) { + pm_global_variable_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_operator_write_node_t); + + *node = (pm_global_variable_operator_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_GLOBAL_VARIABLE_OPERATOR_WRITE_NODE, 0, target, value), + .name = pm_global_variable_write_name(parser, target), + .name_loc = target->location, + .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value, + .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1) + }; + + return node; +} + +/** + * Allocate and initialize a new GlobalVariableOrWriteNode node. 
+ */ +static pm_global_variable_or_write_node_t * +pm_global_variable_or_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); + pm_global_variable_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_or_write_node_t); + + *node = (pm_global_variable_or_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_GLOBAL_VARIABLE_OR_WRITE_NODE, 0, target, value), + .name = pm_global_variable_write_name(parser, target), + .name_loc = target->location, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate a new GlobalVariableReadNode node. + */ +static pm_global_variable_read_node_t * +pm_global_variable_read_node_create(pm_parser_t *parser, const pm_token_t *name) { + pm_global_variable_read_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_read_node_t); + + *node = (pm_global_variable_read_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_GLOBAL_VARIABLE_READ_NODE, 0, name), + .name = pm_parser_constant_id_token(parser, name) + }; + + return node; +} + +/** + * Allocate and initialize a new synthesized GlobalVariableReadNode node. + */ +static pm_global_variable_read_node_t * +pm_global_variable_read_node_synthesized_create(pm_parser_t *parser, pm_constant_id_t name) { + pm_global_variable_read_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_read_node_t); + + *node = (pm_global_variable_read_node_t) { + .base = PM_NODE_INIT_BASE(parser, PM_GLOBAL_VARIABLE_READ_NODE, 0), + .name = name + }; + + return node; +} + +/** + * Allocate and initialize a new GlobalVariableWriteNode node. 
+ */ +static pm_global_variable_write_node_t * +pm_global_variable_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value) { + pm_global_variable_write_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_write_node_t); + pm_node_flags_t flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY); + + *node = (pm_global_variable_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_GLOBAL_VARIABLE_WRITE_NODE, flags, target, value), + .name = pm_global_variable_write_name(parser, target), + .name_loc = PM_LOCATION_NODE_VALUE(target), + .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new synthesized GlobalVariableWriteNode node. + */ +static pm_global_variable_write_node_t * +pm_global_variable_write_node_synthesized_create(pm_parser_t *parser, pm_constant_id_t name, pm_node_t *value) { + pm_global_variable_write_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_write_node_t); + + *node = (pm_global_variable_write_node_t) { + .base = PM_NODE_INIT_BASE(parser, PM_GLOBAL_VARIABLE_WRITE_NODE, 0), + .name = name, + .name_loc = PM_LOCATION_NULL_VALUE(parser), + .operator_loc = PM_LOCATION_NULL_VALUE(parser), + .value = value + }; + + return node; +} + +/** + * Allocate a new HashNode node. + */ +static pm_hash_node_t * +pm_hash_node_create(pm_parser_t *parser, const pm_token_t *opening) { + assert(opening != NULL); + pm_hash_node_t *node = PM_NODE_ALLOC(parser, pm_hash_node_t); + + *node = (pm_hash_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_HASH_NODE, PM_NODE_FLAG_STATIC_LITERAL, opening), + .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_LOCATION_NULL_VALUE(parser), + .elements = { 0 } + }; + + return node; +} + +/** + * Append a new element to a hash node. 
+ */ +static inline void +pm_hash_node_elements_append(pm_hash_node_t *hash, pm_node_t *element) { + pm_node_list_append(&hash->elements, element); + + bool static_literal = PM_NODE_TYPE_P(element, PM_ASSOC_NODE); + if (static_literal) { + pm_assoc_node_t *assoc = (pm_assoc_node_t *) element; + static_literal = !PM_NODE_TYPE_P(assoc->key, PM_ARRAY_NODE) && !PM_NODE_TYPE_P(assoc->key, PM_HASH_NODE) && !PM_NODE_TYPE_P(assoc->key, PM_RANGE_NODE); + static_literal = static_literal && PM_NODE_FLAG_P(assoc->key, PM_NODE_FLAG_STATIC_LITERAL); + static_literal = static_literal && PM_NODE_FLAG_P(assoc, PM_NODE_FLAG_STATIC_LITERAL); + } + + if (!static_literal) { + pm_node_flag_unset(UP(hash), PM_NODE_FLAG_STATIC_LITERAL); + } +} + +static inline void +pm_hash_node_closing_loc_set(pm_hash_node_t *hash, pm_token_t *token) { + hash->base.location.end = token->end; + hash->closing_loc = PM_LOCATION_TOKEN_VALUE(token); +} + +/** + * Allocate a new IfNode node. + */ +static pm_if_node_t * +pm_if_node_create(pm_parser_t *parser, + const pm_token_t *if_keyword, + pm_node_t *predicate, + const pm_token_t *then_keyword, + pm_statements_node_t *statements, + pm_node_t *subsequent, + const pm_token_t *end_keyword +) { + pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); + pm_if_node_t *node = PM_NODE_ALLOC(parser, pm_if_node_t); + + const uint8_t *end; + if (end_keyword->type != PM_TOKEN_NOT_PROVIDED) { + end = end_keyword->end; + } else if (subsequent != NULL) { + end = subsequent->location.end; + } else if (pm_statements_node_body_length(statements) != 0) { + end = statements->base.location.end; + } else { + end = predicate->location.end; + } + + *node = (pm_if_node_t) { + .base = PM_NODE_INIT(parser, PM_IF_NODE, PM_NODE_FLAG_NEWLINE, if_keyword->start, end), + .if_keyword_loc = PM_LOCATION_TOKEN_VALUE(if_keyword), + .predicate = predicate, + .then_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(then_keyword), + .statements = statements, + 
.subsequent = subsequent, + .end_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(end_keyword) + }; + + return node; +} + +/** + * Allocate and initialize new IfNode node in the modifier form. + */ +static pm_if_node_t * +pm_if_node_modifier_create(pm_parser_t *parser, pm_node_t *statement, const pm_token_t *if_keyword, pm_node_t *predicate) { + pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); + pm_if_node_t *node = PM_NODE_ALLOC(parser, pm_if_node_t); + + pm_statements_node_t *statements = pm_statements_node_create(parser); + pm_statements_node_body_append(parser, statements, statement, true); + + *node = (pm_if_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_IF_NODE, PM_NODE_FLAG_NEWLINE, statement, predicate), + .if_keyword_loc = PM_LOCATION_TOKEN_VALUE(if_keyword), + .predicate = predicate, + .then_keyword_loc = { 0 }, + .statements = statements, + .subsequent = NULL, + .end_keyword_loc = { 0 } + }; + + return node; +} + +/** + * Allocate and initialize an if node from a ternary expression. 
+ */ +static pm_if_node_t * +pm_if_node_ternary_create(pm_parser_t *parser, pm_node_t *predicate, const pm_token_t *qmark, pm_node_t *true_expression, const pm_token_t *colon, pm_node_t *false_expression) { + pm_assert_value_expression(parser, predicate); + pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); + + pm_statements_node_t *if_statements = pm_statements_node_create(parser); + pm_statements_node_body_append(parser, if_statements, true_expression, true); + + pm_statements_node_t *else_statements = pm_statements_node_create(parser); + pm_statements_node_body_append(parser, else_statements, false_expression, true); + + pm_token_t end_keyword = not_provided(parser); + pm_else_node_t *else_node = pm_else_node_create(parser, colon, else_statements, &end_keyword); + + pm_if_node_t *node = PM_NODE_ALLOC(parser, pm_if_node_t); + + *node = (pm_if_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_IF_NODE, PM_NODE_FLAG_NEWLINE, predicate, false_expression), + .if_keyword_loc = { 0 }, + .predicate = predicate, + .then_keyword_loc = PM_LOCATION_TOKEN_VALUE(qmark), + .statements = if_statements, + .subsequent = UP(else_node), + .end_keyword_loc = { 0 } + }; + + return node; + +} + +static inline void +pm_if_node_end_keyword_loc_set(pm_if_node_t *node, const pm_token_t *keyword) { + node->base.location.end = keyword->end; + node->end_keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword); +} + +static inline void +pm_else_node_end_keyword_loc_set(pm_else_node_t *node, const pm_token_t *keyword) { + node->base.location.end = keyword->end; + node->end_keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword); +} + +/** + * Allocate and initialize a new ImplicitNode node. 
+ */ +static pm_implicit_node_t * +pm_implicit_node_create(pm_parser_t *parser, pm_node_t *value) { + pm_implicit_node_t *node = PM_NODE_ALLOC(parser, pm_implicit_node_t); + + *node = (pm_implicit_node_t) { + .base = PM_NODE_INIT_NODE(parser, PM_IMPLICIT_NODE, 0, value), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new ImplicitRestNode node. + */ +static pm_implicit_rest_node_t * +pm_implicit_rest_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_COMMA); + + pm_implicit_rest_node_t *node = PM_NODE_ALLOC(parser, pm_implicit_rest_node_t); + + *node = (pm_implicit_rest_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_IMPLICIT_REST_NODE, 0, token) + }; + + return node; +} + +/** + * Allocate and initialize a new IntegerNode node. + */ +static pm_integer_node_t * +pm_integer_node_create(pm_parser_t *parser, pm_node_flags_t base, const pm_token_t *token) { + assert(token->type == PM_TOKEN_INTEGER); + pm_integer_node_t *node = PM_NODE_ALLOC(parser, pm_integer_node_t); + + *node = (pm_integer_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_INTEGER_NODE, base | PM_NODE_FLAG_STATIC_LITERAL, token), + .value = { 0 } + }; + + pm_integer_base_t integer_base = PM_INTEGER_BASE_DECIMAL; + switch (base) { + case PM_INTEGER_BASE_FLAGS_BINARY: integer_base = PM_INTEGER_BASE_BINARY; break; + case PM_INTEGER_BASE_FLAGS_OCTAL: integer_base = PM_INTEGER_BASE_OCTAL; break; + case PM_INTEGER_BASE_FLAGS_DECIMAL: break; + case PM_INTEGER_BASE_FLAGS_HEXADECIMAL: integer_base = PM_INTEGER_BASE_HEXADECIMAL; break; + default: assert(false && "unreachable"); break; + } + + pm_integer_parse(&node->value, integer_base, token->start, token->end); + return node; +} + +/** + * Allocate and initialize a new IntegerNode node from an INTEGER_IMAGINARY + * token. 
+ */ +static pm_imaginary_node_t * +pm_integer_node_imaginary_create(pm_parser_t *parser, pm_node_flags_t base, const pm_token_t *token) { + assert(token->type == PM_TOKEN_INTEGER_IMAGINARY); + + pm_imaginary_node_t *node = PM_NODE_ALLOC(parser, pm_imaginary_node_t); + *node = (pm_imaginary_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_IMAGINARY_NODE, PM_NODE_FLAG_STATIC_LITERAL, token), + .numeric = UP(pm_integer_node_create(parser, base, &((pm_token_t) { + .type = PM_TOKEN_INTEGER, + .start = token->start, + .end = token->end - 1 + }))) + }; + + return node; +} + +/** + * Allocate and initialize a new RationalNode node from an INTEGER_RATIONAL + * token. + */ +static pm_rational_node_t * +pm_integer_node_rational_create(pm_parser_t *parser, pm_node_flags_t base, const pm_token_t *token) { + assert(token->type == PM_TOKEN_INTEGER_RATIONAL); + + pm_rational_node_t *node = PM_NODE_ALLOC(parser, pm_rational_node_t); + *node = (pm_rational_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_RATIONAL_NODE, base | PM_NODE_FLAG_STATIC_LITERAL, token), + .numerator = { 0 }, + .denominator = { .value = 1, 0 } + }; + + pm_integer_base_t integer_base = PM_INTEGER_BASE_DECIMAL; + switch (base) { + case PM_INTEGER_BASE_FLAGS_BINARY: integer_base = PM_INTEGER_BASE_BINARY; break; + case PM_INTEGER_BASE_FLAGS_OCTAL: integer_base = PM_INTEGER_BASE_OCTAL; break; + case PM_INTEGER_BASE_FLAGS_DECIMAL: break; + case PM_INTEGER_BASE_FLAGS_HEXADECIMAL: integer_base = PM_INTEGER_BASE_HEXADECIMAL; break; + default: assert(false && "unreachable"); break; + } + + pm_integer_parse(&node->numerator, integer_base, token->start, token->end - 1); + + return node; +} + +/** + * Allocate and initialize a new IntegerNode node from an + * INTEGER_RATIONAL_IMAGINARY token. 
+ */ +static pm_imaginary_node_t * +pm_integer_node_rational_imaginary_create(pm_parser_t *parser, pm_node_flags_t base, const pm_token_t *token) { + assert(token->type == PM_TOKEN_INTEGER_RATIONAL_IMAGINARY); + + pm_imaginary_node_t *node = PM_NODE_ALLOC(parser, pm_imaginary_node_t); + *node = (pm_imaginary_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_IMAGINARY_NODE, PM_NODE_FLAG_STATIC_LITERAL, token), + .numeric = UP(pm_integer_node_rational_create(parser, base, &((pm_token_t) { + .type = PM_TOKEN_INTEGER_RATIONAL, + .start = token->start, + .end = token->end - 1 + }))) + }; + + return node; +} + +/** + * Allocate and initialize a new InNode node. + */ +static pm_in_node_t * +pm_in_node_create(pm_parser_t *parser, pm_node_t *pattern, pm_statements_node_t *statements, const pm_token_t *in_keyword, const pm_token_t *then_keyword) { + pm_in_node_t *node = PM_NODE_ALLOC(parser, pm_in_node_t); + + const uint8_t *end; + if (statements != NULL) { + end = statements->base.location.end; + } else if (then_keyword->type != PM_TOKEN_NOT_PROVIDED) { + end = then_keyword->end; + } else { + end = pattern->location.end; + } + + *node = (pm_in_node_t) { + .base = PM_NODE_INIT(parser, PM_IN_NODE, 0, in_keyword->start, end), + .pattern = pattern, + .statements = statements, + .in_loc = PM_LOCATION_TOKEN_VALUE(in_keyword), + .then_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(then_keyword) + }; + + return node; +} + +/** + * Allocate and initialize a new InstanceVariableAndWriteNode node. 
+ */ +static pm_instance_variable_and_write_node_t * +pm_instance_variable_and_write_node_create(pm_parser_t *parser, pm_instance_variable_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); + pm_instance_variable_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_instance_variable_and_write_node_t); + + *node = (pm_instance_variable_and_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_INSTANCE_VARIABLE_AND_WRITE_NODE, 0, target, value), + .name = target->name, + .name_loc = target->base.location, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new InstanceVariableOperatorWriteNode node. + */ +static pm_instance_variable_operator_write_node_t * +pm_instance_variable_operator_write_node_create(pm_parser_t *parser, pm_instance_variable_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { + pm_instance_variable_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_instance_variable_operator_write_node_t); + + *node = (pm_instance_variable_operator_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_INSTANCE_VARIABLE_OPERATOR_WRITE_NODE, 0, target, value), + .name = target->name, + .name_loc = target->base.location, + .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value, + .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1) + }; + + return node; +} + +/** + * Allocate and initialize a new InstanceVariableOrWriteNode node. 
+ */ +static pm_instance_variable_or_write_node_t * +pm_instance_variable_or_write_node_create(pm_parser_t *parser, pm_instance_variable_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { + assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); + pm_instance_variable_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_instance_variable_or_write_node_t); + + *node = (pm_instance_variable_or_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_INSTANCE_VARIABLE_OR_WRITE_NODE, 0, target, value), + .name = target->name, + .name_loc = target->base.location, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new InstanceVariableReadNode node. + */ +static pm_instance_variable_read_node_t * +pm_instance_variable_read_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_INSTANCE_VARIABLE); + pm_instance_variable_read_node_t *node = PM_NODE_ALLOC(parser, pm_instance_variable_read_node_t); + + *node = (pm_instance_variable_read_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_INSTANCE_VARIABLE_READ_NODE, 0, token), + .name = pm_parser_constant_id_token(parser, token) + }; + + return node; +} + +/** + * Initialize a new InstanceVariableWriteNode node from an InstanceVariableRead + * node. 
+ */ +static pm_instance_variable_write_node_t * +pm_instance_variable_write_node_create(pm_parser_t *parser, pm_instance_variable_read_node_t *read_node, pm_token_t *operator, pm_node_t *value) { + pm_instance_variable_write_node_t *node = PM_NODE_ALLOC(parser, pm_instance_variable_write_node_t); + pm_node_flags_t flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY); + + *node = (pm_instance_variable_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_INSTANCE_VARIABLE_WRITE_NODE, flags, read_node, value), + .name = read_node->name, + .name_loc = PM_LOCATION_NODE_VALUE(read_node), + .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Append a part into a list of string parts. Importantly this handles nested + * interpolated strings by not necessarily removing the marker for static + * literals. + */ +static void +pm_interpolated_node_append(pm_node_t *node, pm_node_list_t *parts, pm_node_t *part) { + switch (PM_NODE_TYPE(part)) { + case PM_STRING_NODE: + pm_node_flag_set(part, PM_NODE_FLAG_STATIC_LITERAL | PM_STRING_FLAGS_FROZEN); + break; + case PM_EMBEDDED_STATEMENTS_NODE: { + pm_embedded_statements_node_t *cast = (pm_embedded_statements_node_t *) part; + pm_node_t *embedded = (cast->statements != NULL && cast->statements->body.size == 1) ? cast->statements->body.nodes[0] : NULL; + + if (embedded == NULL) { + // If there are no statements or more than one statement, then + // we lose the static literal flag. + pm_node_flag_unset(node, PM_NODE_FLAG_STATIC_LITERAL); + } else if (PM_NODE_TYPE_P(embedded, PM_STRING_NODE)) { + // If the embedded statement is a string, then we can keep the + // static literal flag and mark the string as frozen. 
+ pm_node_flag_set(embedded, PM_NODE_FLAG_STATIC_LITERAL | PM_STRING_FLAGS_FROZEN); + } else if (PM_NODE_TYPE_P(embedded, PM_INTERPOLATED_STRING_NODE) && PM_NODE_FLAG_P(embedded, PM_NODE_FLAG_STATIC_LITERAL)) { + // If the embedded statement is an interpolated string and it's + // a static literal, then we can keep the static literal flag. + } else { + // Otherwise we lose the static literal flag. + pm_node_flag_unset(node, PM_NODE_FLAG_STATIC_LITERAL); + } + + break; + } + case PM_EMBEDDED_VARIABLE_NODE: + pm_node_flag_unset(UP(node), PM_NODE_FLAG_STATIC_LITERAL); + break; + default: + assert(false && "unexpected node type"); + break; + } + + pm_node_list_append(parts, part); +} + +/** + * Allocate a new InterpolatedRegularExpressionNode node. + */ +static pm_interpolated_regular_expression_node_t * +pm_interpolated_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening) { + pm_interpolated_regular_expression_node_t *node = PM_NODE_ALLOC(parser, pm_interpolated_regular_expression_node_t); + + *node = (pm_interpolated_regular_expression_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_INTERPOLATED_REGULAR_EXPRESSION_NODE, PM_NODE_FLAG_STATIC_LITERAL, opening), + .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_LOCATION_TOKEN_VALUE(opening), + .parts = { 0 } + }; + + return node; +} + +static inline void +pm_interpolated_regular_expression_node_append(pm_interpolated_regular_expression_node_t *node, pm_node_t *part) { + if (node->base.location.start > part->location.start) { + node->base.location.start = part->location.start; + } + if (node->base.location.end < part->location.end) { + node->base.location.end = part->location.end; + } + + pm_interpolated_node_append(UP(node), &node->parts, part); +} + +static inline void +pm_interpolated_regular_expression_node_closing_set(pm_parser_t *parser, pm_interpolated_regular_expression_node_t *node, const pm_token_t *closing) { + node->closing_loc = 
PM_LOCATION_TOKEN_VALUE(closing); + node->base.location.end = closing->end; + pm_node_flag_set(UP(node), pm_regular_expression_flags_create(parser, closing)); +} + +/** + * Append a part to an InterpolatedStringNode node. + * + * This has some somewhat complicated semantics, because we need to update + * multiple flags that have somewhat confusing interactions. + * + * PM_NODE_FLAG_STATIC_LITERAL indicates that the node should be treated as a + * single static literal string that can be pushed onto the stack on its own. + * Note that this doesn't necessarily mean that the string will be frozen or + * not; the instructions in CRuby will be either putobject or putstring, + * depending on the combination of `--enable-frozen-string-literal`, + * `# frozen_string_literal: true`, and whether or not there is interpolation. + * + * PM_INTERPOLATED_STRING_NODE_FLAGS_FROZEN indicates that the string should be + * explicitly frozen. This will only happen if the string is comprised entirely + * of string parts that are themselves static literals and frozen. + * + * PM_INTERPOLATED_STRING_NODE_FLAGS_MUTABLE indicates that the string should + * be explicitly marked as mutable. This will happen from + * `--disable-frozen-string-literal` or `# frozen_string_literal: false`. This + * is necessary to indicate that the string should be left up to the runtime, + * which could potentially use a chilled string otherwise. 
+ */ +static inline void +pm_interpolated_string_node_append(pm_interpolated_string_node_t *node, pm_node_t *part) { +#define CLEAR_FLAGS(node) \ + node->base.flags = (pm_node_flags_t) (FL(node) & ~(PM_NODE_FLAG_STATIC_LITERAL | PM_INTERPOLATED_STRING_NODE_FLAGS_FROZEN | PM_INTERPOLATED_STRING_NODE_FLAGS_MUTABLE)) + +#define MUTABLE_FLAGS(node) \ + node->base.flags = (pm_node_flags_t) ((FL(node) | PM_INTERPOLATED_STRING_NODE_FLAGS_MUTABLE) & ~PM_INTERPOLATED_STRING_NODE_FLAGS_FROZEN); + + if (node->parts.size == 0 && node->opening_loc.start == NULL) { + node->base.location.start = part->location.start; + } + + node->base.location.end = MAX(node->base.location.end, part->location.end); + + switch (PM_NODE_TYPE(part)) { + case PM_STRING_NODE: + // If inner string is not frozen, it stops being a static literal. We should *not* clear other flags, + // because concatenating two frozen strings (`'foo' 'bar'`) is still frozen. This holds true for + // as long as this interpolation only consists of other string literals. + if (!PM_NODE_FLAG_P(part, PM_STRING_FLAGS_FROZEN)) { + pm_node_flag_unset(UP(node), PM_NODE_FLAG_STATIC_LITERAL); + } + part->flags = (pm_node_flags_t) ((part->flags | PM_NODE_FLAG_STATIC_LITERAL | PM_STRING_FLAGS_FROZEN) & ~PM_STRING_FLAGS_MUTABLE); + break; + case PM_INTERPOLATED_STRING_NODE: + if (PM_NODE_FLAG_P(part, PM_NODE_FLAG_STATIC_LITERAL)) { + // If the string that we're concatenating is a static literal, + // then we can keep the static literal flag for this string. + } else { + // Otherwise, we lose the static literal flag here and we should + // also clear the mutability flags. + CLEAR_FLAGS(node); + } + break; + case PM_EMBEDDED_STATEMENTS_NODE: { + pm_embedded_statements_node_t *cast = (pm_embedded_statements_node_t *) part; + pm_node_t *embedded = (cast->statements != NULL && cast->statements->body.size == 1) ? 
cast->statements->body.nodes[0] : NULL; + + if (embedded == NULL) { + // If we're embedding multiple statements or no statements, then + // the string is not longer a static literal. + CLEAR_FLAGS(node); + } else if (PM_NODE_TYPE_P(embedded, PM_STRING_NODE)) { + // If the embedded statement is a string, then we can make that + // string as frozen and static literal, and not touch the static + // literal status of this string. + embedded->flags = (pm_node_flags_t) ((embedded->flags | PM_NODE_FLAG_STATIC_LITERAL | PM_STRING_FLAGS_FROZEN) & ~PM_STRING_FLAGS_MUTABLE); + + if (PM_NODE_FLAG_P(node, PM_NODE_FLAG_STATIC_LITERAL)) { + MUTABLE_FLAGS(node); + } + } else if (PM_NODE_TYPE_P(embedded, PM_INTERPOLATED_STRING_NODE) && PM_NODE_FLAG_P(embedded, PM_NODE_FLAG_STATIC_LITERAL)) { + // If the embedded statement is an interpolated string, but that + // string is marked as static literal, then we can keep our + // static literal status for this string. + if (PM_NODE_FLAG_P(node, PM_NODE_FLAG_STATIC_LITERAL)) { + MUTABLE_FLAGS(node); + } + } else { + // In all other cases, we lose the static literal flag here and + // become mutable. + CLEAR_FLAGS(node); + } + + break; + } + case PM_EMBEDDED_VARIABLE_NODE: + // Embedded variables clear static literal, which means we also + // should clear the mutability flags. + CLEAR_FLAGS(node); + break; + case PM_X_STRING_NODE: + case PM_INTERPOLATED_X_STRING_NODE: + case PM_SYMBOL_NODE: + case PM_INTERPOLATED_SYMBOL_NODE: + // These will only happen in error cases. But we want to handle it + // here so that we don't fail the assertion. + CLEAR_FLAGS(node); + break; + default: + assert(false && "unexpected node type"); + break; + } + + pm_node_list_append(&node->parts, part); + +#undef CLEAR_FLAGS +#undef MUTABLE_FLAGS +} + +/** + * Allocate and initialize a new InterpolatedStringNode node. 
+ */ +static pm_interpolated_string_node_t * +pm_interpolated_string_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_node_list_t *parts, const pm_token_t *closing) { + pm_interpolated_string_node_t *node = PM_NODE_ALLOC(parser, pm_interpolated_string_node_t); + pm_node_flags_t flags = PM_NODE_FLAG_STATIC_LITERAL; + + switch (parser->frozen_string_literal) { + case PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED: + flags |= PM_INTERPOLATED_STRING_NODE_FLAGS_MUTABLE; + break; + case PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED: + flags |= PM_INTERPOLATED_STRING_NODE_FLAGS_FROZEN; + break; + } + + *node = (pm_interpolated_string_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_INTERPOLATED_STRING_NODE, flags, opening, closing), + .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), + .parts = { 0 } + }; + + if (parts != NULL) { + pm_node_t *part; + PM_NODE_LIST_FOREACH(parts, index, part) { + pm_interpolated_string_node_append(node, part); + } + } + + return node; +} + +/** + * Set the closing token of the given InterpolatedStringNode node. 
+ */ +static void +pm_interpolated_string_node_closing_set(pm_interpolated_string_node_t *node, const pm_token_t *closing) { + node->closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing); + node->base.location.end = closing->end; +} + +static void +pm_interpolated_symbol_node_append(pm_interpolated_symbol_node_t *node, pm_node_t *part) { + if (node->parts.size == 0 && node->opening_loc.start == NULL) { + node->base.location.start = part->location.start; + } + + pm_interpolated_node_append(UP(node), &node->parts, part); + node->base.location.end = MAX(node->base.location.end, part->location.end); +} + +static void +pm_interpolated_symbol_node_closing_loc_set(pm_interpolated_symbol_node_t *node, const pm_token_t *closing) { + node->closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing); + node->base.location.end = closing->end; +} + +/** + * Allocate and initialize a new InterpolatedSymbolNode node. + */ +static pm_interpolated_symbol_node_t * +pm_interpolated_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_node_list_t *parts, const pm_token_t *closing) { + pm_interpolated_symbol_node_t *node = PM_NODE_ALLOC(parser, pm_interpolated_symbol_node_t); + + *node = (pm_interpolated_symbol_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_INTERPOLATED_SYMBOL_NODE, PM_NODE_FLAG_STATIC_LITERAL, opening, closing), + .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), + .parts = { 0 } + }; + + if (parts != NULL) { + pm_node_t *part; + PM_NODE_LIST_FOREACH(parts, index, part) { + pm_interpolated_symbol_node_append(node, part); + } + } + + return node; +} + +/** + * Allocate a new InterpolatedXStringNode node. 
+ */ +static pm_interpolated_x_string_node_t * +pm_interpolated_xstring_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *closing) { + pm_interpolated_x_string_node_t *node = PM_NODE_ALLOC(parser, pm_interpolated_x_string_node_t); + + *node = (pm_interpolated_x_string_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_INTERPOLATED_X_STRING_NODE, 0, opening, closing), + .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), + .parts = { 0 } + }; + + return node; +} + +static inline void +pm_interpolated_xstring_node_append(pm_interpolated_x_string_node_t *node, pm_node_t *part) { + pm_interpolated_node_append(UP(node), &node->parts, part); + node->base.location.end = part->location.end; +} + +static inline void +pm_interpolated_xstring_node_closing_set(pm_interpolated_x_string_node_t *node, const pm_token_t *closing) { + node->closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing); + node->base.location.end = closing->end; +} + +/** + * Create a local variable read that is reading the implicit 'it' variable. + */ +static pm_it_local_variable_read_node_t * +pm_it_local_variable_read_node_create(pm_parser_t *parser, const pm_token_t *name) { + pm_it_local_variable_read_node_t *node = PM_NODE_ALLOC(parser, pm_it_local_variable_read_node_t); + + *node = (pm_it_local_variable_read_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_IT_LOCAL_VARIABLE_READ_NODE, 0, name), + }; + + return node; +} + +/** + * Allocate and initialize a new ItParametersNode node. + */ +static pm_it_parameters_node_t * +pm_it_parameters_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *closing) { + pm_it_parameters_node_t *node = PM_NODE_ALLOC(parser, pm_it_parameters_node_t); + + *node = (pm_it_parameters_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_IT_PARAMETERS_NODE, 0, opening, closing), + }; + + return node; +} + +/** + * Allocate a new KeywordHashNode node. 
+ */ +static pm_keyword_hash_node_t * +pm_keyword_hash_node_create(pm_parser_t *parser) { + pm_keyword_hash_node_t *node = PM_NODE_ALLOC(parser, pm_keyword_hash_node_t); + + *node = (pm_keyword_hash_node_t) { + .base = PM_NODE_INIT_UNSET(parser, PM_KEYWORD_HASH_NODE, PM_KEYWORD_HASH_NODE_FLAGS_SYMBOL_KEYS), + .elements = { 0 } + }; + + return node; +} + +/** + * Append an element to a KeywordHashNode node. + */ +static void +pm_keyword_hash_node_elements_append(pm_keyword_hash_node_t *hash, pm_node_t *element) { + // If the element being added is not an AssocNode or does not have a symbol + // key, then we want to turn the SYMBOL_KEYS flag off. + if (!PM_NODE_TYPE_P(element, PM_ASSOC_NODE) || !PM_NODE_TYPE_P(((pm_assoc_node_t *) element)->key, PM_SYMBOL_NODE)) { + pm_node_flag_unset(UP(hash), PM_KEYWORD_HASH_NODE_FLAGS_SYMBOL_KEYS); + } + + pm_node_list_append(&hash->elements, element); + if (hash->base.location.start == NULL) { + hash->base.location.start = element->location.start; + } + hash->base.location.end = element->location.end; +} + +/** + * Allocate and initialize a new RequiredKeywordParameterNode node. + */ +static pm_required_keyword_parameter_node_t * +pm_required_keyword_parameter_node_create(pm_parser_t *parser, const pm_token_t *name) { + pm_required_keyword_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_required_keyword_parameter_node_t); + + *node = (pm_required_keyword_parameter_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_REQUIRED_KEYWORD_PARAMETER_NODE, 0, name), + .name = pm_parser_constant_id_location(parser, name->start, name->end - 1), + .name_loc = PM_LOCATION_TOKEN_VALUE(name), + }; + + return node; +} + +/** + * Allocate a new OptionalKeywordParameterNode node. 
+ */ +static pm_optional_keyword_parameter_node_t * +pm_optional_keyword_parameter_node_create(pm_parser_t *parser, const pm_token_t *name, pm_node_t *value) { + pm_optional_keyword_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_optional_keyword_parameter_node_t); + + *node = (pm_optional_keyword_parameter_node_t) { + .base = PM_NODE_INIT_TOKEN_NODE(parser, PM_OPTIONAL_KEYWORD_PARAMETER_NODE, 0, name, value), + .name = pm_parser_constant_id_location(parser, name->start, name->end - 1), + .name_loc = PM_LOCATION_TOKEN_VALUE(name), + .value = value + }; + + return node; +} + +/** + * Allocate a new KeywordRestParameterNode node. + */ +static pm_keyword_rest_parameter_node_t * +pm_keyword_rest_parameter_node_create(pm_parser_t *parser, const pm_token_t *operator, const pm_token_t *name) { + pm_keyword_rest_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_keyword_rest_parameter_node_t); + + *node = (pm_keyword_rest_parameter_node_t) { + .base = ( + (name->type == PM_TOKEN_NOT_PROVIDED) + ? PM_NODE_INIT_TOKEN(parser, PM_KEYWORD_REST_PARAMETER_NODE, 0, operator) + : PM_NODE_INIT_TOKENS(parser, PM_KEYWORD_REST_PARAMETER_NODE, 0, operator, name) + ), + .name = pm_parser_optional_constant_id_token(parser, name), + .name_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(name), + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) + }; + + return node; +} + +/** + * Allocate a new LambdaNode node. 
+ */ +static pm_lambda_node_t * +pm_lambda_node_create( + pm_parser_t *parser, + pm_constant_id_list_t *locals, + const pm_token_t *operator, + const pm_token_t *opening, + const pm_token_t *closing, + pm_node_t *parameters, + pm_node_t *body +) { + pm_lambda_node_t *node = PM_NODE_ALLOC(parser, pm_lambda_node_t); + + *node = (pm_lambda_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_LAMBDA_NODE, 0, operator, closing), + .locals = *locals, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), + .parameters = parameters, + .body = body + }; + + return node; +} + +/** + * Allocate and initialize a new LocalVariableAndWriteNode node. + */ +static pm_local_variable_and_write_node_t * +pm_local_variable_and_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value, pm_constant_id_t name, uint32_t depth) { + assert(PM_NODE_TYPE_P(target, PM_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_IT_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_CALL_NODE)); + assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); + pm_local_variable_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_and_write_node_t); + + *node = (pm_local_variable_and_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_LOCAL_VARIABLE_AND_WRITE_NODE, 0, target, value), + .name_loc = target->location, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value, + .name = name, + .depth = depth + }; + + return node; +} + +/** + * Allocate and initialize a new LocalVariableOperatorWriteNode node. 
+ */ +static pm_local_variable_operator_write_node_t * +pm_local_variable_operator_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value, pm_constant_id_t name, uint32_t depth) { + pm_local_variable_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_operator_write_node_t); + + *node = (pm_local_variable_operator_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_LOCAL_VARIABLE_OPERATOR_WRITE_NODE, 0, target, value), + .name_loc = target->location, + .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value, + .name = name, + .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1), + .depth = depth + }; + + return node; +} + +/** + * Allocate and initialize a new LocalVariableOrWriteNode node. + */ +static pm_local_variable_or_write_node_t * +pm_local_variable_or_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value, pm_constant_id_t name, uint32_t depth) { + assert(PM_NODE_TYPE_P(target, PM_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_IT_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_CALL_NODE)); + assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); + pm_local_variable_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_or_write_node_t); + + *node = (pm_local_variable_or_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_LOCAL_VARIABLE_OR_WRITE_NODE, 0, target, value), + .name_loc = target->location, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value, + .name = name, + .depth = depth + }; + + return node; +} + +/** + * Allocate a new LocalVariableReadNode node with constant_id. 
+ */ +static pm_local_variable_read_node_t * +pm_local_variable_read_node_create_constant_id(pm_parser_t *parser, const pm_token_t *name, pm_constant_id_t name_id, uint32_t depth, bool missing) { + if (!missing) pm_locals_read(&pm_parser_scope_find(parser, depth)->locals, name_id); + + pm_local_variable_read_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_read_node_t); + + *node = (pm_local_variable_read_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_LOCAL_VARIABLE_READ_NODE, 0, name), + .name = name_id, + .depth = depth + }; + + return node; +} + +/** + * Allocate and initialize a new LocalVariableReadNode node. + */ +static pm_local_variable_read_node_t * +pm_local_variable_read_node_create(pm_parser_t *parser, const pm_token_t *name, uint32_t depth) { + pm_constant_id_t name_id = pm_parser_constant_id_token(parser, name); + return pm_local_variable_read_node_create_constant_id(parser, name, name_id, depth, false); +} + +/** + * Allocate and initialize a new LocalVariableReadNode node for a missing local + * variable. (This will only happen when there is a syntax error.) + */ +static pm_local_variable_read_node_t * +pm_local_variable_read_node_missing_create(pm_parser_t *parser, const pm_token_t *name, uint32_t depth) { + pm_constant_id_t name_id = pm_parser_constant_id_token(parser, name); + return pm_local_variable_read_node_create_constant_id(parser, name, name_id, depth, true); +} + +/** + * Allocate and initialize a new LocalVariableWriteNode node. 
+ */ +static pm_local_variable_write_node_t * +pm_local_variable_write_node_create(pm_parser_t *parser, pm_constant_id_t name, uint32_t depth, pm_node_t *value, const pm_location_t *name_loc, const pm_token_t *operator) { + pm_local_variable_write_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_write_node_t); + pm_node_flags_t flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY); + + *node = (pm_local_variable_write_node_t) { + .base = PM_NODE_INIT_TOKEN_NODE(parser, PM_LOCAL_VARIABLE_WRITE_NODE, flags, name_loc, value), + .name = name, + .depth = depth, + .value = value, + .name_loc = *name_loc, + .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator) + }; + + return node; +} + +/** + * Returns true if the given bounds comprise `it`. + */ +static inline bool +pm_token_is_it(const uint8_t *start, const uint8_t *end) { + return (end - start == 2) && (start[0] == 'i') && (start[1] == 't'); +} + +/** + * Returns true if the given bounds comprise a numbered parameter (i.e., they + * are of the form /^_\d$/). + */ +static inline bool +pm_token_is_numbered_parameter(const uint8_t *start, const uint8_t *end) { + return (end - start == 2) && (start[0] == '_') && (start[1] != '0') && (pm_char_is_decimal_digit(start[1])); +} + +/** + * Ensure the given bounds do not comprise a numbered parameter. If they do, add + * an appropriate error message to the parser. + */ +static inline void +pm_refute_numbered_parameter(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { + if (pm_token_is_numbered_parameter(start, end)) { + PM_PARSER_ERR_FORMAT(parser, start, end, PM_ERR_PARAMETER_NUMBERED_RESERVED, start); + } +} + +/** + * Allocate and initialize a new LocalVariableTargetNode node with the given + * name and depth. 
+ */ +static pm_local_variable_target_node_t * +pm_local_variable_target_node_create(pm_parser_t *parser, const pm_location_t *location, pm_constant_id_t name, uint32_t depth) { + pm_refute_numbered_parameter(parser, location->start, location->end); + pm_local_variable_target_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_target_node_t); + + *node = (pm_local_variable_target_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_LOCAL_VARIABLE_TARGET_NODE, 0, location), + .name = name, + .depth = depth + }; + + return node; +} + +/** + * Allocate and initialize a new MatchPredicateNode node. + */ +static pm_match_predicate_node_t * +pm_match_predicate_node_create(pm_parser_t *parser, pm_node_t *value, pm_node_t *pattern, const pm_token_t *operator) { + pm_assert_value_expression(parser, value); + + pm_match_predicate_node_t *node = PM_NODE_ALLOC(parser, pm_match_predicate_node_t); + + *node = (pm_match_predicate_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_MATCH_PREDICATE_NODE, 0, value, pattern), + .value = value, + .pattern = pattern, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) + }; + + return node; +} + +/** + * Allocate and initialize a new MatchRequiredNode node. + */ +static pm_match_required_node_t * +pm_match_required_node_create(pm_parser_t *parser, pm_node_t *value, pm_node_t *pattern, const pm_token_t *operator) { + pm_assert_value_expression(parser, value); + + pm_match_required_node_t *node = PM_NODE_ALLOC(parser, pm_match_required_node_t); + + *node = (pm_match_required_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_MATCH_REQUIRED_NODE, 0, value, pattern), + .value = value, + .pattern = pattern, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) + }; + + return node; +} + +/** + * Allocate and initialize a new MatchWriteNode node. 
+ */ +static pm_match_write_node_t * +pm_match_write_node_create(pm_parser_t *parser, pm_call_node_t *call) { + pm_match_write_node_t *node = PM_NODE_ALLOC(parser, pm_match_write_node_t); + + *node = (pm_match_write_node_t) { + .base = PM_NODE_INIT_NODE(parser, PM_MATCH_WRITE_NODE, 0, call), + .call = call, + .targets = { 0 } + }; + + return node; +} + +/** + * Allocate a new ModuleNode node. + */ +static pm_module_node_t * +pm_module_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, const pm_token_t *module_keyword, pm_node_t *constant_path, const pm_token_t *name, pm_node_t *body, const pm_token_t *end_keyword) { + pm_module_node_t *node = PM_NODE_ALLOC(parser, pm_module_node_t); + + *node = (pm_module_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_MODULE_NODE, 0, module_keyword, end_keyword), + .locals = (locals == NULL ? ((pm_constant_id_list_t) { .ids = NULL, .size = 0, .capacity = 0 }) : *locals), + .module_keyword_loc = PM_LOCATION_TOKEN_VALUE(module_keyword), + .constant_path = constant_path, + .body = body, + .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword), + .name = pm_parser_constant_id_token(parser, name) + }; + + return node; +} + +/** + * Allocate and initialize new MultiTargetNode node. + */ +static pm_multi_target_node_t * +pm_multi_target_node_create(pm_parser_t *parser) { + pm_multi_target_node_t *node = PM_NODE_ALLOC(parser, pm_multi_target_node_t); + + *node = (pm_multi_target_node_t) { + .base = PM_NODE_INIT_UNSET(parser, PM_MULTI_TARGET_NODE, 0), + .lefts = { 0 }, + .rest = NULL, + .rights = { 0 }, + .lparen_loc = { 0 }, + .rparen_loc = { 0 } + }; + + return node; +} + +/** + * Append a target to a MultiTargetNode node. 
+ */ +static void +pm_multi_target_node_targets_append(pm_parser_t *parser, pm_multi_target_node_t *node, pm_node_t *target) { + if (PM_NODE_TYPE_P(target, PM_SPLAT_NODE)) { + if (node->rest == NULL) { + node->rest = target; + } else { + pm_parser_err_node(parser, target, PM_ERR_MULTI_ASSIGN_MULTI_SPLATS); + pm_node_list_append(&node->rights, target); + } + } else if (PM_NODE_TYPE_P(target, PM_IMPLICIT_REST_NODE)) { + if (node->rest == NULL) { + node->rest = target; + } else { + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_MULTI_ASSIGN_UNEXPECTED_REST); + pm_node_list_append(&node->rights, target); + } + } else if (node->rest == NULL) { + pm_node_list_append(&node->lefts, target); + } else { + pm_node_list_append(&node->rights, target); + } + + if (node->base.location.start == NULL || (node->base.location.start > target->location.start)) { + node->base.location.start = target->location.start; + } + + if (node->base.location.end == NULL || (node->base.location.end < target->location.end)) { + node->base.location.end = target->location.end; + } +} + +/** + * Set the opening of a MultiTargetNode node. + */ +static void +pm_multi_target_node_opening_set(pm_multi_target_node_t *node, const pm_token_t *lparen) { + node->base.location.start = lparen->start; + node->lparen_loc = PM_LOCATION_TOKEN_VALUE(lparen); +} + +/** + * Set the closing of a MultiTargetNode node. + */ +static void +pm_multi_target_node_closing_set(pm_multi_target_node_t *node, const pm_token_t *rparen) { + node->base.location.end = rparen->end; + node->rparen_loc = PM_LOCATION_TOKEN_VALUE(rparen); +} + +/** + * Allocate a new MultiWriteNode node. 
+ */ +static pm_multi_write_node_t * +pm_multi_write_node_create(pm_parser_t *parser, pm_multi_target_node_t *target, const pm_token_t *operator, pm_node_t *value) { + pm_multi_write_node_t *node = PM_NODE_ALLOC(parser, pm_multi_write_node_t); + pm_node_flags_t flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY); + + *node = (pm_multi_write_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_MULTI_WRITE_NODE, flags, target, value), + .lefts = target->lefts, + .rest = target->rest, + .rights = target->rights, + .lparen_loc = target->lparen_loc, + .rparen_loc = target->rparen_loc, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + // Explicitly do not call pm_node_destroy here because we want to keep + // around all of the information within the MultiWriteNode node. + xfree(target); + + return node; +} + +/** + * Allocate and initialize a new NextNode node. + */ +static pm_next_node_t * +pm_next_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_arguments_node_t *arguments) { + assert(keyword->type == PM_TOKEN_KEYWORD_NEXT); + pm_next_node_t *node = PM_NODE_ALLOC(parser, pm_next_node_t); + + *node = (pm_next_node_t) { + .base = ( + (arguments == NULL) + ? PM_NODE_INIT_TOKEN(parser, PM_NEXT_NODE, 0, keyword) + : PM_NODE_INIT_TOKEN_NODE(parser, PM_NEXT_NODE, 0, keyword, arguments) + ), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .arguments = arguments + }; + + return node; +} + +/** + * Allocate and initialize a new NilNode node. + */ +static pm_nil_node_t * +pm_nil_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_KEYWORD_NIL); + pm_nil_node_t *node = PM_NODE_ALLOC(parser, pm_nil_node_t); + + *node = (pm_nil_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_NIL_NODE, PM_NODE_FLAG_STATIC_LITERAL, token) + }; + + return node; +} + +/** + * Allocate and initialize a new NoKeywordsParameterNode node. 
+ */ +static pm_no_keywords_parameter_node_t * +pm_no_keywords_parameter_node_create(pm_parser_t *parser, const pm_token_t *operator, const pm_token_t *keyword) { + assert(operator->type == PM_TOKEN_USTAR_STAR || operator->type == PM_TOKEN_STAR_STAR); + assert(keyword->type == PM_TOKEN_KEYWORD_NIL); + pm_no_keywords_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_no_keywords_parameter_node_t); + + *node = (pm_no_keywords_parameter_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_NO_KEYWORDS_PARAMETER_NODE, 0, operator, keyword), + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword) + }; + + return node; +} + +/** + * Allocate and initialize a new NumberedParametersNode node. + */ +static pm_numbered_parameters_node_t * +pm_numbered_parameters_node_create(pm_parser_t *parser, const pm_location_t *location, uint8_t maximum) { + pm_numbered_parameters_node_t *node = PM_NODE_ALLOC(parser, pm_numbered_parameters_node_t); + + *node = (pm_numbered_parameters_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_NUMBERED_PARAMETERS_NODE, 0, location), + .maximum = maximum + }; + + return node; +} + +/** + * The maximum numbered reference value is defined as the maximum value that an + * integer can hold minus 1 bit for CRuby instruction sequence operand tagging. + */ +#define NTH_REF_MAX ((uint32_t) (INT_MAX >> 1)) + +/** + * Parse the decimal number represented by the range of bytes. Returns + * 0 if the number fails to parse or if the number is greater than the maximum + * value representable by a numbered reference. This function assumes that the + * range of bytes has already been validated to contain only decimal digits. 
+ */ +static uint32_t +pm_numbered_reference_read_node_number(pm_parser_t *parser, const pm_token_t *token) { + const uint8_t *start = token->start + 1; + const uint8_t *end = token->end; + + ptrdiff_t diff = end - start; + assert(diff > 0); +#if PTRDIFF_MAX > SIZE_MAX + assert(diff < (ptrdiff_t) SIZE_MAX); +#endif + size_t length = (size_t) diff; + + char *digits = xcalloc(length + 1, sizeof(char)); + memcpy(digits, start, length); + digits[length] = '\0'; + + char *endptr; + errno = 0; + unsigned long value = strtoul(digits, &endptr, 10); + + if ((digits == endptr) || (*endptr != '\0')) { + pm_parser_err(parser, start, end, PM_ERR_INVALID_NUMBER_DECIMAL); + value = 0; + } + + xfree(digits); + + if ((errno == ERANGE) || (value > NTH_REF_MAX)) { + PM_PARSER_WARN_FORMAT(parser, start, end, PM_WARN_INVALID_NUMBERED_REFERENCE, (int) (length + 1), (const char *) token->start); + value = 0; + } + + return (uint32_t) value; +} + +#undef NTH_REF_MAX + +/** + * Allocate and initialize a new NthReferenceReadNode node. + */ +static pm_numbered_reference_read_node_t * +pm_numbered_reference_read_node_create(pm_parser_t *parser, const pm_token_t *name) { + assert(name->type == PM_TOKEN_NUMBERED_REFERENCE); + pm_numbered_reference_read_node_t *node = PM_NODE_ALLOC(parser, pm_numbered_reference_read_node_t); + + *node = (pm_numbered_reference_read_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_NUMBERED_REFERENCE_READ_NODE, 0, name), + .number = pm_numbered_reference_read_node_number(parser, name) + }; + + return node; +} + +/** + * Allocate a new OptionalParameterNode node. 
+ */ +static pm_optional_parameter_node_t * +pm_optional_parameter_node_create(pm_parser_t *parser, const pm_token_t *name, const pm_token_t *operator, pm_node_t *value) { + pm_optional_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_optional_parameter_node_t); + + *node = (pm_optional_parameter_node_t) { + .base = PM_NODE_INIT_TOKEN_NODE(parser, PM_OPTIONAL_PARAMETER_NODE, 0, name, value), + .name = pm_parser_constant_id_token(parser, name), + .name_loc = PM_LOCATION_TOKEN_VALUE(name), + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .value = value + }; + + return node; +} + +/** + * Allocate and initialize a new OrNode node. + */ +static pm_or_node_t * +pm_or_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *operator, pm_node_t *right) { + pm_assert_value_expression(parser, left); + + pm_or_node_t *node = PM_NODE_ALLOC(parser, pm_or_node_t); + + *node = (pm_or_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_OR_NODE, 0, left, right), + .left = left, + .right = right, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) + }; + + return node; +} + +/** + * Allocate and initialize a new ParametersNode node. + */ +static pm_parameters_node_t * +pm_parameters_node_create(pm_parser_t *parser) { + pm_parameters_node_t *node = PM_NODE_ALLOC(parser, pm_parameters_node_t); + + *node = (pm_parameters_node_t) { + .base = PM_NODE_INIT_UNSET(parser, PM_PARAMETERS_NODE, 0), + .rest = NULL, + .keyword_rest = NULL, + .block = NULL, + .requireds = { 0 }, + .optionals = { 0 }, + .posts = { 0 }, + .keywords = { 0 } + }; + + return node; +} + +/** + * Set the location properly for the parameters node. + */ +static void +pm_parameters_node_location_set(pm_parameters_node_t *params, pm_node_t *param) { + if (params->base.location.start == NULL) { + params->base.location.start = param->location.start; + } else { + params->base.location.start = params->base.location.start < param->location.start ? 
params->base.location.start : param->location.start; + } + + if (params->base.location.end == NULL) { + params->base.location.end = param->location.end; + } else { + params->base.location.end = params->base.location.end > param->location.end ? params->base.location.end : param->location.end; + } +} + +/** + * Append a required parameter to a ParametersNode node. + */ +static void +pm_parameters_node_requireds_append(pm_parameters_node_t *params, pm_node_t *param) { + pm_parameters_node_location_set(params, param); + pm_node_list_append(¶ms->requireds, param); +} + +/** + * Append an optional parameter to a ParametersNode node. + */ +static void +pm_parameters_node_optionals_append(pm_parameters_node_t *params, pm_optional_parameter_node_t *param) { + pm_parameters_node_location_set(params, UP(param)); + pm_node_list_append(¶ms->optionals, UP(param)); +} + +/** + * Append a post optional arguments parameter to a ParametersNode node. + */ +static void +pm_parameters_node_posts_append(pm_parameters_node_t *params, pm_node_t *param) { + pm_parameters_node_location_set(params, param); + pm_node_list_append(¶ms->posts, param); +} + +/** + * Set the rest parameter on a ParametersNode node. + */ +static void +pm_parameters_node_rest_set(pm_parameters_node_t *params, pm_node_t *param) { + pm_parameters_node_location_set(params, param); + params->rest = param; +} + +/** + * Append a keyword parameter to a ParametersNode node. + */ +static void +pm_parameters_node_keywords_append(pm_parameters_node_t *params, pm_node_t *param) { + pm_parameters_node_location_set(params, param); + pm_node_list_append(¶ms->keywords, param); +} + +/** + * Set the keyword rest parameter on a ParametersNode node. 
+ */ +static void +pm_parameters_node_keyword_rest_set(pm_parameters_node_t *params, pm_node_t *param) { + assert(params->keyword_rest == NULL); + pm_parameters_node_location_set(params, param); + params->keyword_rest = param; +} + +/** + * Set the block parameter on a ParametersNode node. + */ +static void +pm_parameters_node_block_set(pm_parameters_node_t *params, pm_block_parameter_node_t *param) { + assert(params->block == NULL); + pm_parameters_node_location_set(params, UP(param)); + params->block = param; +} + +/** + * Allocate a new ProgramNode node. + */ +static pm_program_node_t * +pm_program_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, pm_statements_node_t *statements) { + pm_program_node_t *node = PM_NODE_ALLOC(parser, pm_program_node_t); + + *node = (pm_program_node_t) { + .base = PM_NODE_INIT_NODE(parser, PM_PROGRAM_NODE, 0, statements), + .locals = *locals, + .statements = statements + }; + + return node; +} + +/** + * Allocate and initialize new ParenthesesNode node. + */ +static pm_parentheses_node_t * +pm_parentheses_node_create(pm_parser_t *parser, const pm_token_t *opening, pm_node_t *body, const pm_token_t *closing, pm_node_flags_t flags) { + pm_parentheses_node_t *node = PM_NODE_ALLOC(parser, pm_parentheses_node_t); + + *node = (pm_parentheses_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_PARENTHESES_NODE, flags, opening, closing), + .body = body, + .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_LOCATION_TOKEN_VALUE(closing) + }; + + return node; +} + +/** + * Allocate and initialize a new PinnedExpressionNode node. 
+ */ +static pm_pinned_expression_node_t * +pm_pinned_expression_node_create(pm_parser_t *parser, pm_node_t *expression, const pm_token_t *operator, const pm_token_t *lparen, const pm_token_t *rparen) { + pm_pinned_expression_node_t *node = PM_NODE_ALLOC(parser, pm_pinned_expression_node_t); + + *node = (pm_pinned_expression_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_PINNED_EXPRESSION_NODE, 0, operator, rparen), + .expression = expression, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .lparen_loc = PM_LOCATION_TOKEN_VALUE(lparen), + .rparen_loc = PM_LOCATION_TOKEN_VALUE(rparen) + }; + + return node; +} + +/** + * Allocate and initialize a new PinnedVariableNode node. + */ +static pm_pinned_variable_node_t * +pm_pinned_variable_node_create(pm_parser_t *parser, const pm_token_t *operator, pm_node_t *variable) { + pm_pinned_variable_node_t *node = PM_NODE_ALLOC(parser, pm_pinned_variable_node_t); + + *node = (pm_pinned_variable_node_t) { + .base = PM_NODE_INIT_TOKEN_NODE(parser, PM_PINNED_VARIABLE_NODE, 0, operator, variable), + .variable = variable, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) + }; + + return node; +} + +/** + * Allocate and initialize a new PostExecutionNode node. + */ +static pm_post_execution_node_t * +pm_post_execution_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_token_t *opening, pm_statements_node_t *statements, const pm_token_t *closing) { + pm_post_execution_node_t *node = PM_NODE_ALLOC(parser, pm_post_execution_node_t); + + *node = (pm_post_execution_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_POST_EXECUTION_NODE, 0, keyword, closing), + .statements = statements, + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_LOCATION_TOKEN_VALUE(closing) + }; + + return node; +} + +/** + * Allocate and initialize a new PreExecutionNode node. 
+ */ +static pm_pre_execution_node_t * +pm_pre_execution_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_token_t *opening, pm_statements_node_t *statements, const pm_token_t *closing) { + pm_pre_execution_node_t *node = PM_NODE_ALLOC(parser, pm_pre_execution_node_t); + + *node = (pm_pre_execution_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_PRE_EXECUTION_NODE, 0, keyword, closing), + .statements = statements, + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), + .closing_loc = PM_LOCATION_TOKEN_VALUE(closing) + }; + + return node; +} + +/** + * Allocate and initialize new RangeNode node. + */ +static pm_range_node_t * +pm_range_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *operator, pm_node_t *right) { + pm_assert_value_expression(parser, left); + pm_assert_value_expression(parser, right); + + pm_range_node_t *node = PM_NODE_ALLOC(parser, pm_range_node_t); + pm_node_flags_t flags = 0; + + // Indicate that this node is an exclusive range if the operator is `...`. + if (operator->type == PM_TOKEN_DOT_DOT_DOT || operator->type == PM_TOKEN_UDOT_DOT_DOT) { + flags |= PM_RANGE_FLAGS_EXCLUDE_END; + } + + // Indicate that this node is a static literal (i.e., can be compiled with + // a putobject in CRuby) if the left and right are implicit nil, explicit + // nil, or integers. + if ( + (left == NULL || PM_NODE_TYPE_P(left, PM_NIL_NODE) || PM_NODE_TYPE_P(left, PM_INTEGER_NODE)) && + (right == NULL || PM_NODE_TYPE_P(right, PM_NIL_NODE) || PM_NODE_TYPE_P(right, PM_INTEGER_NODE)) + ) { + flags |= PM_NODE_FLAG_STATIC_LITERAL; + } + + *node = (pm_range_node_t) { + .base = PM_NODE_INIT(parser, PM_RANGE_NODE, flags, (left == NULL ? operator->start : left->location.start), (right == NULL ? 
operator->end : right->location.end)), + .left = left, + .right = right, + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) + }; + + return node; +} + +/** + * Allocate and initialize a new RedoNode node. + */ +static pm_redo_node_t * +pm_redo_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_KEYWORD_REDO); + pm_redo_node_t *node = PM_NODE_ALLOC(parser, pm_redo_node_t); + + *node = (pm_redo_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_REDO_NODE, 0, token) + }; + + return node; +} + +/** + * Allocate a new initialize a new RegularExpressionNode node with the given + * unescaped string. + */ +static pm_regular_expression_node_t * +pm_regular_expression_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *unescaped) { + pm_regular_expression_node_t *node = PM_NODE_ALLOC(parser, pm_regular_expression_node_t); + pm_node_flags_t flags = pm_regular_expression_flags_create(parser, closing) | PM_NODE_FLAG_STATIC_LITERAL; + + *node = (pm_regular_expression_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_REGULAR_EXPRESSION_NODE, flags, opening, closing), + .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), + .content_loc = PM_LOCATION_TOKEN_VALUE(content), + .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), + .unescaped = *unescaped + }; + + return node; +} + +/** + * Allocate a new initialize a new RegularExpressionNode node. + */ +static inline pm_regular_expression_node_t * +pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) { + return pm_regular_expression_node_create_unescaped(parser, opening, content, closing, &PM_STRING_EMPTY); +} + +/** + * Allocate a new RequiredParameterNode node. 
+ */ +static pm_required_parameter_node_t * +pm_required_parameter_node_create(pm_parser_t *parser, const pm_token_t *token) { + pm_required_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_required_parameter_node_t); + + *node = (pm_required_parameter_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_REQUIRED_PARAMETER_NODE, 0, token), + .name = pm_parser_constant_id_token(parser, token) + }; + + return node; +} + +/** + * Allocate a new RescueModifierNode node. + */ +static pm_rescue_modifier_node_t * +pm_rescue_modifier_node_create(pm_parser_t *parser, pm_node_t *expression, const pm_token_t *keyword, pm_node_t *rescue_expression) { + pm_rescue_modifier_node_t *node = PM_NODE_ALLOC(parser, pm_rescue_modifier_node_t); + + *node = (pm_rescue_modifier_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_RESCUE_MODIFIER_NODE, 0, expression, rescue_expression), + .expression = expression, + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .rescue_expression = rescue_expression + }; + + return node; +} + +/** + * Allocate and initialize a new RescueNode node. + */ +static pm_rescue_node_t * +pm_rescue_node_create(pm_parser_t *parser, const pm_token_t *keyword) { + pm_rescue_node_t *node = PM_NODE_ALLOC(parser, pm_rescue_node_t); + + *node = (pm_rescue_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_RESCUE_NODE, 0, keyword), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .operator_loc = { 0 }, + .then_keyword_loc = { 0 }, + .reference = NULL, + .statements = NULL, + .subsequent = NULL, + .exceptions = { 0 } + }; + + return node; +} + +static inline void +pm_rescue_node_operator_set(pm_rescue_node_t *node, const pm_token_t *operator) { + node->operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator); +} + +/** + * Set the reference of a rescue node, and update the location of the node. 
+ */ +static void +pm_rescue_node_reference_set(pm_rescue_node_t *node, pm_node_t *reference) { + node->reference = reference; + node->base.location.end = reference->location.end; +} + +/** + * Set the statements of a rescue node, and update the location of the node. + */ +static void +pm_rescue_node_statements_set(pm_rescue_node_t *node, pm_statements_node_t *statements) { + node->statements = statements; + if (pm_statements_node_body_length(statements) > 0) { + node->base.location.end = statements->base.location.end; + } +} + +/** + * Set the subsequent of a rescue node, and update the location. + */ +static void +pm_rescue_node_subsequent_set(pm_rescue_node_t *node, pm_rescue_node_t *subsequent) { + node->subsequent = subsequent; + node->base.location.end = subsequent->base.location.end; +} + +/** + * Append an exception node to a rescue node, and update the location. + */ +static void +pm_rescue_node_exceptions_append(pm_rescue_node_t *node, pm_node_t *exception) { + pm_node_list_append(&node->exceptions, exception); + node->base.location.end = exception->location.end; +} + +/** + * Allocate a new RestParameterNode node. + */ +static pm_rest_parameter_node_t * +pm_rest_parameter_node_create(pm_parser_t *parser, const pm_token_t *operator, const pm_token_t *name) { + pm_rest_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_rest_parameter_node_t); + + *node = (pm_rest_parameter_node_t) { + .base = ( + (name->type == PM_TOKEN_NOT_PROVIDED) + ? PM_NODE_INIT_TOKEN(parser, PM_REST_PARAMETER_NODE, 0, operator) + : PM_NODE_INIT_TOKENS(parser, PM_REST_PARAMETER_NODE, 0, operator, name) + ), + .name = pm_parser_optional_constant_id_token(parser, name), + .name_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(name), + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) + }; + + return node; +} + +/** + * Allocate and initialize a new RetryNode node. 
+ */ +static pm_retry_node_t * +pm_retry_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_KEYWORD_RETRY); + pm_retry_node_t *node = PM_NODE_ALLOC(parser, pm_retry_node_t); + + *node = (pm_retry_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_RETRY_NODE, 0, token) + }; + + return node; +} + +/** + * Allocate a new ReturnNode node. + */ +static pm_return_node_t * +pm_return_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_arguments_node_t *arguments) { + pm_return_node_t *node = PM_NODE_ALLOC(parser, pm_return_node_t); + + *node = (pm_return_node_t) { + .base = ( + (arguments == NULL) + ? PM_NODE_INIT_TOKEN(parser, PM_RETURN_NODE, 0, keyword) + : PM_NODE_INIT_TOKEN_NODE(parser, PM_RETURN_NODE, 0, keyword, arguments) + ), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .arguments = arguments + }; + + return node; +} + +/** + * Allocate and initialize a new SelfNode node. + */ +static pm_self_node_t * +pm_self_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_KEYWORD_SELF); + pm_self_node_t *node = PM_NODE_ALLOC(parser, pm_self_node_t); + + *node = (pm_self_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_SELF_NODE, 0, token) + }; + + return node; +} + +/** + * Allocate and initialize a new ShareableConstantNode node. + */ +static pm_shareable_constant_node_t * +pm_shareable_constant_node_create(pm_parser_t *parser, pm_node_t *write, pm_shareable_constant_value_t value) { + pm_shareable_constant_node_t *node = PM_NODE_ALLOC(parser, pm_shareable_constant_node_t); + + *node = (pm_shareable_constant_node_t) { + .base = PM_NODE_INIT_NODE(parser, PM_SHAREABLE_CONSTANT_NODE, (pm_node_flags_t) value, write), + .write = write + }; + + return node; +} + +/** + * Allocate a new SingletonClassNode node. 
+ */ +static pm_singleton_class_node_t * +pm_singleton_class_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, const pm_token_t *class_keyword, const pm_token_t *operator, pm_node_t *expression, pm_node_t *body, const pm_token_t *end_keyword) { + pm_singleton_class_node_t *node = PM_NODE_ALLOC(parser, pm_singleton_class_node_t); + + *node = (pm_singleton_class_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_SINGLETON_CLASS_NODE, 0, class_keyword, end_keyword), + .locals = *locals, + .class_keyword_loc = PM_LOCATION_TOKEN_VALUE(class_keyword), + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .expression = expression, + .body = body, + .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword) + }; + + return node; +} + +/** + * Allocate and initialize a new SourceEncodingNode node. + */ +static pm_source_encoding_node_t * +pm_source_encoding_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_KEYWORD___ENCODING__); + pm_source_encoding_node_t *node = PM_NODE_ALLOC(parser, pm_source_encoding_node_t); + + *node = (pm_source_encoding_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_SOURCE_ENCODING_NODE, PM_NODE_FLAG_STATIC_LITERAL, token) + }; + + return node; +} + +/** + * Allocate and initialize a new SourceFileNode node. 
+ */ +static pm_source_file_node_t* +pm_source_file_node_create(pm_parser_t *parser, const pm_token_t *file_keyword) { + pm_source_file_node_t *node = PM_NODE_ALLOC(parser, pm_source_file_node_t); + assert(file_keyword->type == PM_TOKEN_KEYWORD___FILE__); + + pm_node_flags_t flags = 0; + + switch (parser->frozen_string_literal) { + case PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED: + flags |= PM_STRING_FLAGS_MUTABLE; + break; + case PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED: + flags |= PM_NODE_FLAG_STATIC_LITERAL | PM_STRING_FLAGS_FROZEN; + break; + } + + *node = (pm_source_file_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_SOURCE_FILE_NODE, flags, file_keyword), + .filepath = parser->filepath + }; + + return node; +} + +/** + * Allocate and initialize a new SourceLineNode node. + */ +static pm_source_line_node_t * +pm_source_line_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_KEYWORD___LINE__); + pm_source_line_node_t *node = PM_NODE_ALLOC(parser, pm_source_line_node_t); + + *node = (pm_source_line_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_SOURCE_LINE_NODE, PM_NODE_FLAG_STATIC_LITERAL, token) + }; + + return node; +} + +/** + * Allocate a new SplatNode node. + */ +static pm_splat_node_t * +pm_splat_node_create(pm_parser_t *parser, const pm_token_t *operator, pm_node_t *expression) { + pm_splat_node_t *node = PM_NODE_ALLOC(parser, pm_splat_node_t); + + *node = (pm_splat_node_t) { + .base = ( + (expression == NULL) + ? PM_NODE_INIT_TOKEN(parser, PM_SPLAT_NODE, 0, operator) + : PM_NODE_INIT_TOKEN_NODE(parser, PM_SPLAT_NODE, 0, operator, expression) + ), + .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), + .expression = expression + }; + + return node; +} + +/** + * Allocate and initialize a new StatementsNode node. 
+ */ +static pm_statements_node_t * +pm_statements_node_create(pm_parser_t *parser) { + pm_statements_node_t *node = PM_NODE_ALLOC(parser, pm_statements_node_t); + + *node = (pm_statements_node_t) { + .base = PM_NODE_INIT_BASE(parser, PM_STATEMENTS_NODE, 0), + .body = { 0 } + }; + + return node; +} + +/** + * Get the length of the given StatementsNode node's body. + */ +static size_t +pm_statements_node_body_length(pm_statements_node_t *node) { + return node && node->body.size; +} + +/** + * Set the location of the given StatementsNode. + */ +static void +pm_statements_node_location_set(pm_statements_node_t *node, const uint8_t *start, const uint8_t *end) { + node->base.location = (pm_location_t) { .start = start, .end = end }; +} + +/** + * Update the location of the statements node based on the statement that is + * being added to the list. + */ +static inline void +pm_statements_node_body_update(pm_statements_node_t *node, pm_node_t *statement) { + if (pm_statements_node_body_length(node) == 0 || statement->location.start < node->base.location.start) { + node->base.location.start = statement->location.start; + } + + if (statement->location.end > node->base.location.end) { + node->base.location.end = statement->location.end; + } +} + +/** + * Append a new node to the given StatementsNode node's body. 
+ */ +static void +pm_statements_node_body_append(pm_parser_t *parser, pm_statements_node_t *node, pm_node_t *statement, bool newline) { + pm_statements_node_body_update(node, statement); + + if (node->body.size > 0) { + const pm_node_t *previous = node->body.nodes[node->body.size - 1]; + + switch (PM_NODE_TYPE(previous)) { + case PM_BREAK_NODE: + case PM_NEXT_NODE: + case PM_REDO_NODE: + case PM_RETRY_NODE: + case PM_RETURN_NODE: + pm_parser_warn_node(parser, statement, PM_WARN_UNREACHABLE_STATEMENT); + break; + default: + break; + } + } + + pm_node_list_append(&node->body, statement); + if (newline) pm_node_flag_set(statement, PM_NODE_FLAG_NEWLINE); +} + +/** + * Prepend a new node to the given StatementsNode node's body. + */ +static void +pm_statements_node_body_prepend(pm_statements_node_t *node, pm_node_t *statement) { + pm_statements_node_body_update(node, statement); + pm_node_list_prepend(&node->body, statement); + pm_node_flag_set(statement, PM_NODE_FLAG_NEWLINE); +} + +/** + * Allocate a new StringNode node with the current string on the parser. + */ +static inline pm_string_node_t * +pm_string_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *string) { + pm_string_node_t *node = PM_NODE_ALLOC(parser, pm_string_node_t); + pm_node_flags_t flags = 0; + + switch (parser->frozen_string_literal) { + case PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED: + flags = PM_STRING_FLAGS_MUTABLE; + break; + case PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED: + flags = PM_NODE_FLAG_STATIC_LITERAL | PM_STRING_FLAGS_FROZEN; + break; + } + + const uint8_t *start = (opening->type == PM_TOKEN_NOT_PROVIDED ? content->start : opening->start); + const uint8_t *end = (closing->type == PM_TOKEN_NOT_PROVIDED ? 
content->end : closing->end); + + *node = (pm_string_node_t) { + .base = PM_NODE_INIT(parser, PM_STRING_NODE, flags, start, end), + .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), + .content_loc = PM_LOCATION_TOKEN_VALUE(content), + .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), + .unescaped = *string + }; + + return node; +} + +/** + * Allocate a new StringNode node. + */ +static pm_string_node_t * +pm_string_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) { + return pm_string_node_create_unescaped(parser, opening, content, closing, &PM_STRING_EMPTY); +} + +/** + * Allocate a new StringNode node and create it using the current string on the + * parser. + */ +static pm_string_node_t * +pm_string_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) { + pm_string_node_t *node = pm_string_node_create_unescaped(parser, opening, content, closing, &parser->current_string); + parser->current_string = PM_STRING_EMPTY; + return node; +} + +/** + * Allocate and initialize a new SuperNode node. + */ +static pm_super_node_t * +pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_arguments_t *arguments) { + assert(keyword->type == PM_TOKEN_KEYWORD_SUPER); + pm_super_node_t *node = PM_NODE_ALLOC(parser, pm_super_node_t); + + const uint8_t *end = pm_arguments_end(arguments); + if (end == NULL) { + assert(false && "unreachable"); + } + + *node = (pm_super_node_t) { + .base = PM_NODE_INIT(parser, PM_SUPER_NODE, 0, keyword->start, end), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .lparen_loc = arguments->opening_loc, + .arguments = arguments->arguments, + .rparen_loc = arguments->closing_loc, + .block = arguments->block + }; + + return node; +} + +/** + * Read through the contents of a string and check if it consists solely of + * US-ASCII code points. 
+ */ +static bool +pm_ascii_only_p(const pm_string_t *contents) { + const size_t length = pm_string_length(contents); + const uint8_t *source = pm_string_source(contents); + + for (size_t index = 0; index < length; index++) { + if (source[index] & 0x80) return false; + } + + return true; +} + +/** + * Validate that the contents of the given symbol are all valid UTF-8. + */ +static void +parse_symbol_encoding_validate_utf8(pm_parser_t *parser, const pm_token_t *location, const pm_string_t *contents) { + for (const uint8_t *cursor = pm_string_source(contents), *end = cursor + pm_string_length(contents); cursor < end;) { + size_t width = pm_encoding_utf_8_char_width(cursor, end - cursor); + + if (width == 0) { + pm_parser_err(parser, location->start, location->end, PM_ERR_INVALID_SYMBOL); + break; + } + + cursor += width; + } +} + +/** + * Validate that the contents of the given symbol are all valid in the encoding + * of the parser. + */ +static void +parse_symbol_encoding_validate_other(pm_parser_t *parser, const pm_token_t *location, const pm_string_t *contents) { + const pm_encoding_t *encoding = parser->encoding; + + for (const uint8_t *cursor = pm_string_source(contents), *end = cursor + pm_string_length(contents); cursor < end;) { + size_t width = encoding->char_width(cursor, end - cursor); + + if (width == 0) { + pm_parser_err(parser, location->start, location->end, PM_ERR_INVALID_SYMBOL); + break; + } + + cursor += width; + } +} + +/** + * Ruby "downgrades" the encoding of Symbols to US-ASCII if the associated + * encoding is ASCII-compatible and the Symbol consists only of US-ASCII code + * points. Otherwise, the encoding may be explicitly set with an escape + * sequence. + * + * If the validate flag is set, then it will check the contents of the symbol + * to ensure that all characters are valid in the encoding. 
+ */ +static inline pm_node_flags_t +parse_symbol_encoding(pm_parser_t *parser, const pm_token_t *location, const pm_string_t *contents, bool validate) { + if (parser->explicit_encoding != NULL) { + // A Symbol may optionally have its encoding explicitly set. This will + // happen if an escape sequence results in a non-ASCII code point. + if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + if (validate) parse_symbol_encoding_validate_utf8(parser, location, contents); + return PM_SYMBOL_FLAGS_FORCED_UTF8_ENCODING; + } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + return PM_SYMBOL_FLAGS_FORCED_BINARY_ENCODING; + } else if (validate) { + parse_symbol_encoding_validate_other(parser, location, contents); + } + } else if (pm_ascii_only_p(contents)) { + // Ruby stipulates that all source files must use an ASCII-compatible + // encoding. Thus, all symbols appearing in source are eligible for + // "downgrading" to US-ASCII. + return PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING; + } else if (validate) { + parse_symbol_encoding_validate_other(parser, location, contents); + } + + return 0; +} + +static pm_node_flags_t +parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) { + assert ((modifier == 'n' && modifier_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) || + (modifier == 'u' && modifier_encoding == PM_ENCODING_UTF_8_ENTRY) || + (modifier == 'e' && modifier_encoding == PM_ENCODING_EUC_JP_ENTRY) || + (modifier == 's' && modifier_encoding == PM_ENCODING_WINDOWS_31J_ENTRY)); + + // There's special validation logic used if a string does not contain any character escape sequences. 
+ if (parser->explicit_encoding == NULL) { + // If an ASCII-only string without character escapes is used with an encoding modifier, then resulting Regexp + // has the modifier encoding, unless the ASCII-8BIT modifier is used, in which case the Regexp "downgrades" to + // the US-ASCII encoding. + if (ascii_only) { + return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags; + } + + if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + if (!ascii_only) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + } else if (parser->encoding != modifier_encoding) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name); + + if (modifier == 'n' && !ascii_only) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) pm_string_length(source), (const char *) pm_string_source(source)); + } + } + + return flags; + } + + // TODO (nirvdrum 21-Feb-2024): To validate regexp sources with character escape sequences we need to know whether hex or Unicode escape sequences were used and Prism doesn't currently provide that data. We handle a subset of unambiguous cases in the meanwhile. + bool mixed_encoding = false; + + if (mixed_encoding) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, (int) pm_string_length(source), (const char *) pm_string_source(source)); + } else if (modifier != 'n' && parser->explicit_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) { + // TODO (nirvdrum 21-Feb-2024): Validate the content is valid in the modifier encoding. Do this on-demand so we don't pay the cost of computation unnecessarily. 
+ bool valid_string_in_modifier_encoding = true; + + if (!valid_string_in_modifier_encoding) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, (int) pm_string_length(source), (const char *) pm_string_source(source)); + } + } else if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + // TODO (nirvdrum 21-Feb-2024): There's currently no way to tell if the source used hex or Unicode character escapes from `explicit_encoding` alone. If the source encoding was already UTF-8, both character escape types would set `explicit_encoding` to UTF-8, but need to be processed differently. Skip for now. + if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, (int) pm_string_length(source), (const char *) pm_string_source(source)); + } + } + + // We've determined the encoding would naturally be EUC-JP and there is no need to force the encoding to anything else. + return flags; +} + +/** + * Ruby "downgrades" the encoding of Regexps to US-ASCII if the associated encoding is ASCII-compatible and + * the unescaped representation of a Regexp source consists only of US-ASCII code points. This is true even + * when the Regexp is explicitly given an ASCII-8BIT encoding via the (/n) modifier. Otherwise, the encoding + * may be explicitly set with an escape sequence. + */ +static pm_node_flags_t +parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, bool ascii_only, pm_node_flags_t flags) { + // TODO (nirvdrum 22-Feb-2024): CRuby reports a special Regexp-specific error for invalid Unicode ranges. We either need to scan again or modify the "invalid Unicode escape sequence" message we already report. 
+ bool valid_unicode_range = true; + if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && !valid_unicode_range) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, (int) pm_string_length(source), (const char *) pm_string_source(source)); + return flags; + } + + // US-ASCII strings do not admit multi-byte character literals. However, character escape sequences corresponding + // to multi-byte characters are allowed. + if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) { + // CRuby will continue processing even though a SyntaxError has already been detected. It may result in the + // following error message appearing twice. We do the same for compatibility. + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + + /** + * Start checking modifier flags. We need to process these before considering any explicit encodings that may have + * been set by character literals. The order in which the encoding modifiers is checked does not matter. In the + * event that both an encoding modifier and an explicit encoding would result in the same encoding we do not set + * the corresponding "forced_<encoding>" flag. Instead, the caller should check the encoding modifier flag and + * determine the encoding that way. 
+ */ + + if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) { + return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY); + } + + if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) { + return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY); + } + + if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) { + return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY); + } + + if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) { + return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY); + } + + // At this point no encoding modifiers will be present on the regular expression as they would have already + // been processed. Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all + // regular expressions without an encoding modifier appearing in source are eligible for "downgrading" to US-ASCII. + if (ascii_only) { + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING; + } + + // A Regexp may optionally have its encoding explicitly set via a character escape sequence in the source string + // or by specifying a modifier. + // + // NB: an explicitly set encoding is ignored by Ruby if the Regexp consists of only US ASCII code points. + if (parser->explicit_encoding != NULL) { + if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING; + } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING; + } + } + + return 0; +} + +/** + * Allocate and initialize a new SymbolNode node with the given unescaped + * string. 
+ */ +static pm_symbol_node_t * +pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing, const pm_string_t *unescaped, pm_node_flags_t flags) { + pm_symbol_node_t *node = PM_NODE_ALLOC(parser, pm_symbol_node_t); + + const uint8_t *start = (opening->type == PM_TOKEN_NOT_PROVIDED ? value->start : opening->start); + const uint8_t *end = (closing->type == PM_TOKEN_NOT_PROVIDED ? value->end : closing->end); + + *node = (pm_symbol_node_t) { + .base = PM_NODE_INIT(parser, PM_SYMBOL_NODE, PM_NODE_FLAG_STATIC_LITERAL | flags, start, end), + .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), + .value_loc = PM_LOCATION_TOKEN_VALUE(value), + .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), + .unescaped = *unescaped + }; + + return node; +} + +/** + * Allocate and initialize a new SymbolNode node. + */ +static inline pm_symbol_node_t * +pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) { + return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_STRING_EMPTY, 0); +} + +/** + * Allocate and initialize a new SymbolNode node with the current string. + */ +static pm_symbol_node_t * +pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) { + pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string, parse_symbol_encoding(parser, value, &parser->current_string, false)); + parser->current_string = PM_STRING_EMPTY; + return node; +} + +/** + * Allocate and initialize a new SymbolNode node from a label. 
+ */ +static pm_symbol_node_t * +pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) { + pm_symbol_node_t *node; + + switch (token->type) { + case PM_TOKEN_LABEL: { + pm_token_t opening = not_provided(parser); + pm_token_t closing = { .type = PM_TOKEN_LABEL_END, .start = token->end - 1, .end = token->end }; + + pm_token_t label = { .type = PM_TOKEN_LABEL, .start = token->start, .end = token->end - 1 }; + node = pm_symbol_node_create(parser, &opening, &label, &closing); + + assert((label.end - label.start) >= 0); + pm_string_shared_init(&node->unescaped, label.start, label.end); + pm_node_flag_set(UP(node), parse_symbol_encoding(parser, &label, &node->unescaped, false)); + + break; + } + case PM_TOKEN_MISSING: { + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + + pm_token_t label = { .type = PM_TOKEN_LABEL, .start = token->start, .end = token->end }; + node = pm_symbol_node_create(parser, &opening, &label, &closing); + break; + } + default: + assert(false && "unreachable"); + node = NULL; + break; + } + + return node; +} + +/** + * Allocate and initialize a new synthesized SymbolNode node. + */ +static pm_symbol_node_t * +pm_symbol_node_synthesized_create(pm_parser_t *parser, const char *content) { + pm_symbol_node_t *node = PM_NODE_ALLOC(parser, pm_symbol_node_t); + + *node = (pm_symbol_node_t) { + .base = PM_NODE_INIT_BASE(parser, PM_SYMBOL_NODE, PM_NODE_FLAG_STATIC_LITERAL | PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING), + .value_loc = PM_LOCATION_NULL_VALUE(parser), + .unescaped = { 0 } + }; + + pm_string_constant_init(&node->unescaped, content, strlen(content)); + return node; +} + +/** + * Check if the given node is a label in a hash. 
+ */ +static bool +pm_symbol_node_label_p(pm_node_t *node) { + const uint8_t *end = NULL; + + switch (PM_NODE_TYPE(node)) { + case PM_SYMBOL_NODE: + end = ((pm_symbol_node_t *) node)->closing_loc.end; + break; + case PM_INTERPOLATED_SYMBOL_NODE: + end = ((pm_interpolated_symbol_node_t *) node)->closing_loc.end; + break; + default: + return false; + } + + return (end != NULL) && (end[-1] == ':'); +} + +/** + * Convert the given StringNode node to a SymbolNode node. + */ +static pm_symbol_node_t * +pm_string_node_to_symbol_node(pm_parser_t *parser, pm_string_node_t *node, const pm_token_t *opening, const pm_token_t *closing) { + pm_symbol_node_t *new_node = PM_NODE_ALLOC(parser, pm_symbol_node_t); + + *new_node = (pm_symbol_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_SYMBOL_NODE, PM_NODE_FLAG_STATIC_LITERAL, opening, closing), + .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), + .value_loc = node->content_loc, + .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), + .unescaped = node->unescaped + }; + + pm_token_t content = { .type = PM_TOKEN_IDENTIFIER, .start = node->content_loc.start, .end = node->content_loc.end }; + pm_node_flag_set(UP(new_node), parse_symbol_encoding(parser, &content, &node->unescaped, true)); + + // We are explicitly _not_ using pm_node_destroy here because we don't want + // to trash the unescaped string. We could instead copy the string if we + // know that it is owned, but we're taking the fast path for now. + xfree(node); + + return new_node; +} + +/** + * Convert the given SymbolNode node to a StringNode node. 
+ */ +static pm_string_node_t * +pm_symbol_node_to_string_node(pm_parser_t *parser, pm_symbol_node_t *node) { + pm_string_node_t *new_node = PM_NODE_ALLOC(parser, pm_string_node_t); + pm_node_flags_t flags = 0; + + switch (parser->frozen_string_literal) { + case PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED: + flags = PM_STRING_FLAGS_MUTABLE; + break; + case PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED: + flags = PM_NODE_FLAG_STATIC_LITERAL | PM_STRING_FLAGS_FROZEN; + break; + } + + *new_node = (pm_string_node_t) { + .base = PM_NODE_INIT_NODE(parser, PM_STRING_NODE, flags, node), + .opening_loc = node->opening_loc, + .content_loc = node->value_loc, + .closing_loc = node->closing_loc, + .unescaped = node->unescaped + }; + + // We are explicitly _not_ using pm_node_destroy here because we don't want + // to trash the unescaped string. We could instead copy the string if we + // know that it is owned, but we're taking the fast path for now. + xfree(node); + + return new_node; +} + +/** + * Allocate and initialize a new TrueNode node. + */ +static pm_true_node_t * +pm_true_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_KEYWORD_TRUE); + pm_true_node_t *node = PM_NODE_ALLOC(parser, pm_true_node_t); + + *node = (pm_true_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_TRUE_NODE, PM_NODE_FLAG_STATIC_LITERAL, token) + }; + + return node; +} + +/** + * Allocate and initialize a new synthesized TrueNode node. + */ +static pm_true_node_t * +pm_true_node_synthesized_create(pm_parser_t *parser) { + pm_true_node_t *node = PM_NODE_ALLOC(parser, pm_true_node_t); + + *node = (pm_true_node_t) { + .base = PM_NODE_INIT_BASE(parser, PM_TRUE_NODE, PM_NODE_FLAG_STATIC_LITERAL) + }; + + return node; +} + +/** + * Allocate and initialize a new UndefNode node. 
+ */ +static pm_undef_node_t * +pm_undef_node_create(pm_parser_t *parser, const pm_token_t *token) { + assert(token->type == PM_TOKEN_KEYWORD_UNDEF); + pm_undef_node_t *node = PM_NODE_ALLOC(parser, pm_undef_node_t); + + *node = (pm_undef_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_UNDEF_NODE, 0, token), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(token), + .names = { 0 } + }; + + return node; +} + +/** + * Append a name to an undef node. + */ +static void +pm_undef_node_append(pm_undef_node_t *node, pm_node_t *name) { + node->base.location.end = name->location.end; + pm_node_list_append(&node->names, name); +} + +/** + * Allocate a new UnlessNode node. + */ +static pm_unless_node_t * +pm_unless_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_node_t *predicate, const pm_token_t *then_keyword, pm_statements_node_t *statements) { + pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); + + pm_unless_node_t *node = PM_NODE_ALLOC(parser, pm_unless_node_t); + pm_node_t *end = statements == NULL ? predicate : UP(statements); + + *node = (pm_unless_node_t) { + .base = PM_NODE_INIT_TOKEN_NODE(parser, PM_UNLESS_NODE, PM_NODE_FLAG_NEWLINE, keyword, end), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .predicate = predicate, + .then_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(then_keyword), + .statements = statements, + .else_clause = NULL, + .end_keyword_loc = { 0 } + }; + + return node; +} + +/** + * Allocate and initialize new UnlessNode node in the modifier form. 
+ */ +static pm_unless_node_t * +pm_unless_node_modifier_create(pm_parser_t *parser, pm_node_t *statement, const pm_token_t *unless_keyword, pm_node_t *predicate) { + pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); + pm_unless_node_t *node = PM_NODE_ALLOC(parser, pm_unless_node_t); + + pm_statements_node_t *statements = pm_statements_node_create(parser); + pm_statements_node_body_append(parser, statements, statement, true); + + *node = (pm_unless_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_UNLESS_NODE, PM_NODE_FLAG_NEWLINE, statement, predicate), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(unless_keyword), + .predicate = predicate, + .then_keyword_loc = { 0 }, + .statements = statements, + .else_clause = NULL, + .end_keyword_loc = { 0 } + }; + + return node; +} + +static inline void +pm_unless_node_end_keyword_loc_set(pm_unless_node_t *node, const pm_token_t *end_keyword) { + node->end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword); + node->base.location.end = end_keyword->end; +} + +/** + * Loop modifiers could potentially modify an expression that contains block + * exits. In this case we need to loop through them and remove them from the + * list of block exits so that they do not later get marked as invalid. + */ +static void +pm_loop_modifier_block_exits(pm_parser_t *parser, pm_statements_node_t *statements) { + assert(parser->current_block_exits != NULL); + + // All of the block exits that we want to remove should be within the + // statements, and since we are modifying the statements, we shouldn't have + // to check the end location. + const uint8_t *start = statements->base.location.start; + + for (size_t index = parser->current_block_exits->size; index > 0; index--) { + pm_node_t *block_exit = parser->current_block_exits->nodes[index - 1]; + if (block_exit->location.start < start) break; + + // Implicitly remove from the list by lowering the size. 
+ parser->current_block_exits->size--; + } +} + +/** + * Allocate a new UntilNode node. + */ +static pm_until_node_t * +pm_until_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_token_t *do_keyword, const pm_token_t *closing, pm_node_t *predicate, pm_statements_node_t *statements, pm_node_flags_t flags) { + pm_until_node_t *node = PM_NODE_ALLOC(parser, pm_until_node_t); + pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); + + *node = (pm_until_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_UNTIL_NODE, flags, keyword, closing), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .do_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(do_keyword), + .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), + .predicate = predicate, + .statements = statements + }; + + return node; +} + +/** + * Allocate a new UntilNode node. + */ +static pm_until_node_t * +pm_until_node_modifier_create(pm_parser_t *parser, const pm_token_t *keyword, pm_node_t *predicate, pm_statements_node_t *statements, pm_node_flags_t flags) { + pm_until_node_t *node = PM_NODE_ALLOC(parser, pm_until_node_t); + pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); + pm_loop_modifier_block_exits(parser, statements); + + *node = (pm_until_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_UNTIL_NODE, flags, statements, predicate), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .do_keyword_loc = { 0 }, + .closing_loc = { 0 }, + .predicate = predicate, + .statements = statements + }; + + return node; +} + +/** + * Allocate and initialize a new WhenNode node. 
+ */ +static pm_when_node_t * +pm_when_node_create(pm_parser_t *parser, const pm_token_t *keyword) { + pm_when_node_t *node = PM_NODE_ALLOC(parser, pm_when_node_t); + + *node = (pm_when_node_t) { + .base = PM_NODE_INIT_TOKEN(parser, PM_WHEN_NODE, 0, keyword), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .statements = NULL, + .then_keyword_loc = { 0 }, + .conditions = { 0 } + }; + + return node; +} + +/** + * Append a new condition to a when node. + */ +static void +pm_when_node_conditions_append(pm_when_node_t *node, pm_node_t *condition) { + node->base.location.end = condition->location.end; + pm_node_list_append(&node->conditions, condition); +} + +/** + * Set the location of the then keyword of a when node. + */ +static inline void +pm_when_node_then_keyword_loc_set(pm_when_node_t *node, const pm_token_t *then_keyword) { + node->base.location.end = then_keyword->end; + node->then_keyword_loc = PM_LOCATION_TOKEN_VALUE(then_keyword); +} + +/** + * Set the statements list of a when node. + */ +static void +pm_when_node_statements_set(pm_when_node_t *node, pm_statements_node_t *statements) { + if (statements->base.location.end > node->base.location.end) { + node->base.location.end = statements->base.location.end; + } + + node->statements = statements; +} + +/** + * Allocate a new WhileNode node. 
+ */ +static pm_while_node_t * +pm_while_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_token_t *do_keyword, const pm_token_t *closing, pm_node_t *predicate, pm_statements_node_t *statements, pm_node_flags_t flags) { + pm_while_node_t *node = PM_NODE_ALLOC(parser, pm_while_node_t); + pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); + + *node = (pm_while_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_WHILE_NODE, flags, keyword, closing), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .do_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(do_keyword), + .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), + .predicate = predicate, + .statements = statements + }; + + return node; +} + +/** + * Allocate a new WhileNode node. + */ +static pm_while_node_t * +pm_while_node_modifier_create(pm_parser_t *parser, const pm_token_t *keyword, pm_node_t *predicate, pm_statements_node_t *statements, pm_node_flags_t flags) { + pm_while_node_t *node = PM_NODE_ALLOC(parser, pm_while_node_t); + pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); + pm_loop_modifier_block_exits(parser, statements); + + *node = (pm_while_node_t) { + .base = PM_NODE_INIT_NODES(parser, PM_WHILE_NODE, flags, statements, predicate), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .do_keyword_loc = { 0 }, + .closing_loc = { 0 }, + .predicate = predicate, + .statements = statements + }; + + return node; +} + +/** + * Allocate and initialize a new synthesized while loop. 
+ */ +static pm_while_node_t * +pm_while_node_synthesized_create(pm_parser_t *parser, pm_node_t *predicate, pm_statements_node_t *statements) { + pm_while_node_t *node = PM_NODE_ALLOC(parser, pm_while_node_t); + + *node = (pm_while_node_t) { + .base = PM_NODE_INIT_BASE(parser, PM_WHILE_NODE, 0), + .keyword_loc = PM_LOCATION_NULL_VALUE(parser), + .do_keyword_loc = PM_LOCATION_NULL_VALUE(parser), + .closing_loc = PM_LOCATION_NULL_VALUE(parser), + .predicate = predicate, + .statements = statements + }; + + return node; +} + +/** + * Allocate and initialize a new XStringNode node with the given unescaped + * string. + */ +static pm_x_string_node_t * +pm_xstring_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *unescaped) { + pm_x_string_node_t *node = PM_NODE_ALLOC(parser, pm_x_string_node_t); + + *node = (pm_x_string_node_t) { + .base = PM_NODE_INIT_TOKENS(parser, PM_X_STRING_NODE, PM_STRING_FLAGS_FROZEN, opening, closing), + .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), + .content_loc = PM_LOCATION_TOKEN_VALUE(content), + .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), + .unescaped = *unescaped + }; + + return node; +} + +/** + * Allocate and initialize a new XStringNode node. + */ +static inline pm_x_string_node_t * +pm_xstring_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) { + return pm_xstring_node_create_unescaped(parser, opening, content, closing, &PM_STRING_EMPTY); +} + +/** + * Allocate a new YieldNode node. 
+ */ +static pm_yield_node_t * +pm_yield_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_location_t *lparen_loc, pm_arguments_node_t *arguments, const pm_location_t *rparen_loc) { + pm_yield_node_t *node = PM_NODE_ALLOC(parser, pm_yield_node_t); + + const uint8_t *end; + if (rparen_loc->start != NULL) { + end = rparen_loc->end; + } else if (arguments != NULL) { + end = arguments->base.location.end; + } else if (lparen_loc->start != NULL) { + end = lparen_loc->end; + } else { + end = keyword->end; + } + + *node = (pm_yield_node_t) { + .base = PM_NODE_INIT(parser, PM_YIELD_NODE, 0, keyword->start, end), + .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), + .lparen_loc = *lparen_loc, + .arguments = arguments, + .rparen_loc = *rparen_loc + }; + + return node; +} + +/** + * Check if any of the currently visible scopes contain a local variable + * described by the given constant id. + */ +static int +pm_parser_local_depth_constant_id(pm_parser_t *parser, pm_constant_id_t constant_id) { + pm_scope_t *scope = parser->current_scope; + int depth = 0; + + while (scope != NULL) { + if (pm_locals_find(&scope->locals, constant_id) != UINT32_MAX) return depth; + if (scope->closed) break; + + scope = scope->previous; + depth++; + } + + return -1; +} + +/** + * Check if any of the currently visible scopes contain a local variable + * described by the given token. This function implicitly inserts a constant + * into the constant pool. + */ +static inline int +pm_parser_local_depth(pm_parser_t *parser, pm_token_t *token) { + return pm_parser_local_depth_constant_id(parser, pm_parser_constant_id_token(parser, token)); +} + +/** + * Add a constant id to the local table of the current scope. 
+ */ +static inline void +pm_parser_local_add(pm_parser_t *parser, pm_constant_id_t constant_id, const uint8_t *start, const uint8_t *end, uint32_t reads) { + pm_locals_write(&parser->current_scope->locals, constant_id, start, end, reads); +} + +/** + * Add a local variable from a location to the current scope. + */ +static pm_constant_id_t +pm_parser_local_add_location(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, uint32_t reads) { + pm_constant_id_t constant_id = pm_parser_constant_id_location(parser, start, end); + if (constant_id != 0) pm_parser_local_add(parser, constant_id, start, end, reads); + return constant_id; +} + +/** + * Add a local variable from a token to the current scope. + */ +static inline pm_constant_id_t +pm_parser_local_add_token(pm_parser_t *parser, pm_token_t *token, uint32_t reads) { + return pm_parser_local_add_location(parser, token->start, token->end, reads); +} + +/** + * Add a local variable from an owned string to the current scope. + */ +static pm_constant_id_t +pm_parser_local_add_owned(pm_parser_t *parser, uint8_t *start, size_t length) { + pm_constant_id_t constant_id = pm_parser_constant_id_owned(parser, start, length); + if (constant_id != 0) pm_parser_local_add(parser, constant_id, parser->start, parser->start, 1); + return constant_id; +} + +/** + * Add a local variable from a constant string to the current scope. + */ +static pm_constant_id_t +pm_parser_local_add_constant(pm_parser_t *parser, const char *start, size_t length) { + pm_constant_id_t constant_id = pm_parser_constant_id_constant(parser, start, length); + if (constant_id != 0) pm_parser_local_add(parser, constant_id, parser->start, parser->start, 1); + return constant_id; +} + +/** + * Add a parameter name to the current scope and check whether the name of the + * parameter is unique or not. + * + * Returns `true` if this is a duplicate parameter name, otherwise returns + * false. 
+ */ +static bool +pm_parser_parameter_name_check(pm_parser_t *parser, const pm_token_t *name) { + // We want to check whether the parameter name is a numbered parameter or + // not. + pm_refute_numbered_parameter(parser, name->start, name->end); + + // Otherwise we'll fetch the constant id for the parameter name and check + // whether it's already in the current scope. + pm_constant_id_t constant_id = pm_parser_constant_id_token(parser, name); + + if (pm_locals_find(&parser->current_scope->locals, constant_id) != UINT32_MAX) { + // Add an error if the parameter doesn't start with _ and has been seen before + if ((name->start < name->end) && (*name->start != '_')) { + pm_parser_err_token(parser, name, PM_ERR_PARAMETER_NAME_DUPLICATED); + } + return true; + } + return false; +} + +/** + * Pop the current scope off the scope stack. + */ +static void +pm_parser_scope_pop(pm_parser_t *parser) { + pm_scope_t *scope = parser->current_scope; + parser->current_scope = scope->previous; + pm_locals_free(&scope->locals); + pm_node_list_free(&scope->implicit_parameters); + xfree(scope); +} + +/******************************************************************************/ +/* Stack helpers */ +/******************************************************************************/ + +/** + * Pushes a value onto the stack. + */ +static inline void +pm_state_stack_push(pm_state_stack_t *stack, bool value) { + *stack = (*stack << 1) | (value & 1); +} + +/** + * Pops a value off the stack. + */ +static inline void +pm_state_stack_pop(pm_state_stack_t *stack) { + *stack >>= 1; +} + +/** + * Returns the value at the top of the stack. + */ +static inline bool +pm_state_stack_p(const pm_state_stack_t *stack) { + return *stack & 1; +} + +static inline void +pm_accepts_block_stack_push(pm_parser_t *parser, bool value) { + // Use the negation of the value to prevent stack overflow. 
+ pm_state_stack_push(&parser->accepts_block_stack, !value); +} + +static inline void +pm_accepts_block_stack_pop(pm_parser_t *parser) { + pm_state_stack_pop(&parser->accepts_block_stack); +} + +static inline bool +pm_accepts_block_stack_p(pm_parser_t *parser) { + return !pm_state_stack_p(&parser->accepts_block_stack); +} + +static inline void +pm_do_loop_stack_push(pm_parser_t *parser, bool value) { + pm_state_stack_push(&parser->do_loop_stack, value); +} + +static inline void +pm_do_loop_stack_pop(pm_parser_t *parser) { + pm_state_stack_pop(&parser->do_loop_stack); +} + +static inline bool +pm_do_loop_stack_p(pm_parser_t *parser) { + return pm_state_stack_p(&parser->do_loop_stack); +} + +/******************************************************************************/ +/* Lexer check helpers */ +/******************************************************************************/ + +/** + * Get the next character in the source starting from +cursor+. If that position + * is beyond the end of the source then return '\0'. + */ +static inline uint8_t +peek_at(const pm_parser_t *parser, const uint8_t *cursor) { + if (cursor < parser->end) { + return *cursor; + } else { + return '\0'; + } +} + +/** + * Get the next character in the source starting from parser->current.end and + * adding the given offset. If that position is beyond the end of the source + * then return '\0'. + */ +static inline uint8_t +peek_offset(pm_parser_t *parser, ptrdiff_t offset) { + return peek_at(parser, parser->current.end + offset); +} + +/** + * Get the next character in the source starting from parser->current.end. If + * that position is beyond the end of the source then return '\0'. + */ +static inline uint8_t +peek(const pm_parser_t *parser) { + return peek_at(parser, parser->current.end); +} + +/** + * If the character to be read matches the given value, then returns true and + * advances the current pointer. 
+ */ +static inline bool +match(pm_parser_t *parser, uint8_t value) { + if (peek(parser) == value) { + parser->current.end++; + return true; + } + return false; +} + +/** + * Return the length of the line ending string starting at +cursor+, or 0 if it + * is not a line ending. This function is intended to be CRLF/LF agnostic. + */ +static inline size_t +match_eol_at(pm_parser_t *parser, const uint8_t *cursor) { + if (peek_at(parser, cursor) == '\n') { + return 1; + } + if (peek_at(parser, cursor) == '\r' && peek_at(parser, cursor + 1) == '\n') { + return 2; + } + return 0; +} + +/** + * Return the length of the line ending string starting at + * `parser->current.end + offset`, or 0 if it is not a line ending. This + * function is intended to be CRLF/LF agnostic. + */ +static inline size_t +match_eol_offset(pm_parser_t *parser, ptrdiff_t offset) { + return match_eol_at(parser, parser->current.end + offset); +} + +/** + * Return the length of the line ending string starting at parser->current.end, + * or 0 if it is not a line ending. This function is intended to be CRLF/LF + * agnostic. + */ +static inline size_t +match_eol(pm_parser_t *parser) { + return match_eol_at(parser, parser->current.end); +} + +/** + * Skip to the next newline character or NUL byte. + */ +static inline const uint8_t * +next_newline(const uint8_t *cursor, ptrdiff_t length) { + assert(length >= 0); + + // Note that it's okay for us to use memchr here to look for \n because none + // of the encodings that we support have \n as a component of a multi-byte + // character. + return memchr(cursor, '\n', (size_t) length); +} + +/** + * This is equivalent to the predicate of warn_balanced in CRuby. 
+ */ +static inline bool +ambiguous_operator_p(const pm_parser_t *parser, bool space_seen) { + return !lex_state_p(parser, PM_LEX_STATE_CLASS | PM_LEX_STATE_DOT | PM_LEX_STATE_FNAME | PM_LEX_STATE_ENDFN) && space_seen && !pm_char_is_whitespace(peek(parser)); +} + +/** + * Here we're going to check if this is a "magic" comment, and perform whatever + * actions are necessary for it here. + */ +static bool +parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { + const pm_encoding_t *encoding = pm_encoding_find(start, end); + + if (encoding != NULL) { + if (parser->encoding != encoding) { + parser->encoding = encoding; + if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); + } + + parser->encoding_changed = (encoding != PM_ENCODING_UTF_8_ENTRY); + return true; + } + + return false; +} + +/** + * Look for a specific pattern of "coding" and potentially set the encoding on + * the parser. + */ +static void +parser_lex_magic_comment_encoding(pm_parser_t *parser) { + const uint8_t *cursor = parser->current.start + 1; + const uint8_t *end = parser->current.end; + + bool separator = false; + while (true) { + if (end - cursor <= 6) return; + switch (cursor[6]) { + case 'C': case 'c': cursor += 6; continue; + case 'O': case 'o': cursor += 5; continue; + case 'D': case 'd': cursor += 4; continue; + case 'I': case 'i': cursor += 3; continue; + case 'N': case 'n': cursor += 2; continue; + case 'G': case 'g': cursor += 1; continue; + case '=': case ':': + separator = true; + cursor += 6; + break; + default: + cursor += 6; + if (pm_char_is_whitespace(*cursor)) break; + continue; + } + if (pm_strncasecmp(cursor - 6, (const uint8_t *) "coding", 6) == 0) break; + separator = false; + } + + while (true) { + do { + if (++cursor >= end) return; + } while (pm_char_is_whitespace(*cursor)); + + if (separator) break; + if (*cursor != '=' && *cursor != ':') return; + + separator = true; + cursor++; + } + + 
const uint8_t *value_start = cursor; + while ((*cursor == '-' || *cursor == '_' || parser->encoding->alnum_char(cursor, 1)) && ++cursor < end); + + if (!parser_lex_magic_comment_encoding_value(parser, value_start, cursor)) { + // If we were unable to parse the encoding value, then we've got an + // issue because we didn't understand the encoding that the user was + // trying to use. In this case we'll keep using the default encoding but + // add an error to the parser to indicate an unsuccessful parse. + pm_parser_err(parser, value_start, cursor, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT); + } +} + +typedef enum { + PM_MAGIC_COMMENT_BOOLEAN_VALUE_TRUE, + PM_MAGIC_COMMENT_BOOLEAN_VALUE_FALSE, + PM_MAGIC_COMMENT_BOOLEAN_VALUE_INVALID +} pm_magic_comment_boolean_value_t; + +/** + * Check if this is a magic comment that includes the frozen_string_literal + * pragma. If it does, set that field on the parser. + */ +static pm_magic_comment_boolean_value_t +parser_lex_magic_comment_boolean_value(const uint8_t *value_start, uint32_t value_length) { + if (value_length == 4 && pm_strncasecmp(value_start, (const uint8_t *) "true", 4) == 0) { + return PM_MAGIC_COMMENT_BOOLEAN_VALUE_TRUE; + } else if (value_length == 5 && pm_strncasecmp(value_start, (const uint8_t *) "false", 5) == 0) { + return PM_MAGIC_COMMENT_BOOLEAN_VALUE_FALSE; + } else { + return PM_MAGIC_COMMENT_BOOLEAN_VALUE_INVALID; + } +} + +static inline bool +pm_char_is_magic_comment_key_delimiter(const uint8_t b) { + return b == '\'' || b == '"' || b == ':' || b == ';'; +} + +/** + * Find an emacs magic comment marker (-*-) within the given bounds. If one is + * found, it returns a pointer to the start of the marker. Otherwise it returns + * NULL. 
+ */ +static inline const uint8_t * +parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor, const uint8_t *end) { + while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor, '-', (size_t) (end - cursor), parser->encoding_changed, parser->encoding)) != NULL) { + if (cursor + 3 <= end && cursor[1] == '*' && cursor[2] == '-') { + return cursor; + } + cursor++; + } + return NULL; +} + +/** + * Parse the current token on the parser to see if it's a magic comment and + * potentially perform some action based on that. A regular expression that this + * function is effectively matching is: + * + * %r"([^\\s\'\":;]+)\\s*:\\s*(\"(?:\\\\.|[^\"])*\"|[^\"\\s;]+)[\\s;]*" + * + * It returns true if it consumes the entire comment. Otherwise it returns + * false. + */ +static inline bool +parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { + bool result = true; + + const uint8_t *start = parser->current.start + 1; + const uint8_t *end = parser->current.end; + if (end - start <= 7) return false; + + const uint8_t *cursor; + bool indicator = false; + + if ((cursor = parser_lex_magic_comment_emacs_marker(parser, start, end)) != NULL) { + start = cursor + 3; + + if ((cursor = parser_lex_magic_comment_emacs_marker(parser, start, end)) != NULL) { + end = cursor; + indicator = true; + } else { + // If we have a start marker but not an end marker, then we cannot + // have a magic comment. 
+ return false; + } + } + + cursor = start; + while (cursor < end) { + while (cursor < end && (pm_char_is_magic_comment_key_delimiter(*cursor) || pm_char_is_whitespace(*cursor))) cursor++; + + const uint8_t *key_start = cursor; + while (cursor < end && (!pm_char_is_magic_comment_key_delimiter(*cursor) && !pm_char_is_whitespace(*cursor))) cursor++; + + const uint8_t *key_end = cursor; + while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++; + if (cursor == end) break; + + if (*cursor == ':') { + cursor++; + } else { + if (!indicator) return false; + continue; + } + + while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++; + if (cursor == end) break; + + const uint8_t *value_start; + const uint8_t *value_end; + + if (*cursor == '"') { + value_start = ++cursor; + for (; cursor < end && *cursor != '"'; cursor++) { + if (*cursor == '\\' && (cursor + 1 < end)) cursor++; + } + value_end = cursor; + if (cursor < end && *cursor == '"') cursor++; + } else { + value_start = cursor; + while (cursor < end && *cursor != '"' && *cursor != ';' && !pm_char_is_whitespace(*cursor)) cursor++; + value_end = cursor; + } + + if (indicator) { + while (cursor < end && (*cursor == ';' || pm_char_is_whitespace(*cursor))) cursor++; + } else { + while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++; + if (cursor != end) return false; + } + + // Here, we need to do some processing on the key to swap out dashes for + // underscores. We only need to do this if there _is_ a dash in the key. 
+ pm_string_t key; + const size_t key_length = (size_t) (key_end - key_start); + const uint8_t *dash = pm_memchr(key_start, '-', key_length, parser->encoding_changed, parser->encoding); + + if (dash == NULL) { + pm_string_shared_init(&key, key_start, key_end); + } else { + uint8_t *buffer = xmalloc(key_length); + if (buffer == NULL) break; + + memcpy(buffer, key_start, key_length); + buffer[dash - key_start] = '_'; + + while ((dash = pm_memchr(dash + 1, '-', (size_t) (key_end - dash - 1), parser->encoding_changed, parser->encoding)) != NULL) { + buffer[dash - key_start] = '_'; + } + + pm_string_owned_init(&key, buffer, key_length); + } + + // Finally, we can start checking the key against the list of known + // magic comment keys, and potentially change state based on that. + const uint8_t *key_source = pm_string_source(&key); + uint32_t value_length = (uint32_t) (value_end - value_start); + + // We only want to attempt to compare against encoding comments if it's + // the first line in the file (or the second in the case of a shebang). 
+ if (parser->current.start == parser->encoding_comment_start && !parser->encoding_locked) { + if ( + (key_length == 8 && pm_strncasecmp(key_source, (const uint8_t *) "encoding", 8) == 0) || + (key_length == 6 && pm_strncasecmp(key_source, (const uint8_t *) "coding", 6) == 0) + ) { + result = parser_lex_magic_comment_encoding_value(parser, value_start, value_end); + } + } + + if (key_length == 11) { + if (pm_strncasecmp(key_source, (const uint8_t *) "warn_indent", 11) == 0) { + switch (parser_lex_magic_comment_boolean_value(value_start, value_length)) { + case PM_MAGIC_COMMENT_BOOLEAN_VALUE_INVALID: + PM_PARSER_WARN_TOKEN_FORMAT( + parser, + parser->current, + PM_WARN_INVALID_MAGIC_COMMENT_VALUE, + (int) key_length, + (const char *) key_source, + (int) value_length, + (const char *) value_start + ); + break; + case PM_MAGIC_COMMENT_BOOLEAN_VALUE_FALSE: + parser->warn_mismatched_indentation = false; + break; + case PM_MAGIC_COMMENT_BOOLEAN_VALUE_TRUE: + parser->warn_mismatched_indentation = true; + break; + } + } + } else if (key_length == 21) { + if (pm_strncasecmp(key_source, (const uint8_t *) "frozen_string_literal", 21) == 0) { + // We only want to handle frozen string literal comments if it's + // before any semantic tokens have been seen. 
+ if (semantic_token_seen) { + pm_parser_warn_token(parser, &parser->current, PM_WARN_IGNORED_FROZEN_STRING_LITERAL); + } else { + switch (parser_lex_magic_comment_boolean_value(value_start, value_length)) { + case PM_MAGIC_COMMENT_BOOLEAN_VALUE_INVALID: + PM_PARSER_WARN_TOKEN_FORMAT( + parser, + parser->current, + PM_WARN_INVALID_MAGIC_COMMENT_VALUE, + (int) key_length, + (const char *) key_source, + (int) value_length, + (const char *) value_start + ); + break; + case PM_MAGIC_COMMENT_BOOLEAN_VALUE_FALSE: + parser->frozen_string_literal = PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED; + break; + case PM_MAGIC_COMMENT_BOOLEAN_VALUE_TRUE: + parser->frozen_string_literal = PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED; + break; + } + } + } + } else if (key_length == 24) { + if (pm_strncasecmp(key_source, (const uint8_t *) "shareable_constant_value", 24) == 0) { + const uint8_t *cursor = parser->current.start; + while ((cursor > parser->start) && ((cursor[-1] == ' ') || (cursor[-1] == '\t'))) cursor--; + + if (!((cursor == parser->start) || (cursor[-1] == '\n'))) { + pm_parser_warn_token(parser, &parser->current, PM_WARN_SHAREABLE_CONSTANT_VALUE_LINE); + } else if (value_length == 4 && pm_strncasecmp(value_start, (const uint8_t *) "none", 4) == 0) { + pm_parser_scope_shareable_constant_set(parser, PM_SCOPE_SHAREABLE_CONSTANT_NONE); + } else if (value_length == 7 && pm_strncasecmp(value_start, (const uint8_t *) "literal", 7) == 0) { + pm_parser_scope_shareable_constant_set(parser, PM_SCOPE_SHAREABLE_CONSTANT_LITERAL); + } else if (value_length == 23 && pm_strncasecmp(value_start, (const uint8_t *) "experimental_everything", 23) == 0) { + pm_parser_scope_shareable_constant_set(parser, PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_EVERYTHING); + } else if (value_length == 17 && pm_strncasecmp(value_start, (const uint8_t *) "experimental_copy", 17) == 0) { + pm_parser_scope_shareable_constant_set(parser, PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_COPY); + } else { + 
PM_PARSER_WARN_TOKEN_FORMAT( + parser, + parser->current, + PM_WARN_INVALID_MAGIC_COMMENT_VALUE, + (int) key_length, + (const char *) key_source, + (int) value_length, + (const char *) value_start + ); + } + } + } + + // When we're done, we want to free the string in case we had to + // allocate memory for it. + pm_string_free(&key); + + // Allocate a new magic comment node to append to the parser's list. + pm_magic_comment_t *magic_comment; + if ((magic_comment = (pm_magic_comment_t *) xcalloc(1, sizeof(pm_magic_comment_t))) != NULL) { + magic_comment->key_start = key_start; + magic_comment->value_start = value_start; + magic_comment->key_length = (uint32_t) key_length; + magic_comment->value_length = value_length; + pm_list_append(&parser->magic_comment_list, (pm_list_node_t *) magic_comment); + } + } + + return result; +} + +/******************************************************************************/ +/* Context manipulations */ +/******************************************************************************/ + +static const uint32_t context_terminators[] = { + [PM_CONTEXT_NONE] = 0, + [PM_CONTEXT_BEGIN] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_BEGIN_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_BEGIN_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_BEGIN_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_BLOCK_BRACES] = (1U << PM_TOKEN_BRACE_RIGHT), + [PM_CONTEXT_BLOCK_KEYWORDS] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE), + [PM_CONTEXT_BLOCK_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_BLOCK_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_BLOCK_PARAMETERS] = (1U << PM_TOKEN_PIPE), + [PM_CONTEXT_BLOCK_RESCUE] 
= (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_CASE_WHEN] = (1U << PM_TOKEN_KEYWORD_WHEN) | (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_ELSE), + [PM_CONTEXT_CASE_IN] = (1U << PM_TOKEN_KEYWORD_IN) | (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_ELSE), + [PM_CONTEXT_CLASS] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE), + [PM_CONTEXT_CLASS_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_CLASS_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_CLASS_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_DEF] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE), + [PM_CONTEXT_DEF_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_DEF_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_DEF_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_DEF_PARAMS] = (1U << PM_TOKEN_EOF), + [PM_CONTEXT_DEFINED] = (1U << PM_TOKEN_EOF), + [PM_CONTEXT_DEFAULT_PARAMS] = (1U << PM_TOKEN_COMMA) | (1U << PM_TOKEN_PARENTHESIS_RIGHT), + [PM_CONTEXT_ELSE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_ELSIF] = (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_ELSIF) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_EMBEXPR] = (1U << PM_TOKEN_EMBEXPR_END), + [PM_CONTEXT_FOR] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_FOR_INDEX] = (1U << PM_TOKEN_KEYWORD_IN), + [PM_CONTEXT_IF] = (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_ELSIF) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_LAMBDA_BRACES] = (1U << PM_TOKEN_BRACE_RIGHT), + [PM_CONTEXT_LAMBDA_DO_END] = (1U << PM_TOKEN_KEYWORD_END) | (1U << 
PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE), + [PM_CONTEXT_LAMBDA_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_LAMBDA_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_LAMBDA_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_LOOP_PREDICATE] = (1U << PM_TOKEN_KEYWORD_DO) | (1U << PM_TOKEN_KEYWORD_THEN), + [PM_CONTEXT_MAIN] = (1U << PM_TOKEN_EOF), + [PM_CONTEXT_MODULE] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE), + [PM_CONTEXT_MODULE_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_MODULE_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_MODULE_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_MULTI_TARGET] = (1U << PM_TOKEN_EOF), + [PM_CONTEXT_PARENS] = (1U << PM_TOKEN_PARENTHESIS_RIGHT), + [PM_CONTEXT_POSTEXE] = (1U << PM_TOKEN_BRACE_RIGHT), + [PM_CONTEXT_PREDICATE] = (1U << PM_TOKEN_KEYWORD_THEN) | (1U << PM_TOKEN_NEWLINE) | (1U << PM_TOKEN_SEMICOLON), + [PM_CONTEXT_PREEXE] = (1U << PM_TOKEN_BRACE_RIGHT), + [PM_CONTEXT_RESCUE_MODIFIER] = (1U << PM_TOKEN_EOF), + [PM_CONTEXT_SCLASS] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE), + [PM_CONTEXT_SCLASS_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_SCLASS_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_SCLASS_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_TERNARY] = (1U << PM_TOKEN_EOF), + [PM_CONTEXT_UNLESS] = (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_UNTIL] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_WHILE] = (1U << 
PM_TOKEN_KEYWORD_END), +}; + +static inline bool +context_terminator(pm_context_t context, pm_token_t *token) { + return token->type < 32 && (context_terminators[context] & (1U << token->type)); +} + +/** + * Returns the context that the given token is found to be terminating, or + * returns PM_CONTEXT_NONE. + */ +static pm_context_t +context_recoverable(const pm_parser_t *parser, pm_token_t *token) { + pm_context_node_t *context_node = parser->current_context; + + while (context_node != NULL) { + if (context_terminator(context_node->context, token)) return context_node->context; + context_node = context_node->prev; + } + + return PM_CONTEXT_NONE; +} + +static bool +context_push(pm_parser_t *parser, pm_context_t context) { + pm_context_node_t *context_node = (pm_context_node_t *) xmalloc(sizeof(pm_context_node_t)); + if (context_node == NULL) return false; + + *context_node = (pm_context_node_t) { .context = context, .prev = NULL }; + + if (parser->current_context == NULL) { + parser->current_context = context_node; + } else { + context_node->prev = parser->current_context; + parser->current_context = context_node; + } + + return true; +} + +static void +context_pop(pm_parser_t *parser) { + pm_context_node_t *prev = parser->current_context->prev; + xfree(parser->current_context); + parser->current_context = prev; +} + +static bool +context_p(const pm_parser_t *parser, pm_context_t context) { + pm_context_node_t *context_node = parser->current_context; + + while (context_node != NULL) { + if (context_node->context == context) return true; + context_node = context_node->prev; + } + + return false; +} + +static bool +context_def_p(const pm_parser_t *parser) { + pm_context_node_t *context_node = parser->current_context; + + while (context_node != NULL) { + switch (context_node->context) { + case PM_CONTEXT_DEF: + case PM_CONTEXT_DEF_PARAMS: + case PM_CONTEXT_DEF_ENSURE: + case PM_CONTEXT_DEF_RESCUE: + case PM_CONTEXT_DEF_ELSE: + return true; + case PM_CONTEXT_CLASS: + 
case PM_CONTEXT_CLASS_ENSURE: + case PM_CONTEXT_CLASS_RESCUE: + case PM_CONTEXT_CLASS_ELSE: + case PM_CONTEXT_MODULE: + case PM_CONTEXT_MODULE_ENSURE: + case PM_CONTEXT_MODULE_RESCUE: + case PM_CONTEXT_MODULE_ELSE: + case PM_CONTEXT_SCLASS: + case PM_CONTEXT_SCLASS_ENSURE: + case PM_CONTEXT_SCLASS_RESCUE: + case PM_CONTEXT_SCLASS_ELSE: + return false; + default: + context_node = context_node->prev; + } + } + + return false; +} + +/** + * Returns a human readable string for the given context, used in error + * messages. + */ +static const char * +context_human(pm_context_t context) { + switch (context) { + case PM_CONTEXT_NONE: + assert(false && "unreachable"); + return ""; + case PM_CONTEXT_BEGIN: return "begin statement"; + case PM_CONTEXT_BLOCK_BRACES: return "'{'..'}' block"; + case PM_CONTEXT_BLOCK_KEYWORDS: return "'do'..'end' block"; + case PM_CONTEXT_BLOCK_PARAMETERS: return "'|'..'|' block parameter"; + case PM_CONTEXT_CASE_WHEN: return "'when' clause"; + case PM_CONTEXT_CASE_IN: return "'in' clause"; + case PM_CONTEXT_CLASS: return "class definition"; + case PM_CONTEXT_DEF: return "method definition"; + case PM_CONTEXT_DEF_PARAMS: return "method parameters"; + case PM_CONTEXT_DEFAULT_PARAMS: return "parameter default value"; + case PM_CONTEXT_DEFINED: return "'defined?' 
expression"; + case PM_CONTEXT_ELSE: + case PM_CONTEXT_BEGIN_ELSE: + case PM_CONTEXT_BLOCK_ELSE: + case PM_CONTEXT_CLASS_ELSE: + case PM_CONTEXT_DEF_ELSE: + case PM_CONTEXT_LAMBDA_ELSE: + case PM_CONTEXT_MODULE_ELSE: + case PM_CONTEXT_SCLASS_ELSE: return "'else' clause"; + case PM_CONTEXT_ELSIF: return "'elsif' clause"; + case PM_CONTEXT_EMBEXPR: return "embedded expression"; + case PM_CONTEXT_BEGIN_ENSURE: + case PM_CONTEXT_BLOCK_ENSURE: + case PM_CONTEXT_CLASS_ENSURE: + case PM_CONTEXT_DEF_ENSURE: + case PM_CONTEXT_LAMBDA_ENSURE: + case PM_CONTEXT_MODULE_ENSURE: + case PM_CONTEXT_SCLASS_ENSURE: return "'ensure' clause"; + case PM_CONTEXT_FOR: return "for loop"; + case PM_CONTEXT_FOR_INDEX: return "for loop index"; + case PM_CONTEXT_IF: return "if statement"; + case PM_CONTEXT_LAMBDA_BRACES: return "'{'..'}' lambda block"; + case PM_CONTEXT_LAMBDA_DO_END: return "'do'..'end' lambda block"; + case PM_CONTEXT_LOOP_PREDICATE: return "loop predicate"; + case PM_CONTEXT_MAIN: return "top level context"; + case PM_CONTEXT_MODULE: return "module definition"; + case PM_CONTEXT_MULTI_TARGET: return "multiple targets"; + case PM_CONTEXT_PARENS: return "parentheses"; + case PM_CONTEXT_POSTEXE: return "'END' block"; + case PM_CONTEXT_PREDICATE: return "predicate"; + case PM_CONTEXT_PREEXE: return "'BEGIN' block"; + case PM_CONTEXT_BEGIN_RESCUE: + case PM_CONTEXT_BLOCK_RESCUE: + case PM_CONTEXT_CLASS_RESCUE: + case PM_CONTEXT_DEF_RESCUE: + case PM_CONTEXT_LAMBDA_RESCUE: + case PM_CONTEXT_MODULE_RESCUE: + case PM_CONTEXT_RESCUE_MODIFIER: + case PM_CONTEXT_SCLASS_RESCUE: return "'rescue' clause"; + case PM_CONTEXT_SCLASS: return "singleton class definition"; + case PM_CONTEXT_TERNARY: return "ternary expression"; + case PM_CONTEXT_UNLESS: return "unless statement"; + case PM_CONTEXT_UNTIL: return "until statement"; + case PM_CONTEXT_WHILE: return "while statement"; + } + + assert(false && "unreachable"); + return ""; +} + 
+/******************************************************************************/ +/* Specific token lexers */ +/******************************************************************************/ + +static inline void +pm_strspn_number_validate(pm_parser_t *parser, const uint8_t *string, size_t length, const uint8_t *invalid) { + if (invalid != NULL) { + pm_diagnostic_id_t diag_id = (invalid == (string + length - 1)) ? PM_ERR_INVALID_NUMBER_UNDERSCORE_TRAILING : PM_ERR_INVALID_NUMBER_UNDERSCORE_INNER; + pm_parser_err(parser, invalid, invalid + 1, diag_id); + } +} + +static size_t +pm_strspn_binary_number_validate(pm_parser_t *parser, const uint8_t *string) { + const uint8_t *invalid = NULL; + size_t length = pm_strspn_binary_number(string, parser->end - string, &invalid); + pm_strspn_number_validate(parser, string, length, invalid); + return length; +} + +static size_t +pm_strspn_octal_number_validate(pm_parser_t *parser, const uint8_t *string) { + const uint8_t *invalid = NULL; + size_t length = pm_strspn_octal_number(string, parser->end - string, &invalid); + pm_strspn_number_validate(parser, string, length, invalid); + return length; +} + +static size_t +pm_strspn_decimal_number_validate(pm_parser_t *parser, const uint8_t *string) { + const uint8_t *invalid = NULL; + size_t length = pm_strspn_decimal_number(string, parser->end - string, &invalid); + pm_strspn_number_validate(parser, string, length, invalid); + return length; +} + +static size_t +pm_strspn_hexadecimal_number_validate(pm_parser_t *parser, const uint8_t *string) { + const uint8_t *invalid = NULL; + size_t length = pm_strspn_hexadecimal_number(string, parser->end - string, &invalid); + pm_strspn_number_validate(parser, string, length, invalid); + return length; +} + +static pm_token_type_t +lex_optional_float_suffix(pm_parser_t *parser, bool* seen_e) { + pm_token_type_t type = PM_TOKEN_INTEGER; + + // Here we're going to attempt to parse the optional decimal portion of a + // float. 
If it's not there, then it's okay and we'll just continue on. + if (peek(parser) == '.') { + if (pm_char_is_decimal_digit(peek_offset(parser, 1))) { + parser->current.end += 2; + parser->current.end += pm_strspn_decimal_number_validate(parser, parser->current.end); + type = PM_TOKEN_FLOAT; + } else { + // If we had a . and then something else, then it's not a float + // suffix on a number it's a method call or something else. + return type; + } + } + + // Here we're going to attempt to parse the optional exponent portion of a + // float. If it's not there, it's okay and we'll just continue on. + if ((peek(parser) == 'e') || (peek(parser) == 'E')) { + if ((peek_offset(parser, 1) == '+') || (peek_offset(parser, 1) == '-')) { + parser->current.end += 2; + + if (pm_char_is_decimal_digit(peek(parser))) { + parser->current.end++; + parser->current.end += pm_strspn_decimal_number_validate(parser, parser->current.end); + } else { + pm_parser_err_current(parser, PM_ERR_INVALID_FLOAT_EXPONENT); + } + } else if (pm_char_is_decimal_digit(peek_offset(parser, 1))) { + parser->current.end++; + parser->current.end += pm_strspn_decimal_number_validate(parser, parser->current.end); + } else { + return type; + } + + *seen_e = true; + type = PM_TOKEN_FLOAT; + } + + return type; +} + +static pm_token_type_t +lex_numeric_prefix(pm_parser_t *parser, bool* seen_e) { + pm_token_type_t type = PM_TOKEN_INTEGER; + *seen_e = false; + + if (peek_offset(parser, -1) == '0') { + switch (*parser->current.end) { + // 0d1111 is a decimal number + case 'd': + case 'D': + parser->current.end++; + if (pm_char_is_decimal_digit(peek(parser))) { + parser->current.end += pm_strspn_decimal_number_validate(parser, parser->current.end); + } else { + match(parser, '_'); + pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_DECIMAL); + } + + break; + + // 0b1111 is a binary number + case 'b': + case 'B': + parser->current.end++; + if (pm_char_is_binary_digit(peek(parser))) { + parser->current.end += 
pm_strspn_binary_number_validate(parser, parser->current.end); + } else { + match(parser, '_'); + pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_BINARY); + } + + parser->integer_base = PM_INTEGER_BASE_FLAGS_BINARY; + break; + + // 0o1111 is an octal number + case 'o': + case 'O': + parser->current.end++; + if (pm_char_is_octal_digit(peek(parser))) { + parser->current.end += pm_strspn_octal_number_validate(parser, parser->current.end); + } else { + match(parser, '_'); + pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_OCTAL); + } + + parser->integer_base = PM_INTEGER_BASE_FLAGS_OCTAL; + break; + + // 01111 is an octal number + case '_': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + parser->current.end += pm_strspn_octal_number_validate(parser, parser->current.end); + parser->integer_base = PM_INTEGER_BASE_FLAGS_OCTAL; + break; + + // 0x1111 is a hexadecimal number + case 'x': + case 'X': + parser->current.end++; + if (pm_char_is_hexadecimal_digit(peek(parser))) { + parser->current.end += pm_strspn_hexadecimal_number_validate(parser, parser->current.end); + } else { + match(parser, '_'); + pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_HEXADECIMAL); + } + + parser->integer_base = PM_INTEGER_BASE_FLAGS_HEXADECIMAL; + break; + + // 0.xxx is a float + case '.': { + type = lex_optional_float_suffix(parser, seen_e); + break; + } + + // 0exxx is a float + case 'e': + case 'E': { + type = lex_optional_float_suffix(parser, seen_e); + break; + } + } + } else { + // If it didn't start with a 0, then we'll lex as far as we can into a + // decimal number. + parser->current.end += pm_strspn_decimal_number_validate(parser, parser->current.end); + + // Afterward, we'll lex as far as we can into an optional float suffix. 
+ type = lex_optional_float_suffix(parser, seen_e); + } + + // At this point we have a completed number, but we want to provide the user + // with a good experience if they put an additional .xxx fractional + // component on the end, so we'll check for that here. + if (peek_offset(parser, 0) == '.' && pm_char_is_decimal_digit(peek_offset(parser, 1))) { + const uint8_t *fraction_start = parser->current.end; + const uint8_t *fraction_end = parser->current.end + 2; + fraction_end += pm_strspn_decimal_digit(fraction_end, parser->end - fraction_end); + pm_parser_err(parser, fraction_start, fraction_end, PM_ERR_INVALID_NUMBER_FRACTION); + } + + return type; +} + +static pm_token_type_t +lex_numeric(pm_parser_t *parser) { + pm_token_type_t type = PM_TOKEN_INTEGER; + parser->integer_base = PM_INTEGER_BASE_FLAGS_DECIMAL; + + if (parser->current.end < parser->end) { + bool seen_e = false; + type = lex_numeric_prefix(parser, &seen_e); + + const uint8_t *end = parser->current.end; + pm_token_type_t suffix_type = type; + + if (type == PM_TOKEN_INTEGER) { + if (match(parser, 'r')) { + suffix_type = PM_TOKEN_INTEGER_RATIONAL; + + if (match(parser, 'i')) { + suffix_type = PM_TOKEN_INTEGER_RATIONAL_IMAGINARY; + } + } else if (match(parser, 'i')) { + suffix_type = PM_TOKEN_INTEGER_IMAGINARY; + } + } else { + if (!seen_e && match(parser, 'r')) { + suffix_type = PM_TOKEN_FLOAT_RATIONAL; + + if (match(parser, 'i')) { + suffix_type = PM_TOKEN_FLOAT_RATIONAL_IMAGINARY; + } + } else if (match(parser, 'i')) { + suffix_type = PM_TOKEN_FLOAT_IMAGINARY; + } + } + + const uint8_t b = peek(parser); + if (b != '\0' && (b >= 0x80 || ((b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')) || b == '_')) { + parser->current.end = end; + } else { + type = suffix_type; + } + } + + return type; +} + +static pm_token_type_t +lex_global_variable(pm_parser_t *parser) { + if (parser->current.end >= parser->end) { + pm_parser_err_token(parser, &parser->current, PM_ERR_GLOBAL_VARIABLE_BARE); + return 
PM_TOKEN_GLOBAL_VARIABLE; + } + + // True if multiple characters are allowed after the declaration of the + // global variable. Not true when it starts with "$-". + bool allow_multiple = true; + + switch (*parser->current.end) { + case '~': // $~: match-data + case '*': // $*: argv + case '$': // $$: pid + case '?': // $?: last status + case '!': // $!: error string + case '@': // $@: error position + case '/': // $/: input record separator + case '\\': // $\: output record separator + case ';': // $;: field separator + case ',': // $,: output field separator + case '.': // $.: last read line number + case '=': // $=: ignorecase + case ':': // $:: load path + case '<': // $<: reading filename + case '>': // $>: default output handle + case '\"': // $": already loaded files + parser->current.end++; + return PM_TOKEN_GLOBAL_VARIABLE; + + case '&': // $&: last match + case '`': // $`: string before last match + case '\'': // $': string after last match + case '+': // $+: string matches last paren. + parser->current.end++; + return lex_state_p(parser, PM_LEX_STATE_FNAME) ? PM_TOKEN_GLOBAL_VARIABLE : PM_TOKEN_BACK_REFERENCE; + + case '0': { + parser->current.end++; + size_t width; + + if ((width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) > 0) { + do { + parser->current.end += width; + } while ((width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) > 0); + + // $0 isn't allowed to be followed by anything. + pm_diagnostic_id_t diag_id = parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? 
PM_ERR_INVALID_VARIABLE_GLOBAL_3_3 : PM_ERR_INVALID_VARIABLE_GLOBAL; + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, diag_id); + } + + return PM_TOKEN_GLOBAL_VARIABLE; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + parser->current.end += pm_strspn_decimal_digit(parser->current.end, parser->end - parser->current.end); + return lex_state_p(parser, PM_LEX_STATE_FNAME) ? PM_TOKEN_GLOBAL_VARIABLE : PM_TOKEN_NUMBERED_REFERENCE; + + case '-': + parser->current.end++; + allow_multiple = false; + PRISM_FALLTHROUGH + default: { + size_t width; + + if ((width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) > 0) { + do { + parser->current.end += width; + } while (allow_multiple && (width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) > 0); + } else if (pm_char_is_whitespace(peek(parser))) { + // If we get here, then we have a $ followed by whitespace, + // which is not allowed. + pm_parser_err_token(parser, &parser->current, PM_ERR_GLOBAL_VARIABLE_BARE); + } else { + // If we get here, then we have a $ followed by something that + // isn't recognized as a global variable. + pm_diagnostic_id_t diag_id = parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_ERR_INVALID_VARIABLE_GLOBAL_3_3 : PM_ERR_INVALID_VARIABLE_GLOBAL; + const uint8_t *end = parser->current.end + parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); + PM_PARSER_ERR_FORMAT(parser, parser->current.start, end, diag_id, (int) (end - parser->current.start), (const char *) parser->current.start); + } + + return PM_TOKEN_GLOBAL_VARIABLE; + } + } +} + +/** + * This function checks if the current token matches a keyword. If it does, it + * returns the token type. Otherwise, it returns PM_TOKEN_EOF. 
The arguments are as follows: + * + * * `parser` - the parser object + * * `current_start` - pointer to the start of the current token + * * `value` - the literal string that we're checking for + * * `vlen` - the length of the token + * * `state` - the state that we should transition to if the token matches + * * `type` - the expected token type + * * `modifier_type` - the expected modifier token type + */ +static inline pm_token_type_t +lex_keyword(pm_parser_t *parser, const uint8_t *current_start, const char *value, size_t vlen, pm_lex_state_t state, pm_token_type_t type, pm_token_type_t modifier_type) { + if (memcmp(current_start, value, vlen) == 0) { + pm_lex_state_t last_state = parser->lex_state; + + if (parser->lex_state & PM_LEX_STATE_FNAME) { + lex_state_set(parser, PM_LEX_STATE_ENDFN); + } else { + lex_state_set(parser, state); + if (state == PM_LEX_STATE_BEG) { + parser->command_start = true; + } + + if ((modifier_type != PM_TOKEN_EOF) && !(last_state & (PM_LEX_STATE_BEG | PM_LEX_STATE_LABELED | PM_LEX_STATE_CLASS))) { + lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); + return modifier_type; + } + } + + return type; + } + + return PM_TOKEN_EOF; +} + +static pm_token_type_t +lex_identifier(pm_parser_t *parser, bool previous_command_start) { + // Lex as far as we can into the current identifier. + size_t width; + const uint8_t *end = parser->end; + const uint8_t *current_start = parser->current.start; + const uint8_t *current_end = parser->current.end; + bool encoding_changed = parser->encoding_changed; + + if (encoding_changed) { + while ((width = char_is_identifier(parser, current_end, end - current_end)) > 0) { + current_end += width; + } + } else { + while ((width = char_is_identifier_utf8(current_end, end - current_end)) > 0) { + current_end += width; + } + } + parser->current.end = current_end; + + // Now cache the length of the identifier so that we can quickly compare it + // against known keywords. 
+ width = (size_t) (current_end - current_start); + + if (current_end < end) { + if (((current_end + 1 >= end) || (current_end[1] != '=')) && (match(parser, '!') || match(parser, '?'))) { + // First we'll attempt to extend the identifier by a ! or ?. Then we'll + // check if we're returning the defined? keyword or just an identifier. + width++; + + if ( + ((lex_state_p(parser, PM_LEX_STATE_LABEL | PM_LEX_STATE_ENDFN) && !previous_command_start) || lex_state_arg_p(parser)) && + (peek(parser) == ':') && (peek_offset(parser, 1) != ':') + ) { + // If we're in a position where we can accept a : at the end of an + // identifier, then we'll optionally accept it. + lex_state_set(parser, PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED); + (void) match(parser, ':'); + return PM_TOKEN_LABEL; + } + + if (parser->lex_state != PM_LEX_STATE_DOT) { + if (width == 8 && (lex_keyword(parser, current_start, "defined?", width, PM_LEX_STATE_ARG, PM_TOKEN_KEYWORD_DEFINED, PM_TOKEN_EOF) != PM_TOKEN_EOF)) { + return PM_TOKEN_KEYWORD_DEFINED; + } + } + + return PM_TOKEN_METHOD_NAME; + } + + if (lex_state_p(parser, PM_LEX_STATE_FNAME) && peek_offset(parser, 1) != '~' && peek_offset(parser, 1) != '>' && (peek_offset(parser, 1) != '=' || peek_offset(parser, 2) == '>') && match(parser, '=')) { + // If we're in a position where we can accept a = at the end of an + // identifier, then we'll optionally accept it. + return PM_TOKEN_IDENTIFIER; + } + + if ( + ((lex_state_p(parser, PM_LEX_STATE_LABEL | PM_LEX_STATE_ENDFN) && !previous_command_start) || lex_state_arg_p(parser)) && + peek(parser) == ':' && peek_offset(parser, 1) != ':' + ) { + // If we're in a position where we can accept a : at the end of an + // identifier, then we'll optionally accept it. 
+ lex_state_set(parser, PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED); + (void) match(parser, ':'); + return PM_TOKEN_LABEL; + } + } + + if (parser->lex_state != PM_LEX_STATE_DOT) { + pm_token_type_t type; + switch (width) { + case 2: + if (lex_keyword(parser, current_start, "do", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_DO, PM_TOKEN_EOF) != PM_TOKEN_EOF) { + if (pm_do_loop_stack_p(parser)) { + return PM_TOKEN_KEYWORD_DO_LOOP; + } + return PM_TOKEN_KEYWORD_DO; + } + + if ((type = lex_keyword(parser, current_start, "if", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_IF, PM_TOKEN_KEYWORD_IF_MODIFIER)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "in", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_IN, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "or", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_OR, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + break; + case 3: + if ((type = lex_keyword(parser, current_start, "and", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_AND, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "def", width, PM_LEX_STATE_FNAME, PM_TOKEN_KEYWORD_DEF, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "end", width, PM_LEX_STATE_END, PM_TOKEN_KEYWORD_END, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "END", width, PM_LEX_STATE_END, PM_TOKEN_KEYWORD_END_UPCASE, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "for", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_FOR, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "nil", width, PM_LEX_STATE_END, PM_TOKEN_KEYWORD_NIL, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "not", width, PM_LEX_STATE_ARG, PM_TOKEN_KEYWORD_NOT, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + break; + 
case 4: + if ((type = lex_keyword(parser, current_start, "case", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_CASE, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "else", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "next", width, PM_LEX_STATE_MID, PM_TOKEN_KEYWORD_NEXT, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "redo", width, PM_LEX_STATE_END, PM_TOKEN_KEYWORD_REDO, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "self", width, PM_LEX_STATE_END, PM_TOKEN_KEYWORD_SELF, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "then", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "true", width, PM_LEX_STATE_END, PM_TOKEN_KEYWORD_TRUE, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "when", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_WHEN, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + break; + case 5: + if ((type = lex_keyword(parser, current_start, "alias", width, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM, PM_TOKEN_KEYWORD_ALIAS, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "begin", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_BEGIN, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "BEGIN", width, PM_LEX_STATE_END, PM_TOKEN_KEYWORD_BEGIN_UPCASE, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "break", width, PM_LEX_STATE_MID, PM_TOKEN_KEYWORD_BREAK, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "class", width, PM_LEX_STATE_CLASS, PM_TOKEN_KEYWORD_CLASS, PM_TOKEN_EOF)) != 
PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "elsif", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_ELSIF, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "false", width, PM_LEX_STATE_END, PM_TOKEN_KEYWORD_FALSE, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "retry", width, PM_LEX_STATE_END, PM_TOKEN_KEYWORD_RETRY, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "super", width, PM_LEX_STATE_ARG, PM_TOKEN_KEYWORD_SUPER, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "undef", width, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM, PM_TOKEN_KEYWORD_UNDEF, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "until", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_UNTIL, PM_TOKEN_KEYWORD_UNTIL_MODIFIER)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "while", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_WHILE, PM_TOKEN_KEYWORD_WHILE_MODIFIER)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "yield", width, PM_LEX_STATE_ARG, PM_TOKEN_KEYWORD_YIELD, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + break; + case 6: + if ((type = lex_keyword(parser, current_start, "ensure", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "module", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_MODULE, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "rescue", width, PM_LEX_STATE_MID, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "return", width, PM_LEX_STATE_MID, PM_TOKEN_KEYWORD_RETURN, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, 
current_start, "unless", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_UNLESS, PM_TOKEN_KEYWORD_UNLESS_MODIFIER)) != PM_TOKEN_EOF) return type; + break; + case 8: + if ((type = lex_keyword(parser, current_start, "__LINE__", width, PM_LEX_STATE_END, PM_TOKEN_KEYWORD___LINE__, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + if ((type = lex_keyword(parser, current_start, "__FILE__", width, PM_LEX_STATE_END, PM_TOKEN_KEYWORD___FILE__, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + break; + case 12: + if ((type = lex_keyword(parser, current_start, "__ENCODING__", width, PM_LEX_STATE_END, PM_TOKEN_KEYWORD___ENCODING__, PM_TOKEN_EOF)) != PM_TOKEN_EOF) return type; + break; + } + } + + if (encoding_changed) { + return parser->encoding->isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER; + } + return pm_encoding_utf_8_isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER; +} + +/** + * Returns true if the current token that the parser is considering is at the + * beginning of a line or the beginning of the source. + */ +static bool +current_token_starts_line(pm_parser_t *parser) { + return (parser->current.start == parser->start) || (parser->current.start[-1] == '\n'); +} + +/** + * When we hit a # while lexing something like a string, we need to potentially + * handle interpolation. This function performs that check. It returns a token + * type representing what it found. Those cases are: + * + * * PM_TOKEN_NOT_PROVIDED - No interpolation was found at this point. The + * caller should keep lexing. + * * PM_TOKEN_STRING_CONTENT - No interpolation was found at this point. The + * caller should return this token type. + * * PM_TOKEN_EMBEXPR_BEGIN - An embedded expression was found. The caller + * should return this token type. + * * PM_TOKEN_EMBVAR - An embedded variable was found. The caller should return + * this token type. 
+ */ +static pm_token_type_t +lex_interpolation(pm_parser_t *parser, const uint8_t *pound) { + // If there is no content following this #, then we're at the end of + // the string and we can safely return string content. + if (pound + 1 >= parser->end) { + parser->current.end = pound + 1; + return PM_TOKEN_STRING_CONTENT; + } + + // Now we'll check against the character that follows the #. If it constitutes + // valid interplation, we'll handle that, otherwise we'll return + // PM_TOKEN_NOT_PROVIDED. + switch (pound[1]) { + case '@': { + // In this case we may have hit an embedded instance or class variable. + if (pound + 2 >= parser->end) { + parser->current.end = pound + 1; + return PM_TOKEN_STRING_CONTENT; + } + + // If we're looking at a @ and there's another @, then we'll skip past the + // second @. + const uint8_t *variable = pound + 2; + if (*variable == '@' && pound + 3 < parser->end) variable++; + + if (char_is_identifier_start(parser, variable, parser->end - variable)) { + // At this point we're sure that we've either hit an embedded instance + // or class variable. In this case we'll first need to check if we've + // already consumed content. + if (pound > parser->current.start) { + parser->current.end = pound; + return PM_TOKEN_STRING_CONTENT; + } + + // Otherwise we need to return the embedded variable token + // and then switch to the embedded variable lex mode. + lex_mode_push(parser, (pm_lex_mode_t) { .mode = PM_LEX_EMBVAR }); + parser->current.end = pound + 1; + return PM_TOKEN_EMBVAR; + } + + // If we didn't get a valid interpolation, then this is just regular + // string content. This is like if we get "#@-". In this case the caller + // should keep lexing. + parser->current.end = pound + 1; + return PM_TOKEN_NOT_PROVIDED; + } + case '$': + // In this case we may have hit an embedded global variable. If there's + // not enough room, then we'll just return string content. 
+ if (pound + 2 >= parser->end) { + parser->current.end = pound + 1; + return PM_TOKEN_STRING_CONTENT; + } + + // This is the character that we're going to check to see if it is the + // start of an identifier that would indicate that this is a global + // variable. + const uint8_t *check = pound + 2; + + if (pound[2] == '-') { + if (pound + 3 >= parser->end) { + parser->current.end = pound + 2; + return PM_TOKEN_STRING_CONTENT; + } + + check++; + } + + // If the character that we're going to check is the start of an + // identifier, or we don't have a - and the character is a decimal number + // or a global name punctuation character, then we've hit an embedded + // global variable. + if ( + char_is_identifier_start(parser, check, parser->end - check) || + (pound[2] != '-' && (pm_char_is_decimal_digit(pound[2]) || char_is_global_name_punctuation(pound[2]))) + ) { + // In this case we've hit an embedded global variable. First check to + // see if we've already consumed content. If we have, then we need to + // return that content as string content first. + if (pound > parser->current.start) { + parser->current.end = pound; + return PM_TOKEN_STRING_CONTENT; + } + + // Otherwise, we need to return the embedded variable token and switch + // to the embedded variable lex mode. + lex_mode_push(parser, (pm_lex_mode_t) { .mode = PM_LEX_EMBVAR }); + parser->current.end = pound + 1; + return PM_TOKEN_EMBVAR; + } + + // In this case we've hit a #$ that does not indicate a global variable. + // In this case we'll continue lexing past it. + parser->current.end = pound + 1; + return PM_TOKEN_NOT_PROVIDED; + case '{': + // In this case it's the start of an embedded expression. If we have + // already consumed content, then we need to return that content as string + // content first. 
+ if (pound > parser->current.start) { + parser->current.end = pound; + return PM_TOKEN_STRING_CONTENT; + } + + parser->enclosure_nesting++; + + // Otherwise we'll skip past the #{ and begin lexing the embedded + // expression. + lex_mode_push(parser, (pm_lex_mode_t) { .mode = PM_LEX_EMBEXPR }); + parser->current.end = pound + 2; + parser->command_start = true; + pm_do_loop_stack_push(parser, false); + return PM_TOKEN_EMBEXPR_BEGIN; + default: + // In this case we've hit a # that doesn't constitute interpolation. We'll + // mark that by returning the not provided token type. This tells the + // consumer to keep lexing forward. + parser->current.end = pound + 1; + return PM_TOKEN_NOT_PROVIDED; + } +} + +static const uint8_t PM_ESCAPE_FLAG_NONE = 0x0; +static const uint8_t PM_ESCAPE_FLAG_CONTROL = 0x1; +static const uint8_t PM_ESCAPE_FLAG_META = 0x2; +static const uint8_t PM_ESCAPE_FLAG_SINGLE = 0x4; +static const uint8_t PM_ESCAPE_FLAG_REGEXP = 0x8; + +/** + * This is a lookup table for whether or not an ASCII character is printable. + */ +static const bool ascii_printable_chars[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 +}; + +static inline bool +char_is_ascii_printable(const uint8_t b) { + return (b < 0x80) && ascii_printable_chars[b]; +} + +/** + * Return the value that a hexadecimal digit character represents. For example, + * transform 'a' into 10, 'b' into 11, etc. + */ +static inline uint8_t +escape_hexadecimal_digit(const uint8_t value) { + return (uint8_t) ((value <= '9') ? (value - '0') : (value & 0x7) + 9); +} + +/** + * Scan the 4 digits of a Unicode escape into the value. 
Returns the number of + * digits scanned. This function assumes that the characters have already been + * validated. + */ +static inline uint32_t +escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length, const pm_location_t *error_location) { + uint32_t value = 0; + for (size_t index = 0; index < length; index++) { + if (index != 0) value <<= 4; + value |= escape_hexadecimal_digit(string[index]); + } + + // Here we're going to verify that the value is actually a valid Unicode + // codepoint and not a surrogate pair. + if (value >= 0xD800 && value <= 0xDFFF) { + if (error_location != NULL) { + pm_parser_err(parser, error_location->start, error_location->end, PM_ERR_ESCAPE_INVALID_UNICODE); + } else { + pm_parser_err(parser, string, string + length, PM_ERR_ESCAPE_INVALID_UNICODE); + } + return 0xFFFD; + } + + return value; +} + +/** + * Escape a single character value based on the given flags. + */ +static inline uint8_t +escape_byte(uint8_t value, const uint8_t flags) { + if (flags & PM_ESCAPE_FLAG_CONTROL) value &= 0x9f; + if (flags & PM_ESCAPE_FLAG_META) value |= 0x80; + return value; +} + +/** + * Write a unicode codepoint to the given buffer. + */ +static inline void +escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t flags, const uint8_t *start, const uint8_t *end, uint32_t value) { + // \u escape sequences in string-like structures implicitly change the + // encoding to UTF-8 if they are >= 0x80 or if they are used in a character + // literal. 
+ if (value >= 0x80 || flags & PM_ESCAPE_FLAG_SINGLE) { + if (parser->explicit_encoding != NULL && parser->explicit_encoding != PM_ENCODING_UTF_8_ENTRY) { + PM_PARSER_ERR_FORMAT(parser, start, end, PM_ERR_MIXED_ENCODING, parser->explicit_encoding->name); + } + + parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY; + } + + if (!pm_buffer_append_unicode_codepoint(buffer, value)) { + pm_parser_err(parser, start, end, PM_ERR_ESCAPE_INVALID_UNICODE); + pm_buffer_append_byte(buffer, 0xEF); + pm_buffer_append_byte(buffer, 0xBF); + pm_buffer_append_byte(buffer, 0xBD); + } +} + +/** + * When you're writing a byte to the unescape buffer, if the byte is non-ASCII + * (i.e., the top bit is set) then it locks in the encoding. + */ +static inline void +escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte) { + if (byte >= 0x80) { + if (parser->explicit_encoding != NULL && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && parser->encoding != PM_ENCODING_UTF_8_ENTRY) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_MIXED_ENCODING, parser->encoding->name); + } + + parser->explicit_encoding = parser->encoding; + } + + pm_buffer_append_byte(buffer, byte); +} + +/** + * The regular expression engine doesn't support the same escape sequences as + * Ruby does. So first we have to read the escape sequence, and then we have to + * format it like the regular expression engine expects it. For example, in Ruby + * if we have: + * + * /\M-\C-?/ + * + * then the first byte is actually 255, so we have to rewrite this as: + * + * /\xFF/ + * + * Note that in this case there is a literal \ byte in the regular expression + * source so that the regular expression engine will perform its own unescaping. 
+ */ +static inline void +escape_write_byte(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expression_buffer, uint8_t flags, uint8_t byte) { + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_format(regular_expression_buffer, "\\x%02X", byte); + } + + escape_write_byte_encoded(parser, buffer, byte); +} + +/** + * Write each byte of the given escaped character into the buffer. The width of + * the character at parser->current.end is measured with the parser's encoding + * when encoding_changed is set, and as UTF-8 otherwise. + */ +static inline void +escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expression_buffer, uint8_t flags) { + size_t width; + if (parser->encoding_changed) { + width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); + } else { + width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end); + } + + if (width == 1) { + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(*parser->current.end++, flags)); + } else if (width > 1) { + // Valid multibyte character. Just ignore escape. + pm_buffer_t *b = (flags & PM_ESCAPE_FLAG_REGEXP) ? regular_expression_buffer : buffer; + pm_buffer_append_bytes(b, parser->current.end, width); + parser->current.end += width; + } else { + // Assume the next character wasn't meant to be part of this escape + // sequence since it is invalid. Add an error and move on. + parser->current.end++; + pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL); + } +} + +/** + * Warn about using a space or a tab character in an escape, as opposed to using + * \\s or \\t. Note that we can't quite copy the source because the warning + * message replaces \\c with \\C. + * + * Both flags and flag are rendered through the FLAG macro as a \\C- or \\M- + * prefix (or an empty string), and type is passed through as the final format + * argument. + */ +static void +escape_read_warn(pm_parser_t *parser, uint8_t flags, uint8_t flag, const char *type) { +#define FLAG(value) ((value & PM_ESCAPE_FLAG_CONTROL) ? "\\C-" : (value & PM_ESCAPE_FLAG_META) ? 
"\\M-" : "") + + PM_PARSER_WARN_TOKEN_FORMAT( + parser, + parser->current, + PM_WARN_INVALID_CHARACTER, + FLAG(flags), + FLAG(flag), + type + ); + +#undef FLAG +} + +/** + * Read the value of an escape into the buffer. + * + * Dispatches on the byte returned by peek(parser) and advances + * parser->current.end past whatever each case consumes. The control and meta + * cases recurse with PM_ESCAPE_FLAG_CONTROL or PM_ESCAPE_FLAG_META added to + * flags. Within this function and the helpers it calls above, + * regular_expression_buffer is only written to when PM_ESCAPE_FLAG_REGEXP is + * set in flags. + */ +static void +escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expression_buffer, uint8_t flags) { + uint8_t peeked = peek(parser); + switch (peeked) { + case '\\': { + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte('\\', flags)); + return; + } + case '\'': { + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte('\'', flags)); + return; + } + case 'a': { + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte('\a', flags)); + return; + } + case 'b': { + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte('\b', flags)); + return; + } + case 'e': { + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte('\033', flags)); + return; + } + case 'f': { + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte('\f', flags)); + return; + } + case 'n': { + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte('\n', flags)); + return; + } + case 'r': { + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte('\r', flags)); + return; + } + case 's': { + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(' ', flags)); + return; + } + case 't': { + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte('\t', flags)); + return; + } + case 'v': { + parser->current.end++; + escape_write_byte(parser, buffer, 
regular_expression_buffer, flags, escape_byte('\v', flags)); + return; + } + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { + uint8_t value = (uint8_t) (*parser->current.end - '0'); + parser->current.end++; + + if (pm_char_is_octal_digit(peek(parser))) { + value = ((uint8_t) (value << 3)) | ((uint8_t) (*parser->current.end - '0')); + parser->current.end++; + + if (pm_char_is_octal_digit(peek(parser))) { + value = ((uint8_t) (value << 3)) | ((uint8_t) (*parser->current.end - '0')); + parser->current.end++; + } + } + + value = escape_byte(value, flags); + escape_write_byte(parser, buffer, regular_expression_buffer, flags, value); + return; + } + case 'x': { + const uint8_t *start = parser->current.end - 1; + + parser->current.end++; + uint8_t byte = peek(parser); + + if (pm_char_is_hexadecimal_digit(byte)) { + uint8_t value = escape_hexadecimal_digit(byte); + parser->current.end++; + + byte = peek(parser); + if (pm_char_is_hexadecimal_digit(byte)) { + value = (uint8_t) ((value << 4) | escape_hexadecimal_digit(byte)); + parser->current.end++; + } + + value = escape_byte(value, flags); + if (flags & PM_ESCAPE_FLAG_REGEXP) { + if (flags & (PM_ESCAPE_FLAG_CONTROL | PM_ESCAPE_FLAG_META)) { + pm_buffer_append_format(regular_expression_buffer, "\\x%02X", value); + } else { + pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); + } + } + + escape_write_byte_encoded(parser, buffer, value); + } else { + pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL); + } + + return; + } + case 'u': { + const uint8_t *start = parser->current.end - 1; + parser->current.end++; + + if (parser->current.end == parser->end) { + const uint8_t *start = parser->current.end - 2; + PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start); + } else if (peek(parser) == '{') { + const uint8_t *unicode_codepoints_start = parser->current.end - 2; + 
parser->current.end++; + + size_t whitespace; + while (true) { + if ((whitespace = pm_strspn_inline_whitespace(parser->current.end, parser->end - parser->current.end)) > 0) { + parser->current.end += whitespace; + } else if (peek(parser) == '\\' && peek_offset(parser, 1) == 'n') { + // This is super hacky, but it gets us nicer error + // messages because we can still pass it off to the + // regular expression engine even if we hit an + // unterminated regular expression. + parser->current.end += 2; + } else { + break; + } + } + + const uint8_t *extra_codepoints_start = NULL; + int codepoints_count = 0; + + while ((parser->current.end < parser->end) && (*parser->current.end != '}')) { + const uint8_t *unicode_start = parser->current.end; + size_t hexadecimal_length = pm_strspn_hexadecimal_digit(parser->current.end, parser->end - parser->current.end); + + if (hexadecimal_length > 6) { + // \u{nnnn} character literal allows only 1-6 hexadecimal digits + pm_parser_err(parser, unicode_start, unicode_start + hexadecimal_length, PM_ERR_ESCAPE_INVALID_UNICODE_LONG); + } else if (hexadecimal_length == 0) { + // there are not hexadecimal characters + + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // If this is a regular expression, we are going to + // let the regular expression engine handle this + // error instead of us. 
+ pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); + } else { + pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE); + pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM); + } + + return; + } + + parser->current.end += hexadecimal_length; + codepoints_count++; + if (flags & PM_ESCAPE_FLAG_SINGLE && codepoints_count == 2) { + extra_codepoints_start = unicode_start; + } + + uint32_t value = escape_unicode(parser, unicode_start, hexadecimal_length, NULL); + escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value); + + parser->current.end += pm_strspn_inline_whitespace(parser->current.end, parser->end - parser->current.end); + } + + // ?\u{nnnn} character literal should contain only one codepoint + // and cannot be like ?\u{nnnn mmmm}. + if (flags & PM_ESCAPE_FLAG_SINGLE && codepoints_count > 1) { + pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL); + } + + if (parser->current.end == parser->end) { + PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->current.end - start), start); + } else if (peek(parser) == '}') { + parser->current.end++; + } else { + // NOTE(review): the loop above only exits at the end of the + // input, on a '}', or by returning, so this branch looks + // unreachable -- confirm. + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // If this is a regular expression, we are going to let + // the regular expression engine handle this error + // instead of us. 
+ pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); + } else { + pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM); + } + } + + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_bytes(regular_expression_buffer, unicode_codepoints_start, (size_t) (parser->current.end - unicode_codepoints_start)); + } + } else { + size_t length = pm_strspn_hexadecimal_digit(parser->current.end, MIN(parser->end - parser->current.end, 4)); + + if (length == 0) { + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); + } else { + const uint8_t *start = parser->current.end - 2; + PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start); + } + } else if (length == 4) { + uint32_t value = escape_unicode(parser, parser->current.end, 4, NULL); + + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end + 4 - start)); + } + + escape_write_unicode(parser, buffer, flags, start, parser->current.end + 4, value); + parser->current.end += 4; + } else { + parser->current.end += length; + + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // If this is a regular expression, we are going to let + // the regular expression engine handle this error + // instead of us. 
+ pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); + } else { + pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE); + } + } + } + + return; + } + case 'c': { + parser->current.end++; + if (flags & PM_ESCAPE_FLAG_CONTROL) { + pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT); + } + + if (parser->current.end == parser->end) { + pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL); + return; + } + + uint8_t peeked = peek(parser); + switch (peeked) { + case '?': { + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(0x7f, flags)); + return; + } + case '\\': + parser->current.end++; + + if (match(parser, 'u') || match(parser, 'U')) { + pm_parser_err(parser, parser->current.start, parser->current.end, PM_ERR_INVALID_ESCAPE_CHARACTER); + return; + } + + escape_read(parser, buffer, regular_expression_buffer, flags | PM_ESCAPE_FLAG_CONTROL); + return; + case ' ': + parser->current.end++; + escape_read_warn(parser, flags, PM_ESCAPE_FLAG_CONTROL, "\\s"); + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + return; + case '\t': + parser->current.end++; + escape_read_warn(parser, flags, 0, "\\t"); + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + return; + default: { + if (!char_is_ascii_printable(peeked)) { + pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL); + return; + } + + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + return; + } + } + } + case 'C': { + parser->current.end++; + if (flags & PM_ESCAPE_FLAG_CONTROL) { + pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT); + } + + if (peek(parser) != '-') { + size_t width = 
parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); + pm_parser_err(parser, parser->current.start, parser->current.end + width, PM_ERR_ESCAPE_INVALID_CONTROL); + return; + } + + parser->current.end++; + if (parser->current.end == parser->end) { + pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL); + return; + } + + uint8_t peeked = peek(parser); + switch (peeked) { + case '?': { + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(0x7f, flags)); + return; + } + case '\\': + parser->current.end++; + + if (match(parser, 'u') || match(parser, 'U')) { + pm_parser_err(parser, parser->current.start, parser->current.end, PM_ERR_INVALID_ESCAPE_CHARACTER); + return; + } + + escape_read(parser, buffer, regular_expression_buffer, flags | PM_ESCAPE_FLAG_CONTROL); + return; + case ' ': + parser->current.end++; + escape_read_warn(parser, flags, PM_ESCAPE_FLAG_CONTROL, "\\s"); + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + return; + case '\t': + parser->current.end++; + escape_read_warn(parser, flags, 0, "\\t"); + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + return; + default: { + if (!char_is_ascii_printable(peeked)) { + size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); + pm_parser_err(parser, parser->current.start, parser->current.end + width, PM_ERR_ESCAPE_INVALID_CONTROL); + return; + } + + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + return; + } + } + } + case 'M': { + parser->current.end++; + if (flags & PM_ESCAPE_FLAG_META) { + pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_META_REPEAT); + } + + if (peek(parser) != '-') { + size_t width = 
parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); + pm_parser_err(parser, parser->current.start, parser->current.end + width, PM_ERR_ESCAPE_INVALID_META); + return; + } + + parser->current.end++; + if (parser->current.end == parser->end) { + pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_META); + return; + } + + uint8_t peeked = peek(parser); + switch (peeked) { + case '\\': + parser->current.end++; + + if (match(parser, 'u') || match(parser, 'U')) { + pm_parser_err(parser, parser->current.start, parser->current.end, PM_ERR_INVALID_ESCAPE_CHARACTER); + return; + } + + escape_read(parser, buffer, regular_expression_buffer, flags | PM_ESCAPE_FLAG_META); + return; + case ' ': + parser->current.end++; + escape_read_warn(parser, flags, PM_ESCAPE_FLAG_META, "\\s"); + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META)); + return; + case '\t': + parser->current.end++; + escape_read_warn(parser, flags & ((uint8_t) ~PM_ESCAPE_FLAG_CONTROL), PM_ESCAPE_FLAG_META, "\\t"); + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META)); + return; + default: + if (!char_is_ascii_printable(peeked)) { + size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); + pm_parser_err(parser, parser->current.start, parser->current.end + width, PM_ERR_ESCAPE_INVALID_META); + return; + } + + parser->current.end++; + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META)); + return; + } + } + case '\r': { + if (peek_offset(parser, 1) == '\n') { + parser->current.end += 2; + escape_write_byte_encoded(parser, buffer, escape_byte('\n', flags)); + return; + } + PRISM_FALLTHROUGH + } + default: { + if ((flags & (PM_ESCAPE_FLAG_CONTROL | PM_ESCAPE_FLAG_META)) && !char_is_ascii_printable(peeked)) { + size_t width = 
parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); + pm_parser_err(parser, parser->current.start, parser->current.end + width, PM_ERR_ESCAPE_INVALID_META); + return; + } + if (parser->current.end < parser->end) { + escape_write_escape_encoded(parser, buffer, regular_expression_buffer, flags); + } else { + pm_parser_err_current(parser, PM_ERR_INVALID_ESCAPE_CHARACTER); + } + return; + } + } +} + +/** + * This function is responsible for lexing either a character literal or the ? + * operator. The supported character literals are described below. + * + * \\a bell, ASCII 07h (BEL) + * \\b backspace, ASCII 08h (BS) + * \t horizontal tab, ASCII 09h (TAB) + * \\n newline (line feed), ASCII 0Ah (LF) + * \v vertical tab, ASCII 0Bh (VT) + * \f form feed, ASCII 0Ch (FF) + * \r carriage return, ASCII 0Dh (CR) + * \\e escape, ASCII 1Bh (ESC) + * \s space, ASCII 20h (SPC) + * \\ backslash + * \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7]) + * \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F]) + * \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F]) + * \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F]) + * \cx or \C-x control character, where x is an ASCII printable character + * \M-x meta character, where x is an ASCII printable character + * \M-\C-x meta control character, where x is an ASCII printable character + * \M-\cx same as above + * \\c\M-x same as above + * \\c? or \C-? 
delete, ASCII 7Fh (DEL) + */ +static pm_token_type_t +lex_question_mark(pm_parser_t *parser) { + if (lex_state_end_p(parser)) { + lex_state_set(parser, PM_LEX_STATE_BEG); + return PM_TOKEN_QUESTION_MARK; + } + + if (parser->current.end >= parser->end) { + pm_parser_err_current(parser, PM_ERR_INCOMPLETE_QUESTION_MARK); + pm_string_shared_init(&parser->current_string, parser->current.start + 1, parser->current.end); + return PM_TOKEN_CHARACTER_LITERAL; + } + + if (pm_char_is_whitespace(*parser->current.end)) { + lex_state_set(parser, PM_LEX_STATE_BEG); + return PM_TOKEN_QUESTION_MARK; + } + + lex_state_set(parser, PM_LEX_STATE_BEG); + + if (match(parser, '\\')) { + lex_state_set(parser, PM_LEX_STATE_END); + + pm_buffer_t buffer; + pm_buffer_init_capacity(&buffer, 3); + + escape_read(parser, &buffer, NULL, PM_ESCAPE_FLAG_SINGLE); + pm_string_owned_init(&parser->current_string, (uint8_t *) buffer.value, buffer.length); + + return PM_TOKEN_CHARACTER_LITERAL; + } else { + size_t encoding_width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); + + // Ternary operators can have a ? immediately followed by an identifier + // which starts with an underscore. We check for this case here. + if ( + !(parser->encoding->alnum_char(parser->current.end, parser->end - parser->current.end) || peek(parser) == '_') || + ( + (parser->current.end + encoding_width >= parser->end) || + !char_is_identifier(parser, parser->current.end + encoding_width, parser->end - (parser->current.end + encoding_width)) + ) + ) { + lex_state_set(parser, PM_LEX_STATE_END); + parser->current.end += encoding_width; + pm_string_shared_init(&parser->current_string, parser->current.start + 1, parser->current.end); + return PM_TOKEN_CHARACTER_LITERAL; + } + } + + return PM_TOKEN_QUESTION_MARK; +} + +/** + * Lex a variable that starts with an @ sign (either an instance or class + * variable). 
+ */ +static pm_token_type_t +lex_at_variable(pm_parser_t *parser) { + pm_token_type_t type = match(parser, '@') ? PM_TOKEN_CLASS_VARIABLE : PM_TOKEN_INSTANCE_VARIABLE; + const uint8_t *end = parser->end; + + size_t width; + if ((width = char_is_identifier_start(parser, parser->current.end, end - parser->current.end)) > 0) { + parser->current.end += width; + + while ((width = char_is_identifier(parser, parser->current.end, end - parser->current.end)) > 0) { + parser->current.end += width; + } + } else if (parser->current.end < end && pm_char_is_decimal_digit(*parser->current.end)) { + pm_diagnostic_id_t diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_INCOMPLETE_VARIABLE_CLASS : PM_ERR_INCOMPLETE_VARIABLE_INSTANCE; + if (parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3) { + diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_INCOMPLETE_VARIABLE_CLASS_3_3 : PM_ERR_INCOMPLETE_VARIABLE_INSTANCE_3_3; + } + + size_t width = parser->encoding->char_width(parser->current.end, end - parser->current.end); + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, (int) ((parser->current.end + width) - parser->current.start), (const char *) parser->current.start); + } else { + pm_diagnostic_id_t diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_CLASS_VARIABLE_BARE : PM_ERR_INSTANCE_VARIABLE_BARE; + pm_parser_err_token(parser, &parser->current, diag_id); + } + + // If we're lexing an embedded variable, then we need to pop back into the + // parent lex context. + if (parser->lex_modes.current->mode == PM_LEX_EMBVAR) { + lex_mode_pop(parser); + } + + return type; +} + +/** + * Optionally call out to the lex callback if one is provided. + */ +static inline void +parser_lex_callback(pm_parser_t *parser) { + if (parser->lex_callback) { + parser->lex_callback->callback(parser->lex_callback->data, parser, &parser->current); + } +} + +/** + * Return a new comment node of the specified type. 
+ */ +static inline pm_comment_t * +parser_comment(pm_parser_t *parser, pm_comment_type_t type) { + pm_comment_t *comment = (pm_comment_t *) xcalloc(1, sizeof(pm_comment_t)); + if (comment == NULL) return NULL; + + *comment = (pm_comment_t) { + .type = type, + .location = { parser->current.start, parser->current.end } + }; + + return comment; +} + +/** + * Lex out embedded documentation, and return when we have either hit the end of + * the file or the end of the embedded documentation. This calls the callback + * manually because only the lexer should see these tokens, not the parser. + */ +static pm_token_type_t +lex_embdoc(pm_parser_t *parser) { + // First, lex out the EMBDOC_BEGIN token. + const uint8_t *newline = next_newline(parser->current.end, parser->end - parser->current.end); + + if (newline == NULL) { + parser->current.end = parser->end; + } else { + pm_newline_list_append(&parser->newline_list, newline); + parser->current.end = newline + 1; + } + + parser->current.type = PM_TOKEN_EMBDOC_BEGIN; + parser_lex_callback(parser); + + // Now, create a comment that is going to be attached to the parser. + pm_comment_t *comment = parser_comment(parser, PM_COMMENT_EMBDOC); + if (comment == NULL) return PM_TOKEN_EOF; + + // Now, loop until we find the end of the embedded documentation or the end + // of the file. + while (parser->current.end + 4 <= parser->end) { + parser->current.start = parser->current.end; + + // If we've hit the end of the embedded documentation then we'll return + // that token here. 
+ if ( + (memcmp(parser->current.end, "=end", 4) == 0) && + ( + (parser->current.end + 4 == parser->end) || // end of file + pm_char_is_whitespace(parser->current.end[4]) || // whitespace + (parser->current.end[4] == '\0') || // NUL or end of script + (parser->current.end[4] == '\004') || // ^D + (parser->current.end[4] == '\032') // ^Z + ) + ) { + const uint8_t *newline = next_newline(parser->current.end, parser->end - parser->current.end); + + if (newline == NULL) { + parser->current.end = parser->end; + } else { + pm_newline_list_append(&parser->newline_list, newline); + parser->current.end = newline + 1; + } + + parser->current.type = PM_TOKEN_EMBDOC_END; + parser_lex_callback(parser); + + comment->location.end = parser->current.end; + pm_list_append(&parser->comment_list, (pm_list_node_t *) comment); + + return PM_TOKEN_EMBDOC_END; + } + + // Otherwise, we'll parse until the end of the line and return a line of + // embedded documentation. + const uint8_t *newline = next_newline(parser->current.end, parser->end - parser->current.end); + + if (newline == NULL) { + parser->current.end = parser->end; + } else { + pm_newline_list_append(&parser->newline_list, newline); + parser->current.end = newline + 1; + } + + parser->current.type = PM_TOKEN_EMBDOC_LINE; + parser_lex_callback(parser); + } + + pm_parser_err_current(parser, PM_ERR_EMBDOC_TERM); + + comment->location.end = parser->current.end; + pm_list_append(&parser->comment_list, (pm_list_node_t *) comment); + + return PM_TOKEN_EOF; +} + +/** + * Set the current type to an ignored newline and then call the lex callback. + * This happens in a couple places depending on whether or not we have already + * lexed a comment. + */ +static inline void +parser_lex_ignored_newline(pm_parser_t *parser) { + parser->current.type = PM_TOKEN_IGNORED_NEWLINE; + parser_lex_callback(parser); +} + +/** + * This function will be called when a newline is encountered. 
In some newlines, + * we need to check if there is a heredoc or heredocs that we have already lexed + * the body of that we need to now skip past. That will be indicated by the + * heredoc_end field on the parser. + * + * If it is set, then we need to skip past the heredoc body and then clear the + * heredoc_end field. + */ +static inline void +parser_flush_heredoc_end(pm_parser_t *parser) { + assert(parser->heredoc_end <= parser->end); + parser->next_start = parser->heredoc_end; + parser->heredoc_end = NULL; +} + +/** + * Returns true if the parser has lexed the last token on the current line. +*/ +static bool +parser_end_of_line_p(const pm_parser_t *parser) { + const uint8_t *cursor = parser->current.end; + + while (cursor < parser->end && *cursor != '\n' && *cursor != '#') { + if (!pm_char_is_inline_whitespace(*cursor++)) return false; + } + + return true; +} + +/** + * When we're lexing certain types (strings, symbols, lists, etc.) we have + * string content associated with the tokens. For example: + * + * "foo" + * + * In this case, the string content is foo. Since there is no escaping, there's + * no need to track additional information and the token can be returned as + * normal. However, if we have escape sequences: + * + * "foo\n" + * + * then the bytes in the string are "f", "o", "o", "\", "n", but we want to + * provide our consumers with the string content "f", "o", "o", "\n". In these + * cases, when we find the first escape sequence, we initialize a pm_buffer_t + * to keep track of the string content. Then in the parser, it will + * automatically attach the string content to the node that it belongs to. + */ +typedef struct { + /** + * The buffer that we're using to keep track of the string content. It will + * only be initialized if we receive an escape sequence. + */ + pm_buffer_t buffer; + + /** + * The cursor into the source string that points to how far we have + * currently copied into the buffer. 
+ */ + const uint8_t *cursor; +} pm_token_buffer_t; + +/** + * In order to properly set a regular expression's encoding and to validate + * the byte sequence for the underlying encoding we must process any escape + * sequences. The unescaped byte sequence will be stored in `buffer` just like + * for other string-like types. However, we also need to store the regular + * expression's source string. That string may be different from what we see + * during lexing because some escape sequences rewrite the source. + * + * This value will only be initialized for regular expressions and only if we + * receive an escape sequence. It will contain the regular expression's source + * string's byte sequence. + */ +typedef struct { + /** The embedded base buffer. */ + pm_token_buffer_t base; + + /** The buffer holding the regexp source. */ + pm_buffer_t regexp_buffer; +} pm_regexp_token_buffer_t; + +/** + * Push the given byte into the token buffer. + */ +static inline void +pm_token_buffer_push_byte(pm_token_buffer_t *token_buffer, uint8_t byte) { + pm_buffer_append_byte(&token_buffer->buffer, byte); +} + +static inline void +pm_regexp_token_buffer_push_byte(pm_regexp_token_buffer_t *token_buffer, uint8_t byte) { + pm_buffer_append_byte(&token_buffer->regexp_buffer, byte); +} + +/** + * Return the width of the character at the end of the current token. + */ +static inline size_t +parser_char_width(const pm_parser_t *parser) { + size_t width; + if (parser->encoding_changed) { + width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); + } else { + width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end); + } + + // TODO: If the character is invalid in the given encoding, then we'll just + // push one byte into the buffer. This should actually be an error. + return (width == 0 ? 1 : width); +} + +/** + * Push an escaped character into the token buffer. 
+ */ +static void +pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parser) { + size_t width = parser_char_width(parser); + pm_buffer_append_bytes(&token_buffer->buffer, parser->current.end, width); + parser->current.end += width; +} + +static void +pm_regexp_token_buffer_push_escaped(pm_regexp_token_buffer_t *token_buffer, pm_parser_t *parser) { + size_t width = parser_char_width(parser); + pm_buffer_append_bytes(&token_buffer->base.buffer, parser->current.end, width); + pm_buffer_append_bytes(&token_buffer->regexp_buffer, parser->current.end, width); + parser->current.end += width; +} + +static bool +pm_slice_ascii_only_p(const uint8_t *value, size_t length) { + for (size_t index = 0; index < length; index++) { + if (value[index] & 0x80) return false; + } + + return true; +} + +/** + * When we're about to return from lexing the current token and we know for sure + * that we have found an escape sequence, this function is called to copy the + * contents of the token buffer into the current string on the parser so that it + * can be attached to the correct node. 
+ */ +static inline void +pm_token_buffer_copy(pm_parser_t *parser, pm_token_buffer_t *token_buffer) { + pm_string_owned_init(&parser->current_string, (uint8_t *) pm_buffer_value(&token_buffer->buffer), pm_buffer_length(&token_buffer->buffer)); +} + +static inline void +pm_regexp_token_buffer_copy(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) { + pm_string_owned_init(&parser->current_string, (uint8_t *) pm_buffer_value(&token_buffer->base.buffer), pm_buffer_length(&token_buffer->base.buffer)); + parser->current_regular_expression_ascii_only = pm_slice_ascii_only_p((const uint8_t *) pm_buffer_value(&token_buffer->regexp_buffer), pm_buffer_length(&token_buffer->regexp_buffer)); + pm_buffer_free(&token_buffer->regexp_buffer); +} + +/** + * When we're about to return from lexing the current token, we need to flush + * all of the content that we have pushed into the buffer into the current + * string. If we haven't pushed anything into the buffer, this means that we + * never found an escape sequence, so we can directly reference the bounds of + * the current string. Either way, at the return of this function it is expected + * that parser->current_string is established in such a way that it can be + * attached to a node. 
+ */ +static void +pm_token_buffer_flush(pm_parser_t *parser, pm_token_buffer_t *token_buffer) { + if (token_buffer->cursor == NULL) { + pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end); + } else { + pm_buffer_append_bytes(&token_buffer->buffer, token_buffer->cursor, (size_t) (parser->current.end - token_buffer->cursor)); + pm_token_buffer_copy(parser, token_buffer); + } +} + +static void +pm_regexp_token_buffer_flush(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) { + if (token_buffer->base.cursor == NULL) { + pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end); + parser->current_regular_expression_ascii_only = pm_slice_ascii_only_p(parser->current.start, (size_t) (parser->current.end - parser->current.start)); + } else { + pm_buffer_append_bytes(&token_buffer->base.buffer, token_buffer->base.cursor, (size_t) (parser->current.end - token_buffer->base.cursor)); + pm_buffer_append_bytes(&token_buffer->regexp_buffer, token_buffer->base.cursor, (size_t) (parser->current.end - token_buffer->base.cursor)); + pm_regexp_token_buffer_copy(parser, token_buffer); + } +} + +#define PM_TOKEN_BUFFER_DEFAULT_SIZE 16 + +/** + * When we've found an escape sequence, we need to copy everything up to this + * point into the buffer because we're about to provide a string that has + * different content than a direct slice of the source. + * + * It is expected that the parser's current token end will be pointing at one + * byte past the backslash that starts the escape sequence. 
+ */ +static void +pm_token_buffer_escape(pm_parser_t *parser, pm_token_buffer_t *token_buffer) { + const uint8_t *start; + if (token_buffer->cursor == NULL) { + pm_buffer_init_capacity(&token_buffer->buffer, PM_TOKEN_BUFFER_DEFAULT_SIZE); + start = parser->current.start; + } else { + start = token_buffer->cursor; + } + + const uint8_t *end = parser->current.end - 1; + assert(end >= start); + pm_buffer_append_bytes(&token_buffer->buffer, start, (size_t) (end - start)); + + token_buffer->cursor = end; +} + +static void +pm_regexp_token_buffer_escape(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) { + const uint8_t *start; + if (token_buffer->base.cursor == NULL) { + pm_buffer_init_capacity(&token_buffer->base.buffer, PM_TOKEN_BUFFER_DEFAULT_SIZE); + pm_buffer_init_capacity(&token_buffer->regexp_buffer, PM_TOKEN_BUFFER_DEFAULT_SIZE); + start = parser->current.start; + } else { + start = token_buffer->base.cursor; + } + + const uint8_t *end = parser->current.end - 1; + pm_buffer_append_bytes(&token_buffer->base.buffer, start, (size_t) (end - start)); + pm_buffer_append_bytes(&token_buffer->regexp_buffer, start, (size_t) (end - start)); + + token_buffer->base.cursor = end; +} + +#undef PM_TOKEN_BUFFER_DEFAULT_SIZE + +/** + * Effectively the same thing as pm_strspn_inline_whitespace, but in the case of + * a tilde heredoc expands out tab characters to the nearest tab boundaries. + */ +static inline size_t +pm_heredoc_strspn_inline_whitespace(pm_parser_t *parser, const uint8_t **cursor, pm_heredoc_indent_t indent) { + size_t whitespace = 0; + + switch (indent) { + case PM_HEREDOC_INDENT_NONE: + // Do nothing, we can't match a terminator with + // indentation and there's no need to calculate common + // whitespace. + break; + case PM_HEREDOC_INDENT_DASH: + // Skip past inline whitespace. 
+ *cursor += pm_strspn_inline_whitespace(*cursor, parser->end - *cursor); + break; + case PM_HEREDOC_INDENT_TILDE: + // Skip past inline whitespace and calculate common + // whitespace. + while (*cursor < parser->end && pm_char_is_inline_whitespace(**cursor)) { + if (**cursor == '\t') { + whitespace = (whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE; + } else { + whitespace++; + } + (*cursor)++; + } + + break; + } + + return whitespace; +} + +/** + * Lex past the delimiter of a percent literal. Handle newlines and heredocs + * appropriately. + */ +static uint8_t +pm_lex_percent_delimiter(pm_parser_t *parser) { + size_t eol_length = match_eol(parser); + + if (eol_length) { + if (parser->heredoc_end) { + // If we have already lexed a heredoc, then the newline has already + // been added to the list. In this case we want to just flush the + // heredoc end. + parser_flush_heredoc_end(parser); + } else { + // Otherwise, we'll add the newline to the list of newlines. + pm_newline_list_append(&parser->newline_list, parser->current.end + eol_length - 1); + } + + uint8_t delimiter = *parser->current.end; + + // If our delimiter is \r\n, we want to treat it as if it's \n. + // For example, %\r\nfoo\r\n should be "foo" + if (eol_length == 2) { + delimiter = *(parser->current.end + 1); + } + + parser->current.end += eol_length; + return delimiter; + } + + return *parser->current.end++; +} + +/** + * This is a convenience macro that will set the current token type, call the + * lex callback, and then return from the parser_lex function. + */ +#define LEX(token_type) parser->current.type = token_type; parser_lex_callback(parser); return + +/** + * Called when the parser requires a new token. The parser maintains a moving + * window of two tokens at a time: parser.previous and parser.current. This + * function will move the current token into the previous token and then + * lex a new token into the current token. 
+ */ +static void +parser_lex(pm_parser_t *parser) { + assert(parser->current.end <= parser->end); + parser->previous = parser->current; + + // This value mirrors cmd_state from CRuby. + bool previous_command_start = parser->command_start; + parser->command_start = false; + + // This is used to communicate to the newline lexing function that we've + // already seen a comment. + bool lexed_comment = false; + + // Here we cache the current value of the semantic token seen flag. This is + // used to reset it in case we find a token that shouldn't flip this flag. + unsigned int semantic_token_seen = parser->semantic_token_seen; + parser->semantic_token_seen = true; + + switch (parser->lex_modes.current->mode) { + case PM_LEX_DEFAULT: + case PM_LEX_EMBEXPR: + case PM_LEX_EMBVAR: + + // We have a specific named label here because we are going to jump back to + // this location in the event that we have lexed a token that should not be + // returned to the parser. This includes comments, ignored newlines, and + // invalid tokens of some form. + lex_next_token: { + // If we have the special next_start pointer set, then we're going to jump + // to that location and start lexing from there. + if (parser->next_start != NULL) { + parser->current.end = parser->next_start; + parser->next_start = NULL; + } + + // This value mirrors space_seen from CRuby. It tracks whether or not + // space has been eaten before the start of the next token. + bool space_seen = false; + + // First, we're going to skip past any whitespace at the front of the next + // token. 
+ bool chomping = true; + while (parser->current.end < parser->end && chomping) { + switch (*parser->current.end) { + case ' ': + case '\t': + case '\f': + case '\v': + parser->current.end++; + space_seen = true; + break; + case '\r': + if (match_eol_offset(parser, 1)) { + chomping = false; + } else { + pm_parser_warn(parser, parser->current.end, parser->current.end + 1, PM_WARN_UNEXPECTED_CARRIAGE_RETURN); + parser->current.end++; + space_seen = true; + } + break; + case '\\': { + size_t eol_length = match_eol_offset(parser, 1); + if (eol_length) { + if (parser->heredoc_end) { + parser->current.end = parser->heredoc_end; + parser->heredoc_end = NULL; + } else { + parser->current.end += eol_length + 1; + pm_newline_list_append(&parser->newline_list, parser->current.end - 1); + space_seen = true; + } + } else if (pm_char_is_inline_whitespace(*parser->current.end)) { + parser->current.end += 2; + } else { + chomping = false; + } + + break; + } + default: + chomping = false; + break; + } + } + + // Next, we'll set to start of this token to be the current end. + parser->current.start = parser->current.end; + + // We'll check if we're at the end of the file. If we are, then we + // need to return the EOF token. + if (parser->current.end >= parser->end) { + // If we hit EOF, but the EOF came immediately after a newline, + // set the start of the token to the newline. This way any EOF + // errors will be reported as happening on that line rather than + // a line after. For example "foo(\n" should report an error + // on line 1 even though EOF technically occurs on line 2. + if (parser->current.start > parser->start && (*(parser->current.start - 1) == '\n')) { + parser->current.start -= 1; + } + LEX(PM_TOKEN_EOF); + } + + // Finally, we'll check the current character to determine the next + // token. 
+ switch (*parser->current.end++) { + case '\0': // NUL or end of script + case '\004': // ^D + case '\032': // ^Z + parser->current.end--; + LEX(PM_TOKEN_EOF); + + case '#': { // comments + const uint8_t *ending = next_newline(parser->current.end, parser->end - parser->current.end); + parser->current.end = ending == NULL ? parser->end : ending; + + // If we found a comment while lexing, then we're going to + // add it to the list of comments in the file and keep + // lexing. + pm_comment_t *comment = parser_comment(parser, PM_COMMENT_INLINE); + pm_list_append(&parser->comment_list, (pm_list_node_t *) comment); + + if (ending) parser->current.end++; + parser->current.type = PM_TOKEN_COMMENT; + parser_lex_callback(parser); + + // Here, parse the comment to see if it's a magic comment + // and potentially change state on the parser. + if (!parser_lex_magic_comment(parser, semantic_token_seen) && (parser->current.start == parser->encoding_comment_start)) { + ptrdiff_t length = parser->current.end - parser->current.start; + + // If we didn't find a magic comment within the first + // pass and we're at the start of the file, then we need + // to do another pass to potentially find other patterns + // for encoding comments. + if (length >= 10 && !parser->encoding_locked) { + parser_lex_magic_comment_encoding(parser); + } + } + + lexed_comment = true; + } + PRISM_FALLTHROUGH + case '\r': + case '\n': { + parser->semantic_token_seen = semantic_token_seen & 0x1; + size_t eol_length = match_eol_at(parser, parser->current.end - 1); + + if (eol_length) { + // The only way you can have carriage returns in this + // particular loop is if you have a carriage return + // followed by a newline. In that case we'll just skip + // over the carriage return and continue lexing, in + // order to make it so that the newline token + // encapsulates both the carriage return and the + // newline. 
Note that we need to check that we haven't + // already lexed a comment here because that falls + // through into here as well. + if (!lexed_comment) { + parser->current.end += eol_length - 1; // skip CR + } + + if (parser->heredoc_end == NULL) { + pm_newline_list_append(&parser->newline_list, parser->current.end - 1); + } + } + + if (parser->heredoc_end) { + parser_flush_heredoc_end(parser); + } + + // If this is an ignored newline, then we can continue lexing after + // calling the callback with the ignored newline token. + switch (lex_state_ignored_p(parser)) { + case PM_IGNORED_NEWLINE_NONE: + break; + case PM_IGNORED_NEWLINE_PATTERN: + if (parser->pattern_matching_newlines || parser->in_keyword_arg) { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->command_start = true; + parser->current.type = PM_TOKEN_NEWLINE; + return; + } + PRISM_FALLTHROUGH + case PM_IGNORED_NEWLINE_ALL: + if (!lexed_comment) parser_lex_ignored_newline(parser); + lexed_comment = false; + goto lex_next_token; + } + + // Here we need to look ahead and see if there is a call operator + // (either . or &.) that starts the next line. If there is, then this + // is going to become an ignored newline and we're going to instead + // return the call operator. + const uint8_t *next_content = parser->next_start == NULL ? parser->current.end : parser->next_start; + next_content += pm_strspn_inline_whitespace(next_content, parser->end - next_content); + + if (next_content < parser->end) { + // If we hit a comment after a newline, then we're going to check + // if it's ignored or if it's followed by a method call ('.'). + // If it is, then we're going to call the + // callback with an ignored newline and then continue lexing. + // Otherwise we'll return a regular newline. + if (next_content[0] == '#') { + // Here we look for a "." or "&." following a "\n". 
+ const uint8_t *following = next_newline(next_content, parser->end - next_content); + + while (following && (following + 1 < parser->end)) { + following++; + following += pm_strspn_inline_whitespace(following, parser->end - following); + + // If this is not followed by a comment, then we can break out + // of this loop. + if (peek_at(parser, following) != '#') break; + + // If there is a comment, then we need to find the end of the + // comment and continue searching from there. + following = next_newline(following, parser->end - following); + } + + // If the lex state was ignored, we will lex the + // ignored newline. + if (lex_state_ignored_p(parser)) { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lexed_comment = false; + goto lex_next_token; + } + + // If we hit a '.' or a '&.' we will lex the ignored + // newline. + if (following && ( + (peek_at(parser, following) == '.') || + (peek_at(parser, following) == '&' && peek_at(parser, following + 1) == '.') + )) { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lexed_comment = false; + goto lex_next_token; + } + + + // If we are parsing as CRuby 4.0 or later and we + // hit a '&&' or a '||' then we will lex the ignored + // newline. + if ( + (parser->version >= PM_OPTIONS_VERSION_CRUBY_4_0) && + following && ( + (peek_at(parser, following) == '&' && peek_at(parser, following + 1) == '&') || + (peek_at(parser, following) == '|' && peek_at(parser, following + 1) == '|') || + (peek_at(parser, following) == 'a' && peek_at(parser, following + 1) == 'n' && peek_at(parser, following + 2) == 'd' && !char_is_identifier(parser, following + 3, parser->end - (following + 3))) || + (peek_at(parser, following) == 'o' && peek_at(parser, following + 1) == 'r' && !char_is_identifier(parser, following + 2, parser->end - (following + 2))) + ) + ) { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lexed_comment = false; + goto lex_next_token; + } + } + + // If we hit a . 
after a newline, then we're in a call chain and + // we need to return the call operator. + if (next_content[0] == '.') { + // To match ripper, we need to emit an ignored newline even though + // it's a real newline in the case that we have a beginless range + // on a subsequent line. + if (peek_at(parser, next_content + 1) == '.') { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->command_start = true; + parser->current.type = PM_TOKEN_NEWLINE; + return; + } + + if (!lexed_comment) parser_lex_ignored_newline(parser); + lex_state_set(parser, PM_LEX_STATE_DOT); + parser->current.start = next_content; + parser->current.end = next_content + 1; + parser->next_start = NULL; + LEX(PM_TOKEN_DOT); + } + + // If we hit a &. after a newline, then we're in a call chain and + // we need to return the call operator. + if (peek_at(parser, next_content) == '&' && peek_at(parser, next_content + 1) == '.') { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lex_state_set(parser, PM_LEX_STATE_DOT); + parser->current.start = next_content; + parser->current.end = next_content + 2; + parser->next_start = NULL; + LEX(PM_TOKEN_AMPERSAND_DOT); + } + + if (parser->version >= PM_OPTIONS_VERSION_CRUBY_4_0) { + // If we hit an && then we are in a logical chain + // and we need to return the logical operator. + if (peek_at(parser, next_content) == '&' && peek_at(parser, next_content + 1) == '&') { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->current.start = next_content; + parser->current.end = next_content + 2; + parser->next_start = NULL; + LEX(PM_TOKEN_AMPERSAND_AMPERSAND); + } + + // If we hit a || then we are in a logical chain and + // we need to return the logical operator. 
+ if (peek_at(parser, next_content) == '|' && peek_at(parser, next_content + 1) == '|') { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->current.start = next_content; + parser->current.end = next_content + 2; + parser->next_start = NULL; + LEX(PM_TOKEN_PIPE_PIPE); + } + + // If we hit an 'and' then we are in a logical chain + // and we need to return the logical operator. + if ( + peek_at(parser, next_content) == 'a' && + peek_at(parser, next_content + 1) == 'n' && + peek_at(parser, next_content + 2) == 'd' && + !char_is_identifier(parser, next_content + 3, parser->end - (next_content + 3)) + ) { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->current.start = next_content; + parser->current.end = next_content + 3; + parser->next_start = NULL; + parser->command_start = true; + LEX(PM_TOKEN_KEYWORD_AND); + } + + // If we hit a 'or' then we are in a logical chain + // and we need to return the logical operator. + if ( + peek_at(parser, next_content) == 'o' && + peek_at(parser, next_content + 1) == 'r' && + !char_is_identifier(parser, next_content + 2, parser->end - (next_content + 2)) + ) { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->current.start = next_content; + parser->current.end = next_content + 2; + parser->next_start = NULL; + parser->command_start = true; + LEX(PM_TOKEN_KEYWORD_OR); + } + } + } + + // At this point we know this is a regular newline, and we can set the + // necessary state and return the token. 
+ lex_state_set(parser, PM_LEX_STATE_BEG); + parser->command_start = true; + parser->current.type = PM_TOKEN_NEWLINE; + if (!lexed_comment) parser_lex_callback(parser); + return; + } + + // , + case ',': + if ((parser->previous.type == PM_TOKEN_COMMA) && (parser->enclosure_nesting > 0)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_ARRAY_TERM, pm_token_type_human(parser->current.type)); + } + + lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); + LEX(PM_TOKEN_COMMA); + + // ( + case '(': { + pm_token_type_t type = PM_TOKEN_PARENTHESIS_LEFT; + + if (space_seen && (lex_state_arg_p(parser) || parser->lex_state == (PM_LEX_STATE_END | PM_LEX_STATE_LABEL))) { + type = PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES; + } + + parser->enclosure_nesting++; + lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); + pm_do_loop_stack_push(parser, false); + LEX(type); + } + + // ) + case ')': + parser->enclosure_nesting--; + lex_state_set(parser, PM_LEX_STATE_ENDFN); + pm_do_loop_stack_pop(parser); + LEX(PM_TOKEN_PARENTHESIS_RIGHT); + + // ; + case ';': + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->command_start = true; + LEX(PM_TOKEN_SEMICOLON); + + // [ [] []= + case '[': + parser->enclosure_nesting++; + pm_token_type_t type = PM_TOKEN_BRACKET_LEFT; + + if (lex_state_operator_p(parser)) { + if (match(parser, ']')) { + parser->enclosure_nesting--; + lex_state_set(parser, PM_LEX_STATE_ARG); + LEX(match(parser, '=') ? 
PM_TOKEN_BRACKET_LEFT_RIGHT_EQUAL : PM_TOKEN_BRACKET_LEFT_RIGHT); + } + + lex_state_set(parser, PM_LEX_STATE_ARG | PM_LEX_STATE_LABEL); + LEX(type); + } + + if (lex_state_beg_p(parser) || (lex_state_arg_p(parser) && (space_seen || lex_state_p(parser, PM_LEX_STATE_LABELED)))) { + type = PM_TOKEN_BRACKET_LEFT_ARRAY; + } + + lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); + pm_do_loop_stack_push(parser, false); + LEX(type); + + // ] + case ']': + parser->enclosure_nesting--; + lex_state_set(parser, PM_LEX_STATE_END); + pm_do_loop_stack_pop(parser); + LEX(PM_TOKEN_BRACKET_RIGHT); + + // { + case '{': { + pm_token_type_t type = PM_TOKEN_BRACE_LEFT; + + if (parser->enclosure_nesting == parser->lambda_enclosure_nesting) { + // This { begins a lambda + parser->command_start = true; + lex_state_set(parser, PM_LEX_STATE_BEG); + type = PM_TOKEN_LAMBDA_BEGIN; + } else if (lex_state_p(parser, PM_LEX_STATE_LABELED)) { + // This { begins a hash literal + lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); + } else if (lex_state_p(parser, PM_LEX_STATE_ARG_ANY | PM_LEX_STATE_END | PM_LEX_STATE_ENDFN)) { + // This { begins a block + parser->command_start = true; + lex_state_set(parser, PM_LEX_STATE_BEG); + } else if (lex_state_p(parser, PM_LEX_STATE_ENDARG)) { + // This { begins a block on a command + parser->command_start = true; + lex_state_set(parser, PM_LEX_STATE_BEG); + } else { + // This { begins a hash literal + lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); + } + + parser->enclosure_nesting++; + parser->brace_nesting++; + pm_do_loop_stack_push(parser, false); + + LEX(type); + } + + // } + case '}': + parser->enclosure_nesting--; + pm_do_loop_stack_pop(parser); + + if ((parser->lex_modes.current->mode == PM_LEX_EMBEXPR) && (parser->brace_nesting == 0)) { + lex_mode_pop(parser); + LEX(PM_TOKEN_EMBEXPR_END); + } + + parser->brace_nesting--; + lex_state_set(parser, PM_LEX_STATE_END); + LEX(PM_TOKEN_BRACE_RIGHT); + + // * ** **= *= + case 
'*': { + if (match(parser, '*')) { + if (match(parser, '=')) { + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_STAR_STAR_EQUAL); + } + + pm_token_type_t type = PM_TOKEN_STAR_STAR; + + if (lex_state_spcarg_p(parser, space_seen)) { + pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_PREFIX_STAR_STAR); + type = PM_TOKEN_USTAR_STAR; + } else if (lex_state_beg_p(parser)) { + type = PM_TOKEN_USTAR_STAR; + } else if (ambiguous_operator_p(parser, space_seen)) { + PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "**", "argument prefix"); + } + + if (lex_state_operator_p(parser)) { + lex_state_set(parser, PM_LEX_STATE_ARG); + } else { + lex_state_set(parser, PM_LEX_STATE_BEG); + } + + LEX(type); + } + + if (match(parser, '=')) { + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_STAR_EQUAL); + } + + pm_token_type_t type = PM_TOKEN_STAR; + + if (lex_state_spcarg_p(parser, space_seen)) { + pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_PREFIX_STAR); + type = PM_TOKEN_USTAR; + } else if (lex_state_beg_p(parser)) { + type = PM_TOKEN_USTAR; + } else if (ambiguous_operator_p(parser, space_seen)) { + PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "*", "argument prefix"); + } + + if (lex_state_operator_p(parser)) { + lex_state_set(parser, PM_LEX_STATE_ARG); + } else { + lex_state_set(parser, PM_LEX_STATE_BEG); + } + + LEX(type); + } + + // ! 
!= !~ !@ + case '!': + if (lex_state_operator_p(parser)) { + lex_state_set(parser, PM_LEX_STATE_ARG); + if (match(parser, '@')) { + LEX(PM_TOKEN_BANG); + } + } else { + lex_state_set(parser, PM_LEX_STATE_BEG); + } + + if (match(parser, '=')) { + LEX(PM_TOKEN_BANG_EQUAL); + } + + if (match(parser, '~')) { + LEX(PM_TOKEN_BANG_TILDE); + } + + LEX(PM_TOKEN_BANG); + + // = => =~ == === =begin + case '=': + if ( + current_token_starts_line(parser) && + (parser->current.end + 5 <= parser->end) && + memcmp(parser->current.end, "begin", 5) == 0 && + (pm_char_is_whitespace(peek_offset(parser, 5)) || (peek_offset(parser, 5) == '\0')) + ) { + pm_token_type_t type = lex_embdoc(parser); + if (type == PM_TOKEN_EOF) { + LEX(type); + } + + goto lex_next_token; + } + + if (lex_state_operator_p(parser)) { + lex_state_set(parser, PM_LEX_STATE_ARG); + } else { + lex_state_set(parser, PM_LEX_STATE_BEG); + } + + if (match(parser, '>')) { + LEX(PM_TOKEN_EQUAL_GREATER); + } + + if (match(parser, '~')) { + LEX(PM_TOKEN_EQUAL_TILDE); + } + + if (match(parser, '=')) { + LEX(match(parser, '=') ? 
PM_TOKEN_EQUAL_EQUAL_EQUAL : PM_TOKEN_EQUAL_EQUAL); + } + + LEX(PM_TOKEN_EQUAL); + + // < << <<= <= <=> + case '<': + if (match(parser, '<')) { + if ( + !lex_state_p(parser, PM_LEX_STATE_DOT | PM_LEX_STATE_CLASS) && + !lex_state_end_p(parser) && + (!lex_state_p(parser, PM_LEX_STATE_ARG_ANY) || lex_state_p(parser, PM_LEX_STATE_LABELED) || space_seen) + ) { + const uint8_t *end = parser->current.end; + + pm_heredoc_quote_t quote = PM_HEREDOC_QUOTE_NONE; + pm_heredoc_indent_t indent = PM_HEREDOC_INDENT_NONE; + + if (match(parser, '-')) { + indent = PM_HEREDOC_INDENT_DASH; + } + else if (match(parser, '~')) { + indent = PM_HEREDOC_INDENT_TILDE; + } + + if (match(parser, '`')) { + quote = PM_HEREDOC_QUOTE_BACKTICK; + } + else if (match(parser, '"')) { + quote = PM_HEREDOC_QUOTE_DOUBLE; + } + else if (match(parser, '\'')) { + quote = PM_HEREDOC_QUOTE_SINGLE; + } + + const uint8_t *ident_start = parser->current.end; + size_t width = 0; + + if (parser->current.end >= parser->end) { + parser->current.end = end; + } else if (quote == PM_HEREDOC_QUOTE_NONE && (width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) == 0) { + parser->current.end = end; + } else { + if (quote == PM_HEREDOC_QUOTE_NONE) { + parser->current.end += width; + + while ((width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end))) { + parser->current.end += width; + } + } else { + // If we have quotes, then we're going to go until we find the + // end quote. 
+ while ((parser->current.end < parser->end) && quote != (pm_heredoc_quote_t) (*parser->current.end)) { + if (*parser->current.end == '\r' || *parser->current.end == '\n') break; + parser->current.end++; + } + } + + size_t ident_length = (size_t) (parser->current.end - ident_start); + bool ident_error = false; + + if (quote != PM_HEREDOC_QUOTE_NONE && !match(parser, (uint8_t) quote)) { + pm_parser_err(parser, ident_start, ident_start + ident_length, PM_ERR_HEREDOC_IDENTIFIER); + ident_error = true; + } + + parser->explicit_encoding = NULL; + lex_mode_push(parser, (pm_lex_mode_t) { + .mode = PM_LEX_HEREDOC, + .as.heredoc = { + .base = { + .ident_start = ident_start, + .ident_length = ident_length, + .quote = quote, + .indent = indent + }, + .next_start = parser->current.end, + .common_whitespace = NULL, + .line_continuation = false + } + }); + + if (parser->heredoc_end == NULL) { + const uint8_t *body_start = next_newline(parser->current.end, parser->end - parser->current.end); + + if (body_start == NULL) { + // If there is no newline after the heredoc identifier, then + // this is not a valid heredoc declaration. In this case we + // will add an error, but we will still return a heredoc + // start. + if (!ident_error) pm_parser_err_heredoc_term(parser, ident_start, ident_length); + body_start = parser->end; + } else { + // Otherwise, we want to indicate that the body of the + // heredoc starts on the character after the next newline. 
+ pm_newline_list_append(&parser->newline_list, body_start); + body_start++; + } + + parser->next_start = body_start; + } else { + parser->next_start = parser->heredoc_end; + } + + LEX(PM_TOKEN_HEREDOC_START); + } + } + + if (match(parser, '=')) { + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_LESS_LESS_EQUAL); + } + + if (ambiguous_operator_p(parser, space_seen)) { + PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "<<", "here document"); + } + + if (lex_state_operator_p(parser)) { + lex_state_set(parser, PM_LEX_STATE_ARG); + } else { + if (lex_state_p(parser, PM_LEX_STATE_CLASS)) parser->command_start = true; + lex_state_set(parser, PM_LEX_STATE_BEG); + } + + LEX(PM_TOKEN_LESS_LESS); + } + + if (lex_state_operator_p(parser)) { + lex_state_set(parser, PM_LEX_STATE_ARG); + } else { + if (lex_state_p(parser, PM_LEX_STATE_CLASS)) parser->command_start = true; + lex_state_set(parser, PM_LEX_STATE_BEG); + } + + if (match(parser, '=')) { + if (match(parser, '>')) { + LEX(PM_TOKEN_LESS_EQUAL_GREATER); + } + + LEX(PM_TOKEN_LESS_EQUAL); + } + + LEX(PM_TOKEN_LESS); + + // > >> >>= >= + case '>': + if (match(parser, '>')) { + if (lex_state_operator_p(parser)) { + lex_state_set(parser, PM_LEX_STATE_ARG); + } else { + lex_state_set(parser, PM_LEX_STATE_BEG); + } + LEX(match(parser, '=') ? PM_TOKEN_GREATER_GREATER_EQUAL : PM_TOKEN_GREATER_GREATER); + } + + if (lex_state_operator_p(parser)) { + lex_state_set(parser, PM_LEX_STATE_ARG); + } else { + lex_state_set(parser, PM_LEX_STATE_BEG); + } + + LEX(match(parser, '=') ? 
PM_TOKEN_GREATER_EQUAL : PM_TOKEN_GREATER); + + // double-quoted string literal + case '"': { + bool label_allowed = (lex_state_p(parser, PM_LEX_STATE_LABEL | PM_LEX_STATE_ENDFN) && !previous_command_start) || lex_state_arg_p(parser); + lex_mode_push_string(parser, true, label_allowed, '\0', '"'); + LEX(PM_TOKEN_STRING_BEGIN); + } + + // xstring literal + case '`': { + if (lex_state_p(parser, PM_LEX_STATE_FNAME)) { + lex_state_set(parser, PM_LEX_STATE_ENDFN); + LEX(PM_TOKEN_BACKTICK); + } + + if (lex_state_p(parser, PM_LEX_STATE_DOT)) { + if (previous_command_start) { + lex_state_set(parser, PM_LEX_STATE_CMDARG); + } else { + lex_state_set(parser, PM_LEX_STATE_ARG); + } + + LEX(PM_TOKEN_BACKTICK); + } + + lex_mode_push_string(parser, true, false, '\0', '`'); + LEX(PM_TOKEN_BACKTICK); + } + + // single-quoted string literal + case '\'': { + bool label_allowed = (lex_state_p(parser, PM_LEX_STATE_LABEL | PM_LEX_STATE_ENDFN) && !previous_command_start) || lex_state_arg_p(parser); + lex_mode_push_string(parser, false, label_allowed, '\0', '\''); + LEX(PM_TOKEN_STRING_BEGIN); + } + + // ? 
character literal + case '?': + LEX(lex_question_mark(parser)); + + // & && &&= &= + case '&': { + if (match(parser, '&')) { + lex_state_set(parser, PM_LEX_STATE_BEG); + + if (match(parser, '=')) { + LEX(PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); + } + + LEX(PM_TOKEN_AMPERSAND_AMPERSAND); + } + + if (match(parser, '=')) { + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_AMPERSAND_EQUAL); + } + + if (match(parser, '.')) { + lex_state_set(parser, PM_LEX_STATE_DOT); + LEX(PM_TOKEN_AMPERSAND_DOT); + } + + pm_token_type_t type = PM_TOKEN_AMPERSAND; + if (lex_state_spcarg_p(parser, space_seen)) { + if ((peek(parser) != ':') || (peek_offset(parser, 1) == '\0')) { + pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_PREFIX_AMPERSAND); + } else { + const uint8_t delim = peek_offset(parser, 1); + + if ((delim != '\'') && (delim != '"') && !char_is_identifier(parser, parser->current.end + 1, parser->end - (parser->current.end + 1))) { + pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_PREFIX_AMPERSAND); + } + } + + type = PM_TOKEN_UAMPERSAND; + } else if (lex_state_beg_p(parser)) { + type = PM_TOKEN_UAMPERSAND; + } else if (ambiguous_operator_p(parser, space_seen)) { + PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "&", "argument prefix"); + } + + if (lex_state_operator_p(parser)) { + lex_state_set(parser, PM_LEX_STATE_ARG); + } else { + lex_state_set(parser, PM_LEX_STATE_BEG); + } + + LEX(type); + } + + // | || ||= |= + case '|': + if (match(parser, '|')) { + if (match(parser, '=')) { + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_PIPE_PIPE_EQUAL); + } + + if (lex_state_p(parser, PM_LEX_STATE_BEG)) { + parser->current.end--; + LEX(PM_TOKEN_PIPE); + } + + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_PIPE_PIPE); + } + + if (match(parser, '=')) { + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_PIPE_EQUAL); + } + + if (lex_state_operator_p(parser)) { + lex_state_set(parser, 
PM_LEX_STATE_ARG); + } else { + lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); + } + + LEX(PM_TOKEN_PIPE); + + // + += +@ + case '+': { + if (lex_state_operator_p(parser)) { + lex_state_set(parser, PM_LEX_STATE_ARG); + + if (match(parser, '@')) { + LEX(PM_TOKEN_UPLUS); + } + + LEX(PM_TOKEN_PLUS); + } + + if (match(parser, '=')) { + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_PLUS_EQUAL); + } + + if ( + lex_state_beg_p(parser) || + (lex_state_spcarg_p(parser, space_seen) ? (pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS), true) : false) + ) { + lex_state_set(parser, PM_LEX_STATE_BEG); + + if (pm_char_is_decimal_digit(peek(parser))) { + parser->current.end++; + pm_token_type_t type = lex_numeric(parser); + lex_state_set(parser, PM_LEX_STATE_END); + LEX(type); + } + + LEX(PM_TOKEN_UPLUS); + } + + if (ambiguous_operator_p(parser, space_seen)) { + PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "+", "unary operator"); + } + + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_PLUS); + } + + // - -= -@ + case '-': { + if (lex_state_operator_p(parser)) { + lex_state_set(parser, PM_LEX_STATE_ARG); + + if (match(parser, '@')) { + LEX(PM_TOKEN_UMINUS); + } + + LEX(PM_TOKEN_MINUS); + } + + if (match(parser, '=')) { + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_MINUS_EQUAL); + } + + if (match(parser, '>')) { + lex_state_set(parser, PM_LEX_STATE_ENDFN); + LEX(PM_TOKEN_MINUS_GREATER); + } + + bool spcarg = lex_state_spcarg_p(parser, space_seen); + bool is_beg = lex_state_beg_p(parser); + if (!is_beg && spcarg) { + pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_MINUS); + } + + if (is_beg || spcarg) { + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(pm_char_is_decimal_digit(peek(parser)) ? 
PM_TOKEN_UMINUS_NUM : PM_TOKEN_UMINUS); + } + + if (ambiguous_operator_p(parser, space_seen)) { + PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "-", "unary operator"); + } + + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_MINUS); + } + + // . .. ... + case '.': { + bool beg_p = lex_state_beg_p(parser); + + if (match(parser, '.')) { + if (match(parser, '.')) { + // If we're _not_ inside a range within default parameters + if (!context_p(parser, PM_CONTEXT_DEFAULT_PARAMS) && context_p(parser, PM_CONTEXT_DEF_PARAMS)) { + if (lex_state_p(parser, PM_LEX_STATE_END)) { + lex_state_set(parser, PM_LEX_STATE_BEG); + } else { + lex_state_set(parser, PM_LEX_STATE_ENDARG); + } + LEX(PM_TOKEN_UDOT_DOT_DOT); + } + + if (parser->enclosure_nesting == 0 && parser_end_of_line_p(parser)) { + pm_parser_warn_token(parser, &parser->current, PM_WARN_DOT_DOT_DOT_EOL); + } + + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(beg_p ? PM_TOKEN_UDOT_DOT_DOT : PM_TOKEN_DOT_DOT_DOT); + } + + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(beg_p ? 
PM_TOKEN_UDOT_DOT : PM_TOKEN_DOT_DOT); + } + + lex_state_set(parser, PM_LEX_STATE_DOT); + LEX(PM_TOKEN_DOT); + } + + // integer + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + pm_token_type_t type = lex_numeric(parser); + lex_state_set(parser, PM_LEX_STATE_END); + LEX(type); + } + + // :: symbol + case ':': + if (match(parser, ':')) { + if (lex_state_beg_p(parser) || lex_state_p(parser, PM_LEX_STATE_CLASS) || (lex_state_p(parser, PM_LEX_STATE_ARG_ANY) && space_seen)) { + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_UCOLON_COLON); + } + + lex_state_set(parser, PM_LEX_STATE_DOT); + LEX(PM_TOKEN_COLON_COLON); + } + + if (lex_state_end_p(parser) || pm_char_is_whitespace(peek(parser)) || peek(parser) == '#') { + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_COLON); + } + + if (peek(parser) == '"' || peek(parser) == '\'') { + lex_mode_push_string(parser, peek(parser) == '"', false, '\0', *parser->current.end); + parser->current.end++; + } + + lex_state_set(parser, PM_LEX_STATE_FNAME); + LEX(PM_TOKEN_SYMBOL_BEGIN); + + // / /= + case '/': + if (lex_state_beg_p(parser)) { + lex_mode_push_regexp(parser, '\0', '/'); + LEX(PM_TOKEN_REGEXP_BEGIN); + } + + if (match(parser, '=')) { + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_SLASH_EQUAL); + } + + if (lex_state_spcarg_p(parser, space_seen)) { + pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_SLASH); + lex_mode_push_regexp(parser, '\0', '/'); + LEX(PM_TOKEN_REGEXP_BEGIN); + } + + if (ambiguous_operator_p(parser, space_seen)) { + PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "/", "regexp literal"); + } + + if (lex_state_operator_p(parser)) { + lex_state_set(parser, PM_LEX_STATE_ARG); + } else { + lex_state_set(parser, PM_LEX_STATE_BEG); + } + + LEX(PM_TOKEN_SLASH); + + // ^ ^= + case '^': + if (lex_state_operator_p(parser)) { + lex_state_set(parser, 
PM_LEX_STATE_ARG); + } else { + lex_state_set(parser, PM_LEX_STATE_BEG); + } + LEX(match(parser, '=') ? PM_TOKEN_CARET_EQUAL : PM_TOKEN_CARET); + + // ~ ~@ + case '~': + if (lex_state_operator_p(parser)) { + (void) match(parser, '@'); + lex_state_set(parser, PM_LEX_STATE_ARG); + } else { + lex_state_set(parser, PM_LEX_STATE_BEG); + } + + LEX(PM_TOKEN_TILDE); + + // % %= %i %I %q %Q %w %W + case '%': { + // If there is no subsequent character then we have an + // invalid token. We're going to say it's the percent + // operator because we don't want to move into the string + // lex mode unnecessarily. + if ((lex_state_beg_p(parser) || lex_state_arg_p(parser)) && (parser->current.end >= parser->end)) { + pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT_EOF); + LEX(PM_TOKEN_PERCENT); + } + + if (!lex_state_beg_p(parser) && match(parser, '=')) { + lex_state_set(parser, PM_LEX_STATE_BEG); + LEX(PM_TOKEN_PERCENT_EQUAL); + } else if ( + lex_state_beg_p(parser) || + (lex_state_p(parser, PM_LEX_STATE_FITEM) && (peek(parser) == 's')) || + lex_state_spcarg_p(parser, space_seen) + ) { + if (!parser->encoding->alnum_char(parser->current.end, parser->end - parser->current.end)) { + if (*parser->current.end >= 0x80) { + pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT); + } + + const uint8_t delimiter = pm_lex_percent_delimiter(parser); + lex_mode_push_string(parser, true, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter)); + LEX(PM_TOKEN_STRING_BEGIN); + } + + // Delimiters for %-literals cannot be alphanumeric. We + // validate that here. 
+ uint8_t delimiter = peek_offset(parser, 1); + if (delimiter >= 0x80 || parser->encoding->alnum_char(&delimiter, 1)) { + pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT); + goto lex_next_token; + } + + switch (peek(parser)) { + case 'i': { + parser->current.end++; + + if (parser->current.end < parser->end) { + lex_mode_push_list(parser, false, pm_lex_percent_delimiter(parser)); + } else { + lex_mode_push_list_eof(parser); + } + + LEX(PM_TOKEN_PERCENT_LOWER_I); + } + case 'I': { + parser->current.end++; + + if (parser->current.end < parser->end) { + lex_mode_push_list(parser, true, pm_lex_percent_delimiter(parser)); + } else { + lex_mode_push_list_eof(parser); + } + + LEX(PM_TOKEN_PERCENT_UPPER_I); + } + case 'r': { + parser->current.end++; + + if (parser->current.end < parser->end) { + const uint8_t delimiter = pm_lex_percent_delimiter(parser); + lex_mode_push_regexp(parser, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter)); + } else { + lex_mode_push_regexp(parser, '\0', '\0'); + } + + LEX(PM_TOKEN_REGEXP_BEGIN); + } + case 'q': { + parser->current.end++; + + if (parser->current.end < parser->end) { + const uint8_t delimiter = pm_lex_percent_delimiter(parser); + lex_mode_push_string(parser, false, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter)); + } else { + lex_mode_push_string_eof(parser); + } + + LEX(PM_TOKEN_STRING_BEGIN); + } + case 'Q': { + parser->current.end++; + + if (parser->current.end < parser->end) { + const uint8_t delimiter = pm_lex_percent_delimiter(parser); + lex_mode_push_string(parser, true, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter)); + } else { + lex_mode_push_string_eof(parser); + } + + LEX(PM_TOKEN_STRING_BEGIN); + } + case 's': { + parser->current.end++; + + if (parser->current.end < parser->end) { + const uint8_t delimiter = pm_lex_percent_delimiter(parser); + lex_mode_push_string(parser, false, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter)); 
+ lex_state_set(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM); + } else { + lex_mode_push_string_eof(parser); + } + + LEX(PM_TOKEN_SYMBOL_BEGIN); + } + case 'w': { + parser->current.end++; + + if (parser->current.end < parser->end) { + lex_mode_push_list(parser, false, pm_lex_percent_delimiter(parser)); + } else { + lex_mode_push_list_eof(parser); + } + + LEX(PM_TOKEN_PERCENT_LOWER_W); + } + case 'W': { + parser->current.end++; + + if (parser->current.end < parser->end) { + lex_mode_push_list(parser, true, pm_lex_percent_delimiter(parser)); + } else { + lex_mode_push_list_eof(parser); + } + + LEX(PM_TOKEN_PERCENT_UPPER_W); + } + case 'x': { + parser->current.end++; + + if (parser->current.end < parser->end) { + const uint8_t delimiter = pm_lex_percent_delimiter(parser); + lex_mode_push_string(parser, true, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter)); + } else { + lex_mode_push_string_eof(parser); + } + + LEX(PM_TOKEN_PERCENT_LOWER_X); + } + default: + // If we get to this point, then we have a % that is completely + // unparsable. In this case we'll just drop it from the parser + // and skip past it and hope that the next token is something + // that we can parse. + pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT); + goto lex_next_token; + } + } + + if (ambiguous_operator_p(parser, space_seen)) { + PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "%", "string literal"); + } + + lex_state_set(parser, lex_state_operator_p(parser) ? PM_LEX_STATE_ARG : PM_LEX_STATE_BEG); + LEX(PM_TOKEN_PERCENT); + } + + // global variable + case '$': { + pm_token_type_t type = lex_global_variable(parser); + + // If we're lexing an embedded variable, then we need to pop back into + // the parent lex context. 
+ if (parser->lex_modes.current->mode == PM_LEX_EMBVAR) { + lex_mode_pop(parser); + } + + lex_state_set(parser, PM_LEX_STATE_END); + LEX(type); + } + + // instance variable, class variable + case '@': + lex_state_set(parser, parser->lex_state & PM_LEX_STATE_FNAME ? PM_LEX_STATE_ENDFN : PM_LEX_STATE_END); + LEX(lex_at_variable(parser)); + + default: { + if (*parser->current.start != '_') { + size_t width = char_is_identifier_start(parser, parser->current.start, parser->end - parser->current.start); + + // If this isn't the beginning of an identifier, then + // it's an invalid token as we've exhausted all of the + // other options. We'll skip past it and return the next + // token after adding an appropriate error message. + if (!width) { + if (*parser->current.start >= 0x80) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *parser->current.start); + } else if (*parser->current.start == '\\') { + switch (peek_at(parser, parser->current.start + 1)) { + case ' ': + parser->current.end++; + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped space"); + break; + case '\f': + parser->current.end++; + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped form feed"); + break; + case '\t': + parser->current.end++; + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped horizontal tab"); + break; + case '\v': + parser->current.end++; + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped vertical tab"); + break; + case '\r': + if (peek_at(parser, parser->current.start + 2) != '\n') { + parser->current.end++; + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped carriage return"); + break; + } + PRISM_FALLTHROUGH + default: + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "backslash"); + break; + } + 
} else if (char_is_ascii_printable(*parser->current.start)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_PRINTABLE_CHARACTER, *parser->current.start); + } else { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_CHARACTER, *parser->current.start); + } + + goto lex_next_token; + } + + parser->current.end = parser->current.start + width; + } + + pm_token_type_t type = lex_identifier(parser, previous_command_start); + + // If we've hit a __END__ and it was at the start of the + // line or the start of the file and it is followed by + // either a \n or a \r\n, then this is the last token of the + // file. + if ( + ((parser->current.end - parser->current.start) == 7) && + current_token_starts_line(parser) && + (memcmp(parser->current.start, "__END__", 7) == 0) && + (parser->current.end == parser->end || match_eol(parser)) + ) { + // Since we know we're about to add an __END__ comment, + // we know we need to add all of the newlines to get the + // correct column information for it. 
+ const uint8_t *cursor = parser->current.end; + while ((cursor = next_newline(cursor, parser->end - cursor)) != NULL) { + pm_newline_list_append(&parser->newline_list, cursor++); + } + + parser->current.end = parser->end; + parser->current.type = PM_TOKEN___END__; + parser_lex_callback(parser); + + parser->data_loc.start = parser->current.start; + parser->data_loc.end = parser->current.end; + + LEX(PM_TOKEN_EOF); + } + + pm_lex_state_t last_state = parser->lex_state; + + if (type == PM_TOKEN_IDENTIFIER || type == PM_TOKEN_CONSTANT || type == PM_TOKEN_METHOD_NAME) { + if (lex_state_p(parser, PM_LEX_STATE_BEG_ANY | PM_LEX_STATE_ARG_ANY | PM_LEX_STATE_DOT)) { + if (previous_command_start) { + lex_state_set(parser, PM_LEX_STATE_CMDARG); + } else { + lex_state_set(parser, PM_LEX_STATE_ARG); + } + } else if (parser->lex_state == PM_LEX_STATE_FNAME) { + lex_state_set(parser, PM_LEX_STATE_ENDFN); + } else { + lex_state_set(parser, PM_LEX_STATE_END); + } + } + + if ( + !(last_state & (PM_LEX_STATE_DOT | PM_LEX_STATE_FNAME)) && + (type == PM_TOKEN_IDENTIFIER) && + ((pm_parser_local_depth(parser, &parser->current) != -1) || + pm_token_is_numbered_parameter(parser->current.start, parser->current.end)) + ) { + lex_state_set(parser, PM_LEX_STATE_END | PM_LEX_STATE_LABEL); + } + + LEX(type); + } + } + } + case PM_LEX_LIST: { + if (parser->next_start != NULL) { + parser->current.end = parser->next_start; + parser->next_start = NULL; + } + + // First we'll set the beginning of the token. + parser->current.start = parser->current.end; + + // If there's any whitespace at the start of the list, then we're + // going to trim it off the beginning and create a new token. 
+ size_t whitespace; + + if (parser->heredoc_end) { + whitespace = pm_strspn_inline_whitespace(parser->current.end, parser->end - parser->current.end); + if (peek_offset(parser, (ptrdiff_t)whitespace) == '\n') { + whitespace += 1; + } + } else { + whitespace = pm_strspn_whitespace_newlines(parser->current.end, parser->end - parser->current.end, &parser->newline_list); + } + + if (whitespace > 0) { + parser->current.end += whitespace; + if (peek_offset(parser, -1) == '\n') { + // mutates next_start + parser_flush_heredoc_end(parser); + } + LEX(PM_TOKEN_WORDS_SEP); + } + + // We'll check if we're at the end of the file. If we are, then we + // need to return the EOF token. + if (parser->current.end >= parser->end) { + LEX(PM_TOKEN_EOF); + } + + // Here we'll get a list of the places where strpbrk should break, + // and then find the first one. + pm_lex_mode_t *lex_mode = parser->lex_modes.current; + const uint8_t *breakpoints = lex_mode->as.list.breakpoints; + const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + + // If we haven't found an escape yet, then this buffer will be + // unallocated since we can refer directly to the source string. + pm_token_buffer_t token_buffer = { 0 }; + + while (breakpoint != NULL) { + // If we hit whitespace, then we must have received content by + // now, so we can return an element of the list. + if (pm_char_is_whitespace(*breakpoint)) { + parser->current.end = breakpoint; + pm_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + + // If we hit the terminator, we need to check which token to + // return. + if (*breakpoint == lex_mode->as.list.terminator) { + // If this terminator doesn't actually close the list, then + // we need to continue on past it. 
+ if (lex_mode->as.list.nesting > 0) { + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + lex_mode->as.list.nesting--; + continue; + } + + // If we've hit the terminator and we've already skipped + // past content, then we can return a list node. + if (breakpoint > parser->current.start) { + parser->current.end = breakpoint; + pm_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + + // Otherwise, switch back to the default state and return + // the end of the list. + parser->current.end = breakpoint + 1; + lex_mode_pop(parser); + lex_state_set(parser, PM_LEX_STATE_END); + LEX(PM_TOKEN_STRING_END); + } + + // If we hit a null byte, skip directly past it. + if (*breakpoint == '\0') { + breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1), true); + continue; + } + + // If we hit escapes, then we need to treat the next token + // literally. In this case we'll skip past the next character + // and find the next breakpoint. + if (*breakpoint == '\\') { + parser->current.end = breakpoint + 1; + + // If we've hit the end of the file, then break out of the + // loop by setting the breakpoint to NULL. + if (parser->current.end == parser->end) { + breakpoint = NULL; + continue; + } + + pm_token_buffer_escape(parser, &token_buffer); + uint8_t peeked = peek(parser); + + switch (peeked) { + case ' ': + case '\f': + case '\t': + case '\v': + case '\\': + pm_token_buffer_push_byte(&token_buffer, peeked); + parser->current.end++; + break; + case '\r': + parser->current.end++; + if (peek(parser) != '\n') { + pm_token_buffer_push_byte(&token_buffer, '\r'); + break; + } + PRISM_FALLTHROUGH + case '\n': + pm_token_buffer_push_byte(&token_buffer, '\n'); + + if (parser->heredoc_end) { + // ... if we are on the same line as a heredoc, + // flush the heredoc and continue parsing after + // heredoc_end. 
+ parser_flush_heredoc_end(parser); + pm_token_buffer_copy(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } else { + // ... else track the newline. + pm_newline_list_append(&parser->newline_list, parser->current.end); + } + + parser->current.end++; + break; + default: + if (peeked == lex_mode->as.list.incrementor || peeked == lex_mode->as.list.terminator) { + pm_token_buffer_push_byte(&token_buffer, peeked); + parser->current.end++; + } else if (lex_mode->as.list.interpolation) { + escape_read(parser, &token_buffer.buffer, NULL, PM_ESCAPE_FLAG_NONE); + } else { + pm_token_buffer_push_byte(&token_buffer, '\\'); + pm_token_buffer_push_escaped(&token_buffer, parser); + } + + break; + } + + token_buffer.cursor = parser->current.end; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + continue; + } + + // If we hit a #, then we will attempt to lex interpolation. + if (*breakpoint == '#') { + pm_token_type_t type = lex_interpolation(parser, breakpoint); + + if (type == PM_TOKEN_NOT_PROVIDED) { + // If we haven't returned at this point then we had something + // that looked like an interpolated class or instance variable + // like "#@" but wasn't actually. In this case we'll just skip + // to the next breakpoint. + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + continue; + } + + if (type == PM_TOKEN_STRING_CONTENT) { + pm_token_buffer_flush(parser, &token_buffer); + } + + LEX(type); + } + + // If we've hit the incrementor, then we need to skip past it + // and find the next breakpoint. 
+ assert(*breakpoint == lex_mode->as.list.incrementor); + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + lex_mode->as.list.nesting++; + continue; + } + + if (parser->current.end > parser->current.start) { + pm_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + + // If we were unable to find a breakpoint, then this token hits the + // end of the file. + parser->current.end = parser->end; + pm_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + case PM_LEX_REGEXP: { + // First, we'll set to start of this token to be the current end. + if (parser->next_start == NULL) { + parser->current.start = parser->current.end; + } else { + parser->current.start = parser->next_start; + parser->current.end = parser->next_start; + parser->next_start = NULL; + } + + // We'll check if we're at the end of the file. If we are, then we + // need to return the EOF token. + if (parser->current.end >= parser->end) { + LEX(PM_TOKEN_EOF); + } + + // Get a reference to the current mode. + pm_lex_mode_t *lex_mode = parser->lex_modes.current; + + // These are the places where we need to split up the content of the + // regular expression. We'll use strpbrk to find the first of these + // characters. 
+ const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints; + const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); + pm_regexp_token_buffer_t token_buffer = { 0 }; + + while (breakpoint != NULL) { + uint8_t term = lex_mode->as.regexp.terminator; + bool is_terminator = (*breakpoint == term); + + // If the terminator is newline, we need to consider \r\n _also_ a newline + // For example: `%r\nfoo\r\n` + // The regexp should be /foo/, not /foo\r/ + if (*breakpoint == '\r' && peek_at(parser, breakpoint + 1) == '\n') { + if (term == '\n') { + is_terminator = true; + } + + // If the terminator is a CR, but we see a CRLF, we need to + // treat the CRLF as a newline, meaning this is _not_ the + // terminator + if (term == '\r') { + is_terminator = false; + } + } + + // If we hit the terminator, we need to determine what kind of + // token to return. + if (is_terminator) { + if (lex_mode->as.regexp.nesting > 0) { + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); + lex_mode->as.regexp.nesting--; + continue; + } + + // Here we've hit the terminator. If we have already consumed + // content then we need to return that content as string content + // first. + if (breakpoint > parser->current.start) { + parser->current.end = breakpoint; + pm_regexp_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + + // Check here if we need to track the newline. + size_t eol_length = match_eol_at(parser, breakpoint); + if (eol_length) { + parser->current.end = breakpoint + eol_length; + + // Track the newline if we're not in a heredoc that + // would already have added the newline to the + // list. 
+ if (parser->heredoc_end == NULL) { + pm_newline_list_append(&parser->newline_list, parser->current.end - 1); + } + } else { + parser->current.end = breakpoint + 1; + } + + // Since we've hit the terminator of the regular expression, + // we now need to parse the options. + parser->current.end += pm_strspn_regexp_option(parser->current.end, parser->end - parser->current.end); + + lex_mode_pop(parser); + lex_state_set(parser, PM_LEX_STATE_END); + LEX(PM_TOKEN_REGEXP_END); + } + + // If we've hit the incrementor, then we need to skip past it + // and find the next breakpoint. + if (*breakpoint && *breakpoint == lex_mode->as.regexp.incrementor) { + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); + lex_mode->as.regexp.nesting++; + continue; + } + + switch (*breakpoint) { + case '\0': + // If we hit a null byte, skip directly past it. + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); + break; + case '\r': + if (peek_at(parser, breakpoint + 1) != '\n') { + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); + break; + } + + breakpoint++; + parser->current.end = breakpoint; + pm_regexp_token_buffer_escape(parser, &token_buffer); + token_buffer.base.cursor = breakpoint; + + PRISM_FALLTHROUGH + case '\n': + // If we've hit a newline, then we need to track that in + // the list of newlines. 
+ if (parser->heredoc_end == NULL) { + pm_newline_list_append(&parser->newline_list, breakpoint); + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); + break; + } + + parser->current.end = breakpoint + 1; + parser_flush_heredoc_end(parser); + pm_regexp_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + case '\\': { + // If we hit escapes, then we need to treat the next + // token literally. In this case we'll skip past the + // next character and find the next breakpoint. + parser->current.end = breakpoint + 1; + + // If we've hit the end of the file, then break out of + // the loop by setting the breakpoint to NULL. + if (parser->current.end == parser->end) { + breakpoint = NULL; + break; + } + + pm_regexp_token_buffer_escape(parser, &token_buffer); + uint8_t peeked = peek(parser); + + switch (peeked) { + case '\r': + parser->current.end++; + if (peek(parser) != '\n') { + if (lex_mode->as.regexp.terminator != '\r') { + pm_token_buffer_push_byte(&token_buffer.base, '\\'); + } + pm_regexp_token_buffer_push_byte(&token_buffer, '\r'); + pm_token_buffer_push_byte(&token_buffer.base, '\r'); + break; + } + PRISM_FALLTHROUGH + case '\n': + if (parser->heredoc_end) { + // ... if we are on the same line as a heredoc, + // flush the heredoc and continue parsing after + // heredoc_end. + parser_flush_heredoc_end(parser); + pm_regexp_token_buffer_copy(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } else { + // ... else track the newline. 
+ pm_newline_list_append(&parser->newline_list, parser->current.end); + } + + parser->current.end++; + break; + case 'c': + case 'C': + case 'M': + case 'u': + case 'x': + escape_read(parser, &token_buffer.regexp_buffer, &token_buffer.base.buffer, PM_ESCAPE_FLAG_REGEXP); + break; + default: + if (lex_mode->as.regexp.terminator == peeked) { + // Some characters when they are used as the + // terminator also receive an escape. They are + // enumerated here. + switch (peeked) { + case '$': case ')': case '*': case '+': + case '.': case '>': case '?': case ']': + case '^': case '|': case '}': + pm_token_buffer_push_byte(&token_buffer.base, '\\'); + break; + default: + break; + } + + pm_regexp_token_buffer_push_byte(&token_buffer, peeked); + pm_token_buffer_push_byte(&token_buffer.base, peeked); + parser->current.end++; + break; + } + + if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer.base, '\\'); + pm_regexp_token_buffer_push_escaped(&token_buffer, parser); + break; + } + + token_buffer.base.cursor = parser->current.end; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); + break; + } + case '#': { + // If we hit a #, then we will attempt to lex + // interpolation. + pm_token_type_t type = lex_interpolation(parser, breakpoint); + + if (type == PM_TOKEN_NOT_PROVIDED) { + // If we haven't returned at this point then we had + // something that looked like an interpolated class or + // instance variable like "#@" but wasn't actually. In + // this case we'll just skip to the next breakpoint. 
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); + break; + } + + if (type == PM_TOKEN_STRING_CONTENT) { + pm_regexp_token_buffer_flush(parser, &token_buffer); + } + + LEX(type); + } + default: + assert(false && "unreachable"); + break; + } + } + + if (parser->current.end > parser->current.start) { + pm_regexp_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + + // If we were unable to find a breakpoint, then this token hits the + // end of the file. + parser->current.end = parser->end; + pm_regexp_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + case PM_LEX_STRING: { + // First, we'll set the start of this token to be the current end. + if (parser->next_start == NULL) { + parser->current.start = parser->current.end; + } else { + parser->current.start = parser->next_start; + parser->current.end = parser->next_start; + parser->next_start = NULL; + } + + // We'll check if we're at the end of the file. If we are, then we need to + // return the EOF token. + if (parser->current.end >= parser->end) { + LEX(PM_TOKEN_EOF); + } + + // These are the places where we need to split up the content of the + // string. We'll use strpbrk to find the first of these characters. + pm_lex_mode_t *lex_mode = parser->lex_modes.current; + const uint8_t *breakpoints = lex_mode->as.string.breakpoints; + const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + + // If we haven't found an escape yet, then this buffer will be + // unallocated since we can refer directly to the source string. + pm_token_buffer_t token_buffer = { 0 }; + + while (breakpoint != NULL) { + // If we hit the incrementor, then we'll increment the nesting and + // continue lexing. 
+ if (lex_mode->as.string.incrementor != '\0' && *breakpoint == lex_mode->as.string.incrementor) { + lex_mode->as.string.nesting++; + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + continue; + } + + uint8_t term = lex_mode->as.string.terminator; + bool is_terminator = (*breakpoint == term); + + // If the terminator is newline, we need to consider \r\n _also_ a newline + // For example: `%r\nfoo\r\n` + // The string should be /foo/, not /foo\r/ + if (*breakpoint == '\r' && peek_at(parser, breakpoint + 1) == '\n') { + if (term == '\n') { + is_terminator = true; + } + + // If the terminator is a CR, but we see a CRLF, we need to + // treat the CRLF as a newline, meaning this is _not_ the + // terminator + if (term == '\r') { + is_terminator = false; + } + } + + // Note that we have to check the terminator here first because we could + // potentially be parsing a % string that has a # character as the + // terminator. + if (is_terminator) { + // If this terminator doesn't actually close the string, then we need + // to continue on past it. + if (lex_mode->as.string.nesting > 0) { + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + lex_mode->as.string.nesting--; + continue; + } + + // Here we've hit the terminator. If we have already consumed content + // then we need to return that content as string content first. + if (breakpoint > parser->current.start) { + parser->current.end = breakpoint; + pm_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + + // Otherwise we need to switch back to the parent lex mode and + // return the end of the string. 
+ size_t eol_length = match_eol_at(parser, breakpoint); + if (eol_length) { + parser->current.end = breakpoint + eol_length; + + // Track the newline if we're not in a heredoc that + // would have already have added the newline to the + // list. + if (parser->heredoc_end == NULL) { + pm_newline_list_append(&parser->newline_list, parser->current.end - 1); + } + } else { + parser->current.end = breakpoint + 1; + } + + if (lex_mode->as.string.label_allowed && (peek(parser) == ':') && (peek_offset(parser, 1) != ':')) { + parser->current.end++; + lex_state_set(parser, PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED); + lex_mode_pop(parser); + LEX(PM_TOKEN_LABEL_END); + } + + // When the delimiter itself is a newline, we won't + // get a chance to flush heredocs in the usual places since + // the newline is already consumed. + if (term == '\n' && parser->heredoc_end) { + parser_flush_heredoc_end(parser); + } + + lex_state_set(parser, PM_LEX_STATE_END); + lex_mode_pop(parser); + LEX(PM_TOKEN_STRING_END); + } + + switch (*breakpoint) { + case '\0': + // Skip directly past the null character. + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + break; + case '\r': + if (peek_at(parser, breakpoint + 1) != '\n') { + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + break; + } + + // If we hit a \r\n sequence, then we need to treat it + // as a newline. + breakpoint++; + parser->current.end = breakpoint; + pm_token_buffer_escape(parser, &token_buffer); + token_buffer.cursor = breakpoint; + + PRISM_FALLTHROUGH + case '\n': + // When we hit a newline, we need to flush any potential + // heredocs. Note that this has to happen after we check + // for the terminator in case the terminator is a + // newline character. 
+ if (parser->heredoc_end == NULL) { + pm_newline_list_append(&parser->newline_list, breakpoint); + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + break; + } + + parser->current.end = breakpoint + 1; + parser_flush_heredoc_end(parser); + pm_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + case '\\': { + // Here we hit escapes. + parser->current.end = breakpoint + 1; + + // If we've hit the end of the file, then break out of + // the loop by setting the breakpoint to NULL. + if (parser->current.end == parser->end) { + breakpoint = NULL; + continue; + } + + pm_token_buffer_escape(parser, &token_buffer); + uint8_t peeked = peek(parser); + + switch (peeked) { + case '\\': + pm_token_buffer_push_byte(&token_buffer, '\\'); + parser->current.end++; + break; + case '\r': + parser->current.end++; + if (peek(parser) != '\n') { + if (!lex_mode->as.string.interpolation) { + pm_token_buffer_push_byte(&token_buffer, '\\'); + } + pm_token_buffer_push_byte(&token_buffer, '\r'); + break; + } + PRISM_FALLTHROUGH + case '\n': + if (!lex_mode->as.string.interpolation) { + pm_token_buffer_push_byte(&token_buffer, '\\'); + pm_token_buffer_push_byte(&token_buffer, '\n'); + } + + if (parser->heredoc_end) { + // ... if we are on the same line as a heredoc, + // flush the heredoc and continue parsing after + // heredoc_end. + parser_flush_heredoc_end(parser); + pm_token_buffer_copy(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } else { + // ... else track the newline. 
+ pm_newline_list_append(&parser->newline_list, parser->current.end); + } + + parser->current.end++; + break; + default: + if (lex_mode->as.string.incrementor != '\0' && peeked == lex_mode->as.string.incrementor) { + pm_token_buffer_push_byte(&token_buffer, peeked); + parser->current.end++; + } else if (lex_mode->as.string.terminator != '\0' && peeked == lex_mode->as.string.terminator) { + pm_token_buffer_push_byte(&token_buffer, peeked); + parser->current.end++; + } else if (lex_mode->as.string.interpolation) { + escape_read(parser, &token_buffer.buffer, NULL, PM_ESCAPE_FLAG_NONE); + } else { + pm_token_buffer_push_byte(&token_buffer, '\\'); + pm_token_buffer_push_escaped(&token_buffer, parser); + } + + break; + } + + token_buffer.cursor = parser->current.end; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + break; + } + case '#': { + pm_token_type_t type = lex_interpolation(parser, breakpoint); + + if (type == PM_TOKEN_NOT_PROVIDED) { + // If we haven't returned at this point then we had something that + // looked like an interpolated class or instance variable like "#@" + // but wasn't actually. In this case we'll just skip to the next + // breakpoint. + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + break; + } + + if (type == PM_TOKEN_STRING_CONTENT) { + pm_token_buffer_flush(parser, &token_buffer); + } + + LEX(type); + } + default: + assert(false && "unreachable"); + } + } + + if (parser->current.end > parser->current.start) { + pm_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + + // If we've hit the end of the string, then this is an unterminated + // string. In that case we'll return a string content token. + parser->current.end = parser->end; + pm_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + case PM_LEX_HEREDOC: { + // First, we'll set to start of this token. 
+ if (parser->next_start == NULL) { + parser->current.start = parser->current.end; + } else { + parser->current.start = parser->next_start; + parser->current.end = parser->next_start; + parser->heredoc_end = NULL; + parser->next_start = NULL; + } + + // Now let's grab the information about the identifier off of the + // current lex mode. + pm_lex_mode_t *lex_mode = parser->lex_modes.current; + pm_heredoc_lex_mode_t *heredoc_lex_mode = &lex_mode->as.heredoc.base; + + bool line_continuation = lex_mode->as.heredoc.line_continuation; + lex_mode->as.heredoc.line_continuation = false; + + // We'll check if we're at the end of the file. If we are, then we + // will add an error (because we weren't able to find the + // terminator) but still continue parsing so that content after the + // declaration of the heredoc can be parsed. + if (parser->current.end >= parser->end) { + pm_parser_err_heredoc_term(parser, heredoc_lex_mode->ident_start, heredoc_lex_mode->ident_length); + parser->next_start = lex_mode->as.heredoc.next_start; + parser->heredoc_end = parser->current.end; + lex_state_set(parser, PM_LEX_STATE_END); + lex_mode_pop(parser); + LEX(PM_TOKEN_HEREDOC_END); + } + + const uint8_t *ident_start = heredoc_lex_mode->ident_start; + size_t ident_length = heredoc_lex_mode->ident_length; + + // If we are immediately following a newline and we have hit the + // terminator, then we need to return the ending of the heredoc. 
+ if (current_token_starts_line(parser)) { + const uint8_t *start = parser->current.start; + + if (!line_continuation && (start + ident_length <= parser->end)) { + const uint8_t *newline = next_newline(start, parser->end - start); + const uint8_t *ident_end = newline; + const uint8_t *terminator_end = newline; + + if (newline == NULL) { + terminator_end = parser->end; + ident_end = parser->end; + } else { + terminator_end++; + if (newline[-1] == '\r') { + ident_end--; // Remove \r + } + } + + const uint8_t *terminator_start = ident_end - ident_length; + const uint8_t *cursor = start; + + if (heredoc_lex_mode->indent == PM_HEREDOC_INDENT_DASH || heredoc_lex_mode->indent == PM_HEREDOC_INDENT_TILDE) { + while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) { + cursor++; + } + } + + if ( + (cursor == terminator_start) && + (memcmp(terminator_start, ident_start, ident_length) == 0) + ) { + if (newline != NULL) { + pm_newline_list_append(&parser->newline_list, newline); + } + + parser->current.end = terminator_end; + if (*lex_mode->as.heredoc.next_start == '\\') { + parser->next_start = NULL; + } else { + parser->next_start = lex_mode->as.heredoc.next_start; + parser->heredoc_end = parser->current.end; + } + + lex_state_set(parser, PM_LEX_STATE_END); + lex_mode_pop(parser); + LEX(PM_TOKEN_HEREDOC_END); + } + } + + size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, heredoc_lex_mode->indent); + if ( + heredoc_lex_mode->indent == PM_HEREDOC_INDENT_TILDE && + lex_mode->as.heredoc.common_whitespace != NULL && + (*lex_mode->as.heredoc.common_whitespace > whitespace) && + peek_at(parser, start) != '\n' + ) { + *lex_mode->as.heredoc.common_whitespace = whitespace; + } + } + + // Otherwise we'll be parsing string content. These are the places + // where we need to split up the content of the heredoc. We'll use + // strpbrk to find the first of these characters. 
+ uint8_t breakpoints[] = "\r\n\\#"; + + pm_heredoc_quote_t quote = heredoc_lex_mode->quote; + if (quote == PM_HEREDOC_QUOTE_SINGLE) { + breakpoints[3] = '\0'; + } + + const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + pm_token_buffer_t token_buffer = { 0 }; + bool was_line_continuation = false; + + while (breakpoint != NULL) { + switch (*breakpoint) { + case '\0': + // Skip directly past the null character. + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + break; + case '\r': + parser->current.end = breakpoint + 1; + + if (peek_at(parser, breakpoint + 1) != '\n') { + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + break; + } + + // If we hit a \r\n sequence, then we want to replace it + // with a single \n character in the final string. + breakpoint++; + pm_token_buffer_escape(parser, &token_buffer); + token_buffer.cursor = breakpoint; + + PRISM_FALLTHROUGH + case '\n': { + if (parser->heredoc_end != NULL && (parser->heredoc_end > breakpoint)) { + parser_flush_heredoc_end(parser); + parser->current.end = breakpoint + 1; + pm_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + + pm_newline_list_append(&parser->newline_list, breakpoint); + + // If we have a - or ~ heredoc, then we can match after + // some leading whitespace. + const uint8_t *start = breakpoint + 1; + + if (!was_line_continuation && (start + ident_length <= parser->end)) { + // We want to match the terminator starting from the end of the line in case + // there is whitespace in the ident such as <<-' DOC' or <<~' DOC'. 
+ const uint8_t *newline = next_newline(start, parser->end - start); + + if (newline == NULL) { + newline = parser->end; + } else if (newline[-1] == '\r') { + newline--; // Remove \r + } + + // Start of a possible terminator. + const uint8_t *terminator_start = newline - ident_length; + + // Cursor to check for the leading whitespace. We skip the + // leading whitespace if we have a - or ~ heredoc. + const uint8_t *cursor = start; + + if (heredoc_lex_mode->indent == PM_HEREDOC_INDENT_DASH || heredoc_lex_mode->indent == PM_HEREDOC_INDENT_TILDE) { + while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) { + cursor++; + } + } + + if ( + cursor == terminator_start && + (memcmp(terminator_start, ident_start, ident_length) == 0) + ) { + parser->current.end = breakpoint + 1; + pm_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + } + + size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.base.indent); + + // If we have hit a newline that is followed by a valid + // terminator, then we need to return the content of the + // heredoc here as string content. Then, the next time a + // token is lexed, it will match again and return the + // end of the heredoc. + if (lex_mode->as.heredoc.base.indent == PM_HEREDOC_INDENT_TILDE) { + if ((lex_mode->as.heredoc.common_whitespace != NULL) && (*lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') { + *lex_mode->as.heredoc.common_whitespace = whitespace; + } + + parser->current.end = breakpoint + 1; + pm_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + + // Otherwise we hit a newline and it wasn't followed by + // a terminator, so we can continue parsing. 
+ parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + break; + } + case '\\': { + // If we hit an escape, then we need to skip past + // however many characters the escape takes up. However + // it's important that if \n or \r\n are escaped, we + // stop looping before the newline and not after the + // newline so that we can still potentially find the + // terminator of the heredoc. + parser->current.end = breakpoint + 1; + + // If we've hit the end of the file, then break out of + // the loop by setting the breakpoint to NULL. + if (parser->current.end == parser->end) { + breakpoint = NULL; + continue; + } + + pm_token_buffer_escape(parser, &token_buffer); + uint8_t peeked = peek(parser); + + if (quote == PM_HEREDOC_QUOTE_SINGLE) { + switch (peeked) { + case '\r': + parser->current.end++; + if (peek(parser) != '\n') { + pm_token_buffer_push_byte(&token_buffer, '\\'); + pm_token_buffer_push_byte(&token_buffer, '\r'); + break; + } + PRISM_FALLTHROUGH + case '\n': + pm_token_buffer_push_byte(&token_buffer, '\\'); + pm_token_buffer_push_byte(&token_buffer, '\n'); + token_buffer.cursor = parser->current.end + 1; + breakpoint = parser->current.end; + continue; + default: + pm_token_buffer_push_byte(&token_buffer, '\\'); + pm_token_buffer_push_escaped(&token_buffer, parser); + break; + } + } else { + switch (peeked) { + case '\r': + parser->current.end++; + if (peek(parser) != '\n') { + pm_token_buffer_push_byte(&token_buffer, '\r'); + break; + } + PRISM_FALLTHROUGH + case '\n': + // If we are in a tilde here, we should + // break out of the loop and return the + // string content. + if (heredoc_lex_mode->indent == PM_HEREDOC_INDENT_TILDE) { + const uint8_t *end = parser->current.end; + + if (parser->heredoc_end == NULL) { + pm_newline_list_append(&parser->newline_list, end); + } + + // Here we want the buffer to only + // include up to the backslash. 
+ parser->current.end = breakpoint; + pm_token_buffer_flush(parser, &token_buffer); + + // Now we can advance the end of the + // token past the newline. + parser->current.end = end + 1; + lex_mode->as.heredoc.line_continuation = true; + LEX(PM_TOKEN_STRING_CONTENT); + } + + was_line_continuation = true; + token_buffer.cursor = parser->current.end + 1; + breakpoint = parser->current.end; + continue; + default: + escape_read(parser, &token_buffer.buffer, NULL, PM_ESCAPE_FLAG_NONE); + break; + } + } + + token_buffer.cursor = parser->current.end; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + break; + } + case '#': { + pm_token_type_t type = lex_interpolation(parser, breakpoint); + + if (type == PM_TOKEN_NOT_PROVIDED) { + // If we haven't returned at this point then we had + // something that looked like an interpolated class + // or instance variable like "#@" but wasn't + // actually. In this case we'll just skip to the + // next breakpoint. + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); + break; + } + + if (type == PM_TOKEN_STRING_CONTENT) { + pm_token_buffer_flush(parser, &token_buffer); + } + + LEX(type); + } + default: + assert(false && "unreachable"); + } + + was_line_continuation = false; + } + + if (parser->current.end > parser->current.start) { + parser->current.end = parser->end; + pm_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + + // If we've hit the end of the string, then this is an unterminated + // heredoc. In that case we'll return a string content token. 
+ parser->current.end = parser->end; + pm_token_buffer_flush(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } + } + + assert(false && "unreachable"); +} + +#undef LEX + +/******************************************************************************/ +/* Parse functions */ +/******************************************************************************/ + +/** + * These are the various precedence rules. Because we are using a Pratt parser, + * they are named binding power to represent the manner in which nodes are bound + * together in the stack. + * + * We increment by 2 because we want to leave room for the infix operators to + * specify their associativity by adding or subtracting one. + */ +typedef enum { + PM_BINDING_POWER_UNSET = 0, // used to indicate this token cannot be used as an infix operator + PM_BINDING_POWER_STATEMENT = 2, + PM_BINDING_POWER_MODIFIER_RESCUE = 4, // rescue + PM_BINDING_POWER_MODIFIER = 6, // if unless until while + PM_BINDING_POWER_COMPOSITION = 8, // and or + PM_BINDING_POWER_NOT = 10, // not + PM_BINDING_POWER_MATCH = 12, // => in + PM_BINDING_POWER_DEFINED = 14, // defined? + PM_BINDING_POWER_MULTI_ASSIGNMENT = 16, // = + PM_BINDING_POWER_ASSIGNMENT = 18, // = += -= *= /= %= &= |= ^= &&= ||= <<= >>= **= + PM_BINDING_POWER_TERNARY = 20, // ?: + PM_BINDING_POWER_RANGE = 22, // .. ... + PM_BINDING_POWER_LOGICAL_OR = 24, // || + PM_BINDING_POWER_LOGICAL_AND = 26, // && + PM_BINDING_POWER_EQUALITY = 28, // <=> == === != =~ !~ + PM_BINDING_POWER_COMPARISON = 30, // > >= < <= + PM_BINDING_POWER_BITWISE_OR = 32, // | ^ + PM_BINDING_POWER_BITWISE_AND = 34, // & + PM_BINDING_POWER_SHIFT = 36, // << >> + PM_BINDING_POWER_TERM = 38, // + - + PM_BINDING_POWER_FACTOR = 40, // * / % + PM_BINDING_POWER_UMINUS = 42, // -@ + PM_BINDING_POWER_EXPONENT = 44, // ** + PM_BINDING_POWER_UNARY = 46, // ! ~ +@ + PM_BINDING_POWER_INDEX = 48, // [] []= + PM_BINDING_POWER_CALL = 50, // :: . 
+ PM_BINDING_POWER_MAX = 52 +} pm_binding_power_t; + +/** + * This struct represents a set of binding powers used for a given token. They + * are combined in this way to make it easier to represent associativity. + */ +typedef struct { + /** The left binding power. */ + pm_binding_power_t left; + + /** The right binding power. */ + pm_binding_power_t right; + + /** Whether or not this token can be used as a binary operator. */ + bool binary; + + /** + * Whether or not this token can be used as non-associative binary operator. + * Non-associative operators (e.g. in and =>) need special treatment in parse_expression. + */ + bool nonassoc; +} pm_binding_powers_t; + +#define BINDING_POWER_ASSIGNMENT { PM_BINDING_POWER_UNARY, PM_BINDING_POWER_ASSIGNMENT, true, false } +#define LEFT_ASSOCIATIVE(precedence) { precedence, precedence + 1, true, false } +#define RIGHT_ASSOCIATIVE(precedence) { precedence, precedence, true, false } +#define NON_ASSOCIATIVE(precedence) { precedence, precedence + 1, true, true } +#define RIGHT_ASSOCIATIVE_UNARY(precedence) { precedence, precedence, false, false } + +pm_binding_powers_t pm_binding_powers[PM_TOKEN_MAXIMUM] = { + // rescue + [PM_TOKEN_KEYWORD_RESCUE_MODIFIER] = { PM_BINDING_POWER_MODIFIER_RESCUE, PM_BINDING_POWER_COMPOSITION, true, false }, + + // if unless until while + [PM_TOKEN_KEYWORD_IF_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER), + [PM_TOKEN_KEYWORD_UNLESS_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER), + [PM_TOKEN_KEYWORD_UNTIL_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER), + [PM_TOKEN_KEYWORD_WHILE_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER), + + // and or + [PM_TOKEN_KEYWORD_AND] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_COMPOSITION), + [PM_TOKEN_KEYWORD_OR] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_COMPOSITION), + + // => in + [PM_TOKEN_EQUAL_GREATER] = NON_ASSOCIATIVE(PM_BINDING_POWER_MATCH), + [PM_TOKEN_KEYWORD_IN] = NON_ASSOCIATIVE(PM_BINDING_POWER_MATCH), + + // &&= &= ^= = >>= <<= -= 
%= |= ||= += /= *= **= + [PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL] = BINDING_POWER_ASSIGNMENT, + [PM_TOKEN_AMPERSAND_EQUAL] = BINDING_POWER_ASSIGNMENT, + [PM_TOKEN_CARET_EQUAL] = BINDING_POWER_ASSIGNMENT, + [PM_TOKEN_EQUAL] = BINDING_POWER_ASSIGNMENT, + [PM_TOKEN_GREATER_GREATER_EQUAL] = BINDING_POWER_ASSIGNMENT, + [PM_TOKEN_LESS_LESS_EQUAL] = BINDING_POWER_ASSIGNMENT, + [PM_TOKEN_MINUS_EQUAL] = BINDING_POWER_ASSIGNMENT, + [PM_TOKEN_PERCENT_EQUAL] = BINDING_POWER_ASSIGNMENT, + [PM_TOKEN_PIPE_EQUAL] = BINDING_POWER_ASSIGNMENT, + [PM_TOKEN_PIPE_PIPE_EQUAL] = BINDING_POWER_ASSIGNMENT, + [PM_TOKEN_PLUS_EQUAL] = BINDING_POWER_ASSIGNMENT, + [PM_TOKEN_SLASH_EQUAL] = BINDING_POWER_ASSIGNMENT, + [PM_TOKEN_STAR_EQUAL] = BINDING_POWER_ASSIGNMENT, + [PM_TOKEN_STAR_STAR_EQUAL] = BINDING_POWER_ASSIGNMENT, + + // ?: + [PM_TOKEN_QUESTION_MARK] = RIGHT_ASSOCIATIVE(PM_BINDING_POWER_TERNARY), + + // .. ... + [PM_TOKEN_DOT_DOT] = NON_ASSOCIATIVE(PM_BINDING_POWER_RANGE), + [PM_TOKEN_DOT_DOT_DOT] = NON_ASSOCIATIVE(PM_BINDING_POWER_RANGE), + [PM_TOKEN_UDOT_DOT] = RIGHT_ASSOCIATIVE_UNARY(PM_BINDING_POWER_LOGICAL_OR), + [PM_TOKEN_UDOT_DOT_DOT] = RIGHT_ASSOCIATIVE_UNARY(PM_BINDING_POWER_LOGICAL_OR), + + // || + [PM_TOKEN_PIPE_PIPE] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_LOGICAL_OR), + + // && + [PM_TOKEN_AMPERSAND_AMPERSAND] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_LOGICAL_AND), + + // != !~ == === =~ <=> + [PM_TOKEN_BANG_EQUAL] = NON_ASSOCIATIVE(PM_BINDING_POWER_EQUALITY), + [PM_TOKEN_BANG_TILDE] = NON_ASSOCIATIVE(PM_BINDING_POWER_EQUALITY), + [PM_TOKEN_EQUAL_EQUAL] = NON_ASSOCIATIVE(PM_BINDING_POWER_EQUALITY), + [PM_TOKEN_EQUAL_EQUAL_EQUAL] = NON_ASSOCIATIVE(PM_BINDING_POWER_EQUALITY), + [PM_TOKEN_EQUAL_TILDE] = NON_ASSOCIATIVE(PM_BINDING_POWER_EQUALITY), + [PM_TOKEN_LESS_EQUAL_GREATER] = NON_ASSOCIATIVE(PM_BINDING_POWER_EQUALITY), + + // > >= < <= + [PM_TOKEN_GREATER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_COMPARISON), + [PM_TOKEN_GREATER_EQUAL] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_COMPARISON), + 
[PM_TOKEN_LESS] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_COMPARISON), + [PM_TOKEN_LESS_EQUAL] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_COMPARISON), + + // ^ | + [PM_TOKEN_CARET] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_BITWISE_OR), + [PM_TOKEN_PIPE] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_BITWISE_OR), + + // & + [PM_TOKEN_AMPERSAND] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_BITWISE_AND), + + // >> << + [PM_TOKEN_GREATER_GREATER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_SHIFT), + [PM_TOKEN_LESS_LESS] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_SHIFT), + + // - + + [PM_TOKEN_MINUS] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_TERM), + [PM_TOKEN_PLUS] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_TERM), + + // % / * + [PM_TOKEN_PERCENT] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_FACTOR), + [PM_TOKEN_SLASH] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_FACTOR), + [PM_TOKEN_STAR] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_FACTOR), + [PM_TOKEN_USTAR] = RIGHT_ASSOCIATIVE_UNARY(PM_BINDING_POWER_FACTOR), + + // -@ + [PM_TOKEN_UMINUS] = RIGHT_ASSOCIATIVE_UNARY(PM_BINDING_POWER_UMINUS), + [PM_TOKEN_UMINUS_NUM] = { PM_BINDING_POWER_UMINUS, PM_BINDING_POWER_MAX, false, false }, + + // ** + [PM_TOKEN_STAR_STAR] = RIGHT_ASSOCIATIVE(PM_BINDING_POWER_EXPONENT), + [PM_TOKEN_USTAR_STAR] = RIGHT_ASSOCIATIVE_UNARY(PM_BINDING_POWER_UNARY), + + // ! ~ +@ + [PM_TOKEN_BANG] = RIGHT_ASSOCIATIVE_UNARY(PM_BINDING_POWER_UNARY), + [PM_TOKEN_TILDE] = RIGHT_ASSOCIATIVE_UNARY(PM_BINDING_POWER_UNARY), + [PM_TOKEN_UPLUS] = RIGHT_ASSOCIATIVE_UNARY(PM_BINDING_POWER_UNARY), + + // [ + [PM_TOKEN_BRACKET_LEFT] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_INDEX), + + // :: . &. + [PM_TOKEN_COLON_COLON] = RIGHT_ASSOCIATIVE(PM_BINDING_POWER_CALL), + [PM_TOKEN_DOT] = RIGHT_ASSOCIATIVE(PM_BINDING_POWER_CALL), + [PM_TOKEN_AMPERSAND_DOT] = RIGHT_ASSOCIATIVE(PM_BINDING_POWER_CALL) +}; + +#undef BINDING_POWER_ASSIGNMENT +#undef LEFT_ASSOCIATIVE +#undef RIGHT_ASSOCIATIVE +#undef RIGHT_ASSOCIATIVE_UNARY + +/** + * Returns true if the current token is of the given type. 
+ */ +static inline bool +match1(const pm_parser_t *parser, pm_token_type_t type) { + return parser->current.type == type; +} + +/** + * Returns true if the current token is of either of the given types. + */ +static inline bool +match2(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2) { + return match1(parser, type1) || match1(parser, type2); +} + +/** + * Returns true if the current token is any of the three given types. + */ +static inline bool +match3(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3) { + return match1(parser, type1) || match1(parser, type2) || match1(parser, type3); +} + +/** + * Returns true if the current token is any of the four given types. + */ +static inline bool +match4(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4) { + return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4); +} + +/** + * Returns true if the current token is any of the seven given types. + */ +static inline bool +match7(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4, pm_token_type_t type5, pm_token_type_t type6, pm_token_type_t type7) { + return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5) || match1(parser, type6) || match1(parser, type7); +} + +/** + * Returns true if the current token is any of the eight given types. 
+ */ +static inline bool +match8(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4, pm_token_type_t type5, pm_token_type_t type6, pm_token_type_t type7, pm_token_type_t type8) { + return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5) || match1(parser, type6) || match1(parser, type7) || match1(parser, type8); +} + +/** + * If the current token is of the specified type, lex forward by one token and + * return true. Otherwise, return false. For example: + * + * if (accept1(parser, PM_TOKEN_COLON)) { ... } + */ +static bool +accept1(pm_parser_t *parser, pm_token_type_t type) { + if (match1(parser, type)) { + parser_lex(parser); + return true; + } + return false; +} + +/** + * If the current token is either of the two given types, lex forward by one + * token and return true. Otherwise return false. + */ +static inline bool +accept2(pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2) { + if (match2(parser, type1, type2)) { + parser_lex(parser); + return true; + } + return false; +} + +/** + * This function indicates that the parser expects a token in a specific + * position. For example, if you're parsing a BEGIN block, you know that a { is + * expected immediately after the keyword. In that case you would call this + * function to indicate that that token should be found. + * + * If we didn't find the token that we were expecting, then we're going to add + * an error to the parser's list of errors (to indicate that the tree is not + * valid) and create an artificial token instead. This allows us to recover from + * the fact that the token isn't present and continue parsing. 
+ */ +static void +expect1(pm_parser_t *parser, pm_token_type_t type, pm_diagnostic_id_t diag_id) { + if (accept1(parser, type)) return; + + const uint8_t *location = parser->previous.end; + pm_parser_err(parser, location, location, diag_id); + + parser->previous.start = location; + parser->previous.type = PM_TOKEN_MISSING; +} + +/** + * This function is the same as expect1, but it expects either of two token + * types. + */ +static void +expect2(pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_diagnostic_id_t diag_id) { + if (accept2(parser, type1, type2)) return; + + const uint8_t *location = parser->previous.end; + pm_parser_err(parser, location, location, diag_id); + + parser->previous.start = location; + parser->previous.type = PM_TOKEN_MISSING; +} + +/** + * A special expect1 that expects a heredoc terminator and handles popping the + * lex mode accordingly. + */ +static void +expect1_heredoc_term(pm_parser_t *parser, const uint8_t *ident_start, size_t ident_length) { + if (match1(parser, PM_TOKEN_HEREDOC_END)) { + parser_lex(parser); + } else { + pm_parser_err_heredoc_term(parser, ident_start, ident_length); + parser->previous.start = parser->previous.end; + parser->previous.type = PM_TOKEN_MISSING; + } +} + +/** + * A special expect1 that attaches the error to the opening token location + * rather than the current position. This is useful for errors about missing + * closing tokens, where we want to point to the line with the opening token + * (e.g., `def`, `class`, `if`, `{`) rather than the end of the file. 
+ */ +static void +expect1_opening(pm_parser_t *parser, pm_token_type_t type, pm_diagnostic_id_t diag_id, const pm_token_t *opening) { + if (accept1(parser, type)) return; + + pm_parser_err(parser, opening->start, opening->end, diag_id); + + parser->previous.start = parser->previous.end; + parser->previous.type = PM_TOKEN_MISSING; +} + +static pm_node_t * +parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, bool accepts_label, pm_diagnostic_id_t diag_id, uint16_t depth); + +/** + * This is a wrapper of parse_expression, which also checks whether the + * resulting node is a value expression. + */ +static pm_node_t * +parse_value_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, bool accepts_label, pm_diagnostic_id_t diag_id, uint16_t depth) { + pm_node_t *node = parse_expression(parser, binding_power, accepts_command_call, accepts_label, diag_id, depth); + pm_assert_value_expression(parser, node); + return node; +} + +/** + * This function controls whether or not we will attempt to parse an expression + * beginning at the subsequent token. It is used when we are in a context where + * an expression is optional. + * + * For example, looking at a range object when we've already lexed the operator, + * we need to know if we should attempt to parse an expression on the right. + * + * For another example, if we've parsed an identifier or a method call and we do + * not have parentheses, then the next token may be the start of an argument or + * it may not. + * + * CRuby parsers that are generated would resolve this by using a lookahead and + * potentially backtracking. We attempt to do this by just looking at the next + * token and making a decision based on that. I am not sure if this is going to + * work in all cases, it may need to be refactored later. But it appears to work + * for now. 
 */
static inline bool
token_begins_expression_p(pm_token_type_t type) {
    switch (type) {
        case PM_TOKEN_EQUAL_GREATER:
        case PM_TOKEN_KEYWORD_IN:
            // We need to special case this because it is a binary operator that
            // should not be marked as beginning an expression.
            return false;
        case PM_TOKEN_BRACE_RIGHT:
        case PM_TOKEN_BRACKET_RIGHT:
        case PM_TOKEN_COLON:
        case PM_TOKEN_COMMA:
        case PM_TOKEN_EMBEXPR_END:
        case PM_TOKEN_EOF:
        case PM_TOKEN_LAMBDA_BEGIN:
        case PM_TOKEN_KEYWORD_DO:
        case PM_TOKEN_KEYWORD_DO_LOOP:
        case PM_TOKEN_KEYWORD_END:
        case PM_TOKEN_KEYWORD_ELSE:
        case PM_TOKEN_KEYWORD_ELSIF:
        case PM_TOKEN_KEYWORD_ENSURE:
        case PM_TOKEN_KEYWORD_THEN:
        case PM_TOKEN_KEYWORD_RESCUE:
        case PM_TOKEN_KEYWORD_WHEN:
        case PM_TOKEN_NEWLINE:
        case PM_TOKEN_PARENTHESIS_RIGHT:
        case PM_TOKEN_SEMICOLON:
            // The reason we need this short-circuit is because we're using the
            // binding powers table to tell us if the subsequent token could
            // potentially be the start of an expression. If there _is_ a binding
            // power for one of these tokens, then we should remove it from this list
            // and let it be handled by the default case below.
            assert(pm_binding_powers[type].left == PM_BINDING_POWER_UNSET);
            return false;
        case PM_TOKEN_UAMPERSAND:
            // This is a special case because this unary operator cannot appear
            // as a general operator, it only appears in certain circumstances.
            return false;
        case PM_TOKEN_UCOLON_COLON:
        case PM_TOKEN_UMINUS:
        case PM_TOKEN_UMINUS_NUM:
        case PM_TOKEN_UPLUS:
        case PM_TOKEN_BANG:
        case PM_TOKEN_TILDE:
        case PM_TOKEN_UDOT_DOT:
        case PM_TOKEN_UDOT_DOT_DOT:
            // These unary tokens actually do have binding power associated with them
            // so that we can correctly place them into the precedence order. But we
            // want them to be marked as beginning an expression, so we need to
            // special case them here.
            return true;
        default:
            // Every other token begins an expression exactly when it has no
            // left binding power in the table.
            return pm_binding_powers[type].left == PM_BINDING_POWER_UNSET;
    }
}

/**
 * Parse an expression with the given binding power that may be optionally
 * prefixed by the * operator.
 *
 * When a leading * is accepted, the operand is parsed with command calls and
 * labels disabled and wrapped in a splat node. Otherwise the expression is
 * parsed normally, using the caller's accepts_command_call and diag_id.
 */
static pm_node_t *
parse_starred_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, pm_diagnostic_id_t diag_id, uint16_t depth) {
    if (accept1(parser, PM_TOKEN_USTAR)) {
        pm_token_t operator = parser->previous;
        pm_node_t *expression = parse_value_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1));
        return UP(pm_splat_node_create(parser, &operator, expression));
    }

    return parse_value_expression(parser, binding_power, accepts_command_call, false, diag_id, depth);
}

/**
 * Visitor callback handed to pm_visit_node by pm_node_unreference. The data
 * pointer is the parser. For each visited node it drops the node from the
 * parser-owned lists that may still point at it (the current block exits and
 * the current scope's implicit parameters).
 *
 * NOTE(review): the boolean result is presumably how pm_visit_node decides
 * whether to descend into the node's children (true = keep walking) — confirm
 * against pm_visit_node.
 */
static bool
pm_node_unreference_each(const pm_node_t *node, void *data) {
    switch (PM_NODE_TYPE(node)) {
        /* When we are about to destroy a set of nodes that could potentially
         * contain block exits for the current scope, we need to check if they
         * are contained in the list of block exits and remove them if they are.
         */
        case PM_BREAK_NODE:
        case PM_NEXT_NODE:
        case PM_REDO_NODE: {
            pm_parser_t *parser = (pm_parser_t *) data;
            size_t index = 0;

            while (index < parser->current_block_exits->size) {
                pm_node_t *block_exit = parser->current_block_exits->nodes[index];

                if (block_exit == node) {
                    // Close the gap left by the removed entry, unless it was
                    // the last entry in the list.
                    if (index + 1 < parser->current_block_exits->size) {
                        memmove(
                            &parser->current_block_exits->nodes[index],
                            &parser->current_block_exits->nodes[index + 1],
                            (parser->current_block_exits->size - index - 1) * sizeof(pm_node_t *)
                        );
                    }
                    parser->current_block_exits->size--;

                    /* Note returning true here because these nodes could have
                     * arguments that are themselves block exits. */
                    return true;
                }

                index++;
            }

            return true;
        }
        /* When an implicit local variable is written to or targeted, it becomes
         * a regular, named local variable. This branch removes it from the list
         * of implicit parameters when that happens. */
        case PM_LOCAL_VARIABLE_READ_NODE:
        case PM_IT_LOCAL_VARIABLE_READ_NODE: {
            pm_parser_t *parser = (pm_parser_t *) data;
            pm_node_list_t *implicit_parameters = &parser->current_scope->implicit_parameters;

            for (size_t index = 0; index < implicit_parameters->size; index++) {
                if (implicit_parameters->nodes[index] == node) {
                    /* If the node is not the last one in the list, we need to
                     * shift the remaining nodes down to fill the gap. This is
                     * extremely unlikely to happen. */
                    if (index != implicit_parameters->size - 1) {
                        memmove(&implicit_parameters->nodes[index], &implicit_parameters->nodes[index + 1], (implicit_parameters->size - index - 1) * sizeof(pm_node_t *));
                    }

                    implicit_parameters->size--;
                    break;
                }
            }

            return false;
        }
        default:
            return true;
    }
}

/**
 * When we are about to destroy a set of nodes that could potentially be
 * referenced by one or more lists on the parser, then remove them from those
 * lists so we don't get a use-after-free.
 */
static void
pm_node_unreference(pm_parser_t *parser, const pm_node_t *node) {
    pm_visit_node(node, pm_node_unreference_each, parser);
}

/**
 * Convert the name of a method into the corresponding write method name. For
 * example, foo would be turned into foo=.
 *
 * The constant id pointed to by name_field is replaced in place. If the
 * allocation for the new name fails, name_field is left untouched.
 */
static void
parse_write_name(pm_parser_t *parser, pm_constant_id_t *name_field) {
    // The method name needs to change. If we previously had
    // foo, we now need foo=. In this case we'll allocate a new
    // owned string, copy the previous method name in, and
    // append an =.
    pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, *name_field);
    size_t length = constant->length;
    uint8_t *name = xcalloc(length + 1, sizeof(uint8_t));
    if (name == NULL) return;

    memcpy(name, constant->start, length);
    name[length] = '=';

    // Now switch the name to the new string.
    // This silences clang analyzer warning about leak of memory pointed by `name`.
    // NOLINTNEXTLINE(clang-analyzer-*)
    *name_field = pm_constant_pool_insert_owned(&parser->constant_pool, name, length + 1);
}

/**
 * Certain expressions are not targetable, but in order to provide a better
 * experience we give a specific error message. In order to maintain as much
 * information in the tree as possible, we replace them with local variable
 * writes.
 *
 * The original target node is destroyed; the returned node is a new local
 * variable target node covering the same source location.
 */
static pm_node_t *
parse_unwriteable_target(pm_parser_t *parser, pm_node_t *target) {
    switch (PM_NODE_TYPE(target)) {
        case PM_SOURCE_ENCODING_NODE: pm_parser_err_node(parser, target, PM_ERR_EXPRESSION_NOT_WRITABLE_ENCODING); break;
        case PM_FALSE_NODE: pm_parser_err_node(parser, target, PM_ERR_EXPRESSION_NOT_WRITABLE_FALSE); break;
        case PM_SOURCE_FILE_NODE: pm_parser_err_node(parser, target, PM_ERR_EXPRESSION_NOT_WRITABLE_FILE); break;
        case PM_SOURCE_LINE_NODE: pm_parser_err_node(parser, target, PM_ERR_EXPRESSION_NOT_WRITABLE_LINE); break;
        case PM_NIL_NODE: pm_parser_err_node(parser, target, PM_ERR_EXPRESSION_NOT_WRITABLE_NIL); break;
        case PM_SELF_NODE: pm_parser_err_node(parser, target, PM_ERR_EXPRESSION_NOT_WRITABLE_SELF); break;
        case PM_TRUE_NODE: pm_parser_err_node(parser, target, PM_ERR_EXPRESSION_NOT_WRITABLE_TRUE); break;
        default: break;
    }

    pm_constant_id_t name = pm_parser_constant_id_location(parser, target->location.start, target->location.end);
    pm_local_variable_target_node_t *result = pm_local_variable_target_node_create(parser, &target->location, name, 0);

    pm_node_destroy(parser, target);
    return UP(result);
}

/**
 * Convert the given node into a valid target node.
 *
 * @param multiple Whether or not this target is part of a larger set of
 *   targets. If it is, then the &. operator is not allowed.
 * @param splat_parent Whether or not this target is a child of a splat target.
 *   If it is, then fewer patterns are allowed.
 */
static pm_node_t *
parse_target(pm_parser_t *parser, pm_node_t *target, bool multiple, bool splat_parent) {
    switch (PM_NODE_TYPE(target)) {
        case PM_MISSING_NODE:
            return target;
        case PM_SOURCE_ENCODING_NODE:
        case PM_FALSE_NODE:
        case PM_SOURCE_FILE_NODE:
        case PM_SOURCE_LINE_NODE:
        case PM_NIL_NODE:
        case PM_SELF_NODE:
        case PM_TRUE_NODE: {
            // In these special cases, we have specific error messages and we
            // will replace them with local variable writes.
            return parse_unwriteable_target(parser, target);
        }
        case PM_CLASS_VARIABLE_READ_NODE:
            // The read and target structs are asserted to have the same size,
            // so the node is retagged in place instead of being reallocated.
            // The same pattern is used for the other variable kinds below.
            assert(sizeof(pm_class_variable_target_node_t) == sizeof(pm_class_variable_read_node_t));
            target->type = PM_CLASS_VARIABLE_TARGET_NODE;
            return target;
        case PM_CONSTANT_PATH_NODE:
            if (context_def_p(parser)) {
                pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_IN_METHOD);
            }

            assert(sizeof(pm_constant_path_target_node_t) == sizeof(pm_constant_path_node_t));
            target->type = PM_CONSTANT_PATH_TARGET_NODE;

            return target;
        case PM_CONSTANT_READ_NODE:
            if (context_def_p(parser)) {
                pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_IN_METHOD);
            }

            assert(sizeof(pm_constant_target_node_t) == sizeof(pm_constant_read_node_t));
            target->type = PM_CONSTANT_TARGET_NODE;

            return target;
        case PM_BACK_REFERENCE_READ_NODE:
        case PM_NUMBERED_REFERENCE_READ_NODE:
            // These are reported as read-only and returned unchanged.
            PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, target, PM_ERR_WRITE_TARGET_READONLY);
            return target;
        case PM_GLOBAL_VARIABLE_READ_NODE:
            assert(sizeof(pm_global_variable_target_node_t) == sizeof(pm_global_variable_read_node_t));
            target->type = PM_GLOBAL_VARIABLE_TARGET_NODE;
            return target;
        case PM_LOCAL_VARIABLE_READ_NODE: {
            if (pm_token_is_numbered_parameter(target->location.start, target->location.end)) {
                PM_PARSER_ERR_FORMAT(parser, target->location.start, target->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, target->location.start);
                pm_node_unreference(parser, target);
            }

            const pm_local_variable_read_node_t *cast = (const pm_local_variable_read_node_t *) target;
            uint32_t name = cast->name;
            uint32_t depth = cast->depth;
            pm_locals_unread(&pm_parser_scope_find(parser, depth)->locals, name);

            assert(sizeof(pm_local_variable_target_node_t) == sizeof(pm_local_variable_read_node_t));
            target->type = PM_LOCAL_VARIABLE_TARGET_NODE;

            return target;
        }
        case PM_IT_LOCAL_VARIABLE_READ_NODE: {
            // `it` becomes a regular local named "it"; the read node is
            // unreferenced from the parser lists and then destroyed.
            pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2);
            pm_node_t *node = UP(pm_local_variable_target_node_create(parser, &target->location, name, 0));

            pm_node_unreference(parser, target);
            pm_node_destroy(parser, target);

            return node;
        }
        case PM_INSTANCE_VARIABLE_READ_NODE:
            assert(sizeof(pm_instance_variable_target_node_t) == sizeof(pm_instance_variable_read_node_t));
            target->type = PM_INSTANCE_VARIABLE_TARGET_NODE;
            return target;
        case PM_MULTI_TARGET_NODE:
            if (splat_parent) {
                // Multi target is not accepted in all positions. If this is one
                // of them, then we need to add an error.
                pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_UNEXPECTED);
            }

            return target;
        case PM_SPLAT_NODE: {
            pm_splat_node_t *splat = (pm_splat_node_t *) target;

            // Convert the splatted expression itself, flagging it as the child
            // of a splat.
            if (splat->expression != NULL) {
                splat->expression = parse_target(parser, splat->expression, multiple, true);
            }

            return UP(splat);
        }
        case PM_CALL_NODE: {
            pm_call_node_t *call = (pm_call_node_t *) target;

            // If we have no arguments to the call node and we need this to be a
            // target then this is either a method call or a local variable
            // write.
            if (
                (call->message_loc.start != NULL) &&
                (call->message_loc.end[-1] != '!') &&
                (call->message_loc.end[-1] != '?') &&
                (call->opening_loc.start == NULL) &&
                (call->arguments == NULL) &&
                (call->block == NULL)
            ) {
                if (call->receiver == NULL) {
                    // When we get here, we have a local variable write, because it
                    // was previously marked as a method call but now we have an =.
                    // This looks like:
                    //
                    //     foo = 1
                    //
                    // When it was parsed in the prefix position, foo was seen as a
                    // method call with no receiver and no arguments. Now we have an
                    // =, so we know it's a local variable write.
                    const pm_location_t message_loc = call->message_loc;

                    pm_constant_id_t name = pm_parser_local_add_location(parser, message_loc.start, message_loc.end, 0);
                    pm_node_destroy(parser, target);

                    return UP(pm_local_variable_target_node_create(parser, &message_loc, name, 0));
                }

                if (peek_at(parser, call->message_loc.start) == '_' || parser->encoding->alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) {
                    if (multiple && PM_NODE_FLAG_P(call, PM_CALL_NODE_FLAGS_SAFE_NAVIGATION)) {
                        pm_parser_err_node(parser, (const pm_node_t *) call, PM_ERR_UNEXPECTED_SAFE_NAVIGATION);
                    }

                    parse_write_name(parser, &call->name);
                    return UP(pm_call_target_node_create(parser, call));
                }
            }

            // If there is no call operator and the message is "[]" then this is
            // an aref expression, and we can transform it into an aset
            // expression.
            if (PM_NODE_FLAG_P(call, PM_CALL_NODE_FLAGS_INDEX)) {
                return UP(pm_index_target_node_create(parser, call));
            }
        }
        PRISM_FALLTHROUGH
        default:
            // In this case we have a node that we don't know how to convert
            // into a target. We need to treat it as an error. For now, we'll
            // mark it as an error and just skip right past it.
            pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_UNEXPECTED);
            return target;
    }
}

/**
 * Parse a write target and validate that it is in a valid position for
 * assignment.
 */
static pm_node_t *
parse_target_validate(pm_parser_t *parser, pm_node_t *target, bool multiple) {
    pm_node_t *result = parse_target(parser, target, multiple, false);

    // Ensure that we have one of an =, an 'in' in for indexes, and a ')' in
    // parens after the targets.
    if (
        !match1(parser, PM_TOKEN_EQUAL) &&
        !(context_p(parser, PM_CONTEXT_FOR_INDEX) && match1(parser, PM_TOKEN_KEYWORD_IN)) &&
        !(context_p(parser, PM_CONTEXT_PARENS) && match1(parser, PM_TOKEN_PARENTHESIS_RIGHT))
    ) {
        pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED);
    }

    return result;
}

/**
 * Potentially wrap a constant write node in a shareable constant node depending
 * on the current state.
 *
 * The write node is returned unchanged when the scope's shareable-constant
 * setting is PM_SCOPE_SHAREABLE_CONSTANT_NONE.
 */
static pm_node_t *
parse_shareable_constant_write(pm_parser_t *parser, pm_node_t *write) {
    pm_shareable_constant_value_t shareable_constant = pm_parser_scope_shareable_constant_get(parser);

    if (shareable_constant != PM_SCOPE_SHAREABLE_CONSTANT_NONE) {
        return UP(pm_shareable_constant_node_create(parser, write, shareable_constant));
    }

    return write;
}

/**
 * Convert the given node into a valid write node.
 *
 * @param target The already-parsed left-hand side. Depending on its type it is
 *   either destroyed and replaced by a new write node, or mutated in place
 *   (call nodes) and returned.
 * @param operator The assignment operator token.
 * @param value The already-parsed right-hand side. It is destroyed when it
 *   cannot be attached to the tree (missing target, unsupported call shape).
 */
static pm_node_t *
parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_node_t *value) {
    switch (PM_NODE_TYPE(target)) {
        case PM_MISSING_NODE:
            pm_node_destroy(parser, value);
            return target;
        case PM_CLASS_VARIABLE_READ_NODE: {
            pm_class_variable_write_node_t *node = pm_class_variable_write_node_create(parser, (pm_class_variable_read_node_t *) target, operator, value);
            pm_node_destroy(parser, target);
            return UP(node);
        }
        case PM_CONSTANT_PATH_NODE: {
            // NOTE(review): unlike the sibling cases, target is not destroyed
            // here — presumably the write node keeps the constant path as a
            // child; confirm against pm_constant_path_write_node_create.
            pm_node_t *node = UP(pm_constant_path_write_node_create(parser, (pm_constant_path_node_t *) target, operator, value));

            if (context_def_p(parser)) {
                pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_IN_METHOD);
            }

            return parse_shareable_constant_write(parser, node);
        }
        case PM_CONSTANT_READ_NODE: {
            pm_node_t *node = UP(pm_constant_write_node_create(parser, (pm_constant_read_node_t *) target, operator, value));

            if (context_def_p(parser)) {
                pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_IN_METHOD);
            }

            pm_node_destroy(parser, target);
            return parse_shareable_constant_write(parser, node);
        }
        case PM_BACK_REFERENCE_READ_NODE:
        case PM_NUMBERED_REFERENCE_READ_NODE:
            // Report the read-only error, then still build a global variable
            // write node so the value stays in the tree.
            PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, target, PM_ERR_WRITE_TARGET_READONLY);
            PRISM_FALLTHROUGH
        case PM_GLOBAL_VARIABLE_READ_NODE: {
            pm_global_variable_write_node_t *node = pm_global_variable_write_node_create(parser, target, operator, value);
            pm_node_destroy(parser, target);
            return UP(node);
        }
        case PM_LOCAL_VARIABLE_READ_NODE: {
            pm_local_variable_read_node_t *local_read = (pm_local_variable_read_node_t *) target;

            // Copy everything needed out of the read node before it is
            // destroyed below.
            pm_constant_id_t name = local_read->name;
            pm_location_t name_loc = target->location;

            uint32_t depth = local_read->depth;
            pm_scope_t *scope = pm_parser_scope_find(parser, depth);

            if (pm_token_is_numbered_parameter(target->location.start, target->location.end)) {
                pm_diagnostic_id_t diag_id = (scope->parameters & PM_SCOPE_PARAMETERS_NUMBERED_FOUND) ? PM_ERR_EXPRESSION_NOT_WRITABLE_NUMBERED : PM_ERR_PARAMETER_NUMBERED_RESERVED;
                PM_PARSER_ERR_FORMAT(parser, target->location.start, target->location.end, diag_id, target->location.start);
                pm_node_unreference(parser, target);
            }

            pm_locals_unread(&scope->locals, name);
            pm_node_destroy(parser, target);

            return UP(pm_local_variable_write_node_create(parser, name, depth, value, &name_loc, operator));
        }
        case PM_IT_LOCAL_VARIABLE_READ_NODE: {
            pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2);
            pm_node_t *node = UP(pm_local_variable_write_node_create(parser, name, 0, value, &target->location, operator));

            pm_node_unreference(parser, target);
            pm_node_destroy(parser, target);

            return node;
        }
        case PM_INSTANCE_VARIABLE_READ_NODE: {
            pm_node_t *write_node = UP(pm_instance_variable_write_node_create(parser, (pm_instance_variable_read_node_t *) target, operator, value));
            pm_node_destroy(parser, target);
            return write_node;
        }
        case PM_MULTI_TARGET_NODE:
            return UP(pm_multi_write_node_create(parser, (pm_multi_target_node_t *) target, operator, value));
        case PM_SPLAT_NODE: {
            pm_splat_node_t *splat = (pm_splat_node_t *) target;

            // NOTE(review): `value` is passed both to the nested parse_write
            // and to pm_multi_write_node_create below — confirm this sharing
            // is intended.
            if (splat->expression != NULL) {
                splat->expression = parse_write(parser, splat->expression, operator, value);
            }

            pm_multi_target_node_t *multi_target = pm_multi_target_node_create(parser);
            pm_multi_target_node_targets_append(parser, multi_target, UP(splat));

            return UP(pm_multi_write_node_create(parser, multi_target, operator, value));
        }
        case PM_CALL_NODE: {
            pm_call_node_t *call = (pm_call_node_t *) target;

            // If we have no arguments to the call node and we need this to be a
            // target then this is either a method call or a local variable
            // write.
            if (
                (call->message_loc.start != NULL) &&
                (call->message_loc.end[-1] != '!') &&
                (call->message_loc.end[-1] != '?') &&
                (call->opening_loc.start == NULL) &&
                (call->arguments == NULL) &&
                (call->block == NULL)
            ) {
                if (call->receiver == NULL) {
                    // When we get here, we have a local variable write, because it
                    // was previously marked as a method call but now we have an =.
                    // This looks like:
                    //
                    //     foo = 1
                    //
                    // When it was parsed in the prefix position, foo was seen as a
                    // method call with no receiver and no arguments. Now we have an
                    // =, so we know it's a local variable write.
                    const pm_location_t message = call->message_loc;

                    pm_parser_local_add_location(parser, message.start, message.end, 0);
                    pm_node_destroy(parser, target);

                    pm_constant_id_t constant_id = pm_parser_constant_id_location(parser, message.start, message.end);
                    target = UP(pm_local_variable_write_node_create(parser, constant_id, 0, value, &message, operator));

                    pm_refute_numbered_parameter(parser, message.start, message.end);
                    return target;
                }

                if (char_is_identifier_start(parser, call->message_loc.start, parser->end - call->message_loc.start)) {
                    // When we get here, we have a method call, because it was
                    // previously marked as a method call but now we have an =. This
                    // looks like:
                    //
                    //     foo.bar = 1
                    //
                    // When it was parsed in the prefix position, foo.bar was seen as a
                    // method call with no arguments. Now we have an =, so we know it's
                    // a method call with an argument. In this case we will create the
                    // arguments node, parse the argument, and add it to the list.
                    pm_arguments_node_t *arguments = pm_arguments_node_create(parser);
                    call->arguments = arguments;

                    pm_arguments_node_arguments_append(arguments, value);
                    call->base.location.end = arguments->base.location.end;
                    call->equal_loc = PM_LOCATION_TOKEN_VALUE(operator);

                    parse_write_name(parser, &call->name);
                    pm_node_flag_set(UP(call), PM_CALL_NODE_FLAGS_ATTRIBUTE_WRITE | pm_implicit_array_write_flags(value, PM_CALL_NODE_FLAGS_IMPLICIT_ARRAY));

                    return UP(call);
                }
            }

            // If there is no call operator and the message is "[]" then this is
            // an aref expression, and we can transform it into an aset
            // expression.
            if (PM_NODE_FLAG_P(call, PM_CALL_NODE_FLAGS_INDEX)) {
                if (call->arguments == NULL) {
                    call->arguments = pm_arguments_node_create(parser);
                }

                pm_arguments_node_arguments_append(call->arguments, value);
                target->location.end = value->location.end;

                // Replace the name with "[]=".
                call->name = pm_parser_constant_id_constant(parser, "[]=", 3);
                call->equal_loc = PM_LOCATION_TOKEN_VALUE(operator);

                // Ensure that the arguments for []= don't contain keywords
                pm_index_arguments_check(parser, call->arguments, call->block);
                pm_node_flag_set(UP(call), PM_CALL_NODE_FLAGS_ATTRIBUTE_WRITE | pm_implicit_array_write_flags(value, PM_CALL_NODE_FLAGS_IMPLICIT_ARRAY));

                return target;
            }

            // If there are arguments on the call node, then it can't be a
            // method call ending with = or a local variable write, so it must
            // be a syntax error. In this case we'll fall through to our default
            // handling. We need to free the value that we parsed because there
            // is no way for us to attach it to the tree at this point.
            //
            // Since it is possible for the value to contain an implicit
            // parameter somewhere in its subtree, we need to walk it and remove
            // any implicit parameters from the list of implicit parameters for
            // the current scope.
            pm_node_unreference(parser, value);
            pm_node_destroy(parser, value);
        }
        PRISM_FALLTHROUGH
        default:
            // In this case we have a node that we don't know how to convert into a
            // target. We need to treat it as an error. For now, we'll mark it as an
            // error and just skip right past it.
            pm_parser_err_token(parser, operator, PM_ERR_WRITE_TARGET_UNEXPECTED);
            return target;
    }
}

/**
 * Certain expressions are not writable, but in order to provide a better
 * experience we give a specific error message. In order to maintain as much
 * information in the tree as possible, we replace them with local variable
 * writes.
 *
 * The error is attached to the `equals` token. The original target node is
 * destroyed and a local variable write node holding `value` is returned.
 */
static pm_node_t *
parse_unwriteable_write(pm_parser_t *parser, pm_node_t *target, const pm_token_t *equals, pm_node_t *value) {
    switch (PM_NODE_TYPE(target)) {
        case PM_SOURCE_ENCODING_NODE: pm_parser_err_token(parser, equals, PM_ERR_EXPRESSION_NOT_WRITABLE_ENCODING); break;
        case PM_FALSE_NODE: pm_parser_err_token(parser, equals, PM_ERR_EXPRESSION_NOT_WRITABLE_FALSE); break;
        case PM_SOURCE_FILE_NODE: pm_parser_err_token(parser, equals, PM_ERR_EXPRESSION_NOT_WRITABLE_FILE); break;
        case PM_SOURCE_LINE_NODE: pm_parser_err_token(parser, equals, PM_ERR_EXPRESSION_NOT_WRITABLE_LINE); break;
        case PM_NIL_NODE: pm_parser_err_token(parser, equals, PM_ERR_EXPRESSION_NOT_WRITABLE_NIL); break;
        case PM_SELF_NODE: pm_parser_err_token(parser, equals, PM_ERR_EXPRESSION_NOT_WRITABLE_SELF); break;
        case PM_TRUE_NODE: pm_parser_err_token(parser, equals, PM_ERR_EXPRESSION_NOT_WRITABLE_TRUE); break;
        default: break;
    }

    pm_constant_id_t name = pm_parser_local_add_location(parser, target->location.start, target->location.end, 1);
    pm_local_variable_write_node_t *result = pm_local_variable_write_node_create(parser, name, 0, value, &target->location, equals);

    pm_node_destroy(parser, target);
    return UP(result);
}

/**
 * Parse a list of targets for assignment. This is used in the case of a for
 * loop or a multi-assignment. For example, in the following code:
 *
 *     for foo, bar in baz
 *         ^^^^^^^^
 *
 * The targets are `foo` and `bar`. As written, this function always returns
 * the multi-target node it builds, with first_target as its first entry.
 */
static pm_node_t *
parse_targets(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t binding_power, uint16_t depth) {
    // Tracks whether a splat target has been seen, so that a second one can be
    // reported as an error.
    bool has_rest = PM_NODE_TYPE_P(first_target, PM_SPLAT_NODE);

    pm_multi_target_node_t *result = pm_multi_target_node_create(parser);
    pm_multi_target_node_targets_append(parser, result, parse_target(parser, first_target, true, false));

    while (accept1(parser, PM_TOKEN_COMMA)) {
        if (accept1(parser, PM_TOKEN_USTAR)) {
            // Here we have a splat operator. It can have a name or be
            // anonymous. It can be the final target or be in the middle if
            // there haven't been any others yet.
            if (has_rest) {
                pm_parser_err_previous(parser, PM_ERR_MULTI_ASSIGN_MULTI_SPLATS);
            }

            pm_token_t star_operator = parser->previous;
            pm_node_t *name = NULL;

            if (token_begins_expression_p(parser->current.type)) {
                name = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1));
                name = parse_target(parser, name, true, true);
            }

            pm_node_t *splat = UP(pm_splat_node_create(parser, &star_operator, name));
            pm_multi_target_node_targets_append(parser, result, splat);
            has_rest = true;
        } else if (match1(parser, PM_TOKEN_PARENTHESIS_LEFT)) {
            // A parenthesized target is parsed inside a MULTI_TARGET context.
            context_push(parser, PM_CONTEXT_MULTI_TARGET);
            pm_node_t *target = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_COMMA, (uint16_t) (depth + 1));
            target = parse_target(parser, target, true, false);

            pm_multi_target_node_targets_append(parser, result, target);
            context_pop(parser);
        } else if (token_begins_expression_p(parser->current.type)) {
            pm_node_t *target = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_COMMA, (uint16_t) (depth + 1));
            target = parse_target(parser, target, true, false);

            pm_multi_target_node_targets_append(parser, result, target);
        } else if (!match1(parser, PM_TOKEN_EOF)) {
            // If we get here, then we have a trailing , in a multi target node.
            // We'll add an implicit rest node to represent this.
            pm_node_t *rest = UP(pm_implicit_rest_node_create(parser, &parser->previous));
            pm_multi_target_node_targets_append(parser, result, rest);
            break;
        }
    }

    return UP(result);
}

/**
 * Parse a list of targets and validate that it is in a valid position for
 * assignment.
 */
static pm_node_t *
parse_targets_validate(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t binding_power, uint16_t depth) {
    pm_node_t *result = parse_targets(parser, first_target, binding_power, depth);
    // A single newline is tolerated between the targets and what follows.
    accept1(parser, PM_TOKEN_NEWLINE);

    // Ensure that we have either an = or a ) after the targets.
    if (!match2(parser, PM_TOKEN_EQUAL, PM_TOKEN_PARENTHESIS_RIGHT)) {
        pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED);
    }

    return result;
}

/**
 * Parse a list of statements separated by newlines or semicolons.
 *
 * Returns NULL when, after skipping leading terminators, the current token
 * already terminates the given context. Otherwise returns the statements node
 * that was built; the context is pushed for the duration of the parse and
 * popped before returning.
 */
static pm_statements_node_t *
parse_statements(pm_parser_t *parser, pm_context_t context, uint16_t depth) {
    // First, skip past any optional terminators that might be at the beginning
    // of the statements.
    while (accept2(parser, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE));

    // If we have a terminator, then we can just return NULL.
    if (context_terminator(context, &parser->current)) return NULL;

    pm_statements_node_t *statements = pm_statements_node_create(parser);

    // At this point we know we have at least one statement, and that it
    // immediately follows the current token.
    context_push(parser, context);

    while (true) {
        pm_node_t *node = parse_expression(parser, PM_BINDING_POWER_STATEMENT, true, false, PM_ERR_CANNOT_PARSE_EXPRESSION, (uint16_t) (depth + 1));
        pm_statements_node_body_append(parser, statements, node, true);

        // If we're recovering from a syntax error, then we need to stop parsing
        // the statements now.
        if (parser->recovering) {
            // If this is the level of context where the recovery has happened,
            // then we can mark the parser as done recovering.
            if (context_terminator(context, &parser->current)) parser->recovering = false;
            break;
        }

        // If we have a terminator, then we will parse all consecutive
        // terminators and then continue parsing the statements list.
        if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) {
            // If we have a terminator, then we will continue parsing the
            // statements list.
            while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON));
            if (context_terminator(context, &parser->current)) break;

            // Now we can continue parsing the list of statements.
            continue;
        }

        // At this point we have a list of statements that are not terminated by
        // a newline or semicolon. At this point we need to check if we're at
        // the end of the statements list. If we are, then we should break out
        // of the loop.
        if (context_terminator(context, &parser->current)) break;

        // At this point, we have a syntax error, because the statement was not
        // terminated by a newline or semicolon, and we're not at the end of the
        // statements list. Ideally we should scan forward to determine if we
        // should insert a missing terminator or break out of parsing the
        // statements list at this point.
        //
        // We don't have that yet, so instead we'll do a more naive approach. If
        // we were unable to parse an expression, then we will skip past this
        // token and continue parsing the statements list. Otherwise we'll add
        // an error and continue parsing the statements list.
        if (PM_NODE_TYPE_P(node, PM_MISSING_NODE)) {
            parser_lex(parser);

            // If we are at the end of the file, then we need to stop parsing
            // the statements entirely at this point. Mark the parser as
            // recovering, as we know that EOF closes the top-level context, and
            // then break out of the loop.
            if (match1(parser, PM_TOKEN_EOF)) {
                parser->recovering = true;
                break;
            }

            while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON));
            if (context_terminator(context, &parser->current)) break;
        } else if (!accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_EOF)) {
            // This is an inlined version of expect1 because the error that we
            // want to add has varargs. If this happens again, we should
            // probably extract a helper function.
            PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type));
            parser->previous.start = parser->previous.end;
            parser->previous.type = PM_TOKEN_MISSING;
        }
    }

    context_pop(parser);
    // last_value is passed to the void-statements check below; it is false
    // only for the two ensure contexts.
    bool last_value = true;
    switch (context) {
        case PM_CONTEXT_BEGIN_ENSURE:
        case PM_CONTEXT_DEF_ENSURE:
            last_value = false;
            break;
        default:
            break;
    }
    pm_void_statements_check(parser, statements, last_value);

    return statements;
}

/**
 * Add a node to a set of static literals that holds a set of hash keys. If the
 * node is a duplicate, then add an appropriate warning.
 *
 * The warning spans the node returned by pm_static_literals_add, embeds that
 * node's inspected form, and reports the line of the newly added key.
 */
static void
pm_hash_key_static_literals_add(pm_parser_t *parser, pm_static_literals_t *literals, pm_node_t *node) {
    const pm_node_t *duplicated = pm_static_literals_add(&parser->newline_list, parser->start_line, literals, node, true);

    if (duplicated != NULL) {
        // Temporary buffer for the printable form of the duplicated key; it is
        // freed once the diagnostic has been appended.
        pm_buffer_t buffer = { 0 };
        pm_static_literal_inspect(&buffer, &parser->newline_list, parser->start_line, parser->encoding->name, duplicated);

        pm_diagnostic_list_append_format(
            &parser->warning_list,
            duplicated->location.start,
            duplicated->location.end,
            PM_WARN_DUPLICATED_HASH_KEY,
            (int) pm_buffer_length(&buffer),
            pm_buffer_value(&buffer),
            pm_newline_list_line_column(&parser->newline_list, node->location.start, parser->start_line).line
        );

        pm_buffer_free(&buffer);
    }
}

/**
 * Add a node to a set of static literals that holds the conditions of when
 * clauses. If the node is a duplicate, then add an appropriate warning.
 */
static void
pm_when_clause_static_literals_add(pm_parser_t *parser, pm_static_literals_t *literals, pm_node_t *node) {
    pm_node_t *previous;

    // A non-NULL result is the earlier node that this one duplicates. The
    // warning spans the new node and carries the line of the new node followed
    // by the line of the earlier one.
    if ((previous = pm_static_literals_add(&parser->newline_list, parser->start_line, literals, node, false)) != NULL) {
        pm_diagnostic_list_append_format(
            &parser->warning_list,
            node->location.start,
            node->location.end,
            PM_WARN_DUPLICATED_WHEN_CLAUSE,
            pm_newline_list_line_column(&parser->newline_list, node->location.start, parser->start_line).line,
            pm_newline_list_line_column(&parser->newline_list, previous->location.start, parser->start_line).line
        );
    }
}

/**
 * Parse all of the elements of a hash. Return true if a double splat was found.
 */
static bool
parse_assocs(pm_parser_t *parser, pm_static_literals_t *literals, pm_node_t *node, uint16_t depth) {
    assert(PM_NODE_TYPE_P(node, PM_HASH_NODE) || PM_NODE_TYPE_P(node, PM_KEYWORD_HASH_NODE));
    bool contains_keyword_splat = false;

    while (true) {
        pm_node_t *element;

        switch (parser->current.type) {
            case PM_TOKEN_USTAR_STAR: {
                parser_lex(parser);
                pm_token_t operator = parser->previous;
                pm_node_t *value = NULL;

                if (match1(parser, PM_TOKEN_BRACE_LEFT)) {
                    // If we're about to parse a nested hash that is being
                    // pushed into this hash directly with **, then we want the
                    // inner hash to share the static literals with the outer
                    // hash.
+ parser->current_hash_keys = literals; + value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH, (uint16_t) (depth + 1)); + } else if (token_begins_expression_p(parser->current.type)) { + value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH, (uint16_t) (depth + 1)); + } else { + pm_parser_scope_forwarding_keywords_check(parser, &operator); + } + + element = UP(pm_assoc_splat_node_create(parser, value, &operator)); + contains_keyword_splat = true; + break; + } + case PM_TOKEN_LABEL: { + pm_token_t label = parser->current; + parser_lex(parser); + + pm_node_t *key = UP(pm_symbol_node_label_create(parser, &label)); + pm_hash_key_static_literals_add(parser, literals, key); + + pm_token_t operator = not_provided(parser); + pm_node_t *value = NULL; + + if (token_begins_expression_p(parser->current.type)) { + value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_HASH_EXPRESSION_AFTER_LABEL, (uint16_t) (depth + 1)); + } else { + if (parser->encoding->isupper_char(label.start, (label.end - 1) - label.start)) { + pm_token_t constant = { .type = PM_TOKEN_CONSTANT, .start = label.start, .end = label.end - 1 }; + value = UP(pm_constant_read_node_create(parser, &constant)); + } else { + int depth = -1; + pm_token_t identifier = { .type = PM_TOKEN_IDENTIFIER, .start = label.start, .end = label.end - 1 }; + + if (identifier.end[-1] == '!' 
|| identifier.end[-1] == '?') { + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, identifier, PM_ERR_INVALID_LOCAL_VARIABLE_READ); + } else { + depth = pm_parser_local_depth(parser, &identifier); + } + + if (depth == -1) { + value = UP(pm_call_node_variable_call_create(parser, &identifier)); + } else { + value = UP(pm_local_variable_read_node_create(parser, &identifier, (uint32_t) depth)); + } + } + + value->location.end++; + value = UP(pm_implicit_node_create(parser, value)); + } + + element = UP(pm_assoc_node_create(parser, key, &operator, value)); + break; + } + default: { + pm_node_t *key = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, true, PM_ERR_HASH_KEY, (uint16_t) (depth + 1)); + + // Hash keys that are strings are automatically frozen. We will + // mark that here. + if (PM_NODE_TYPE_P(key, PM_STRING_NODE)) { + pm_node_flag_set(key, PM_STRING_FLAGS_FROZEN | PM_NODE_FLAG_STATIC_LITERAL); + } + + pm_hash_key_static_literals_add(parser, literals, key); + + pm_token_t operator; + if (pm_symbol_node_label_p(key)) { + operator = not_provided(parser); + } else { + expect1(parser, PM_TOKEN_EQUAL_GREATER, PM_ERR_HASH_ROCKET); + operator = parser->previous; + } + + pm_node_t *value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_HASH_VALUE, (uint16_t) (depth + 1)); + element = UP(pm_assoc_node_create(parser, key, &operator, value)); + break; + } + } + + if (PM_NODE_TYPE_P(node, PM_HASH_NODE)) { + pm_hash_node_elements_append((pm_hash_node_t *) node, element); + } else { + pm_keyword_hash_node_elements_append((pm_keyword_hash_node_t *) node, element); + } + + // If there's no comma after the element, then we're done. + if (!accept1(parser, PM_TOKEN_COMMA)) break; + + // If the next element starts with a label or a **, then we know we have + // another element in the hash, so we'll continue parsing. 
+ if (match2(parser, PM_TOKEN_USTAR_STAR, PM_TOKEN_LABEL)) continue; + + // Otherwise we need to check if the subsequent token begins an expression. + // If it does, then we'll continue parsing. + if (token_begins_expression_p(parser->current.type)) continue; + + // Otherwise by default we will exit out of this loop. + break; + } + + return contains_keyword_splat; +} + +/** + * Determine whether the given argument, which has just been parsed, can be the + * first key of a bare hash in an argument list. Arguments for which + * pm_symbol_node_label_p is true are always allowed. A call node that has no + * opening location but does have arguments is rejected when those arguments + * carry the contains-keywords/contains-splat flags or when the call has a + * block. In every other case the result is whether a => token was accepted, + * which the caller then reads back from parser->previous. + */ +static inline bool +argument_allowed_for_bare_hash(pm_parser_t *parser, pm_node_t *argument) { + if (pm_symbol_node_label_p(argument)) { + return true; + } + + switch (PM_NODE_TYPE(argument)) { + case PM_CALL_NODE: { + pm_call_node_t *cast = (pm_call_node_t *) argument; + if (cast->opening_loc.start == NULL && cast->arguments != NULL) { + if (PM_NODE_FLAG_P(cast->arguments, PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS | PM_ARGUMENTS_NODE_FLAGS_CONTAINS_SPLAT)) { + return false; + } + if (cast->block != NULL) { + return false; + } + } + break; + } + default: break; + } + return accept1(parser, PM_TOKEN_EQUAL_GREATER); +} + +/** + * Append an argument to a list of arguments. The underlying arguments node is + * created the first time an argument is appended. + */ +static inline void +parse_arguments_append(pm_parser_t *parser, pm_arguments_t *arguments, pm_node_t *argument) { + if (arguments->arguments == NULL) { + arguments->arguments = pm_arguments_node_create(parser); + } + + pm_arguments_node_arguments_append(arguments->arguments, argument); +} + +/** + * Parse a list of arguments. + */ +static void +parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_forwarding, pm_token_type_t terminator, uint16_t depth) { + pm_binding_power_t binding_power = pm_binding_powers[parser->current.type].left; + + // First we need to check if the next token is one that could be the start + // of an argument. If it's not, then we can just return. 
+ if ( + match2(parser, terminator, PM_TOKEN_EOF) || + (binding_power != PM_BINDING_POWER_UNSET && binding_power < PM_BINDING_POWER_RANGE) || + context_terminator(parser->current_context->context, &parser->current) + ) { + return; + } + + bool parsed_first_argument = false; + bool parsed_bare_hash = false; + bool parsed_block_argument = false; + bool parsed_forwarding_arguments = false; + + while (!match1(parser, PM_TOKEN_EOF)) { + if (parsed_forwarding_arguments) { + pm_parser_err_current(parser, PM_ERR_ARGUMENT_AFTER_FORWARDING_ELLIPSES); + } + + pm_node_t *argument = NULL; + + switch (parser->current.type) { + case PM_TOKEN_USTAR_STAR: + case PM_TOKEN_LABEL: { + if (parsed_bare_hash) { + pm_parser_err_current(parser, PM_ERR_ARGUMENT_BARE_HASH); + } + + pm_keyword_hash_node_t *hash = pm_keyword_hash_node_create(parser); + argument = UP(hash); + + pm_static_literals_t hash_keys = { 0 }; + bool contains_keyword_splat = parse_assocs(parser, &hash_keys, UP(hash), (uint16_t) (depth + 1)); + + parse_arguments_append(parser, arguments, argument); + + pm_node_flags_t flags = PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS; + if (contains_keyword_splat) flags |= PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORD_SPLAT; + pm_node_flag_set(UP(arguments->arguments), flags); + + pm_static_literals_free(&hash_keys); + parsed_bare_hash = true; + + break; + } + case PM_TOKEN_UAMPERSAND: { + parser_lex(parser); + pm_token_t operator = parser->previous; + pm_node_t *expression = NULL; + + if (token_begins_expression_p(parser->current.type)) { + expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_EXPECT_ARGUMENT, (uint16_t) (depth + 1)); + } else { + pm_parser_scope_forwarding_block_check(parser, &operator); + } + + argument = UP(pm_block_argument_node_create(parser, &operator, expression)); + if (parsed_block_argument) { + parse_arguments_append(parser, arguments, argument); + } else { + arguments->block = argument; + } + + if (match1(parser, 
PM_TOKEN_COMMA)) { + pm_parser_err_current(parser, PM_ERR_ARGUMENT_AFTER_BLOCK); + } + + parsed_block_argument = true; + break; + } + case PM_TOKEN_USTAR: { + parser_lex(parser); + pm_token_t operator = parser->previous; + + if (match4(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_COMMA, PM_TOKEN_SEMICOLON, PM_TOKEN_BRACKET_RIGHT)) { + pm_parser_scope_forwarding_positionals_check(parser, &operator); + argument = UP(pm_splat_node_create(parser, &operator, NULL)); + if (parsed_bare_hash) { + pm_parser_err_previous(parser, PM_ERR_ARGUMENT_SPLAT_AFTER_ASSOC_SPLAT); + } + } else { + pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT, (uint16_t) (depth + 1)); + + if (parsed_bare_hash) { + pm_parser_err(parser, operator.start, expression->location.end, PM_ERR_ARGUMENT_SPLAT_AFTER_ASSOC_SPLAT); + } + + argument = UP(pm_splat_node_create(parser, &operator, expression)); + } + + parse_arguments_append(parser, arguments, argument); + break; + } + case PM_TOKEN_UDOT_DOT_DOT: { + if (accepts_forwarding) { + parser_lex(parser); + + if (token_begins_expression_p(parser->current.type)) { + // If the token begins an expression then this ... was + // not actually argument forwarding but was instead a + // range. + pm_token_t operator = parser->previous; + pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_RANGE, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + + // If we parse a range, we need to validate that we + // didn't accidentally violate the nonassoc rules of the + // ... operator. 
+ if (PM_NODE_TYPE_P(right, PM_RANGE_NODE)) { + pm_range_node_t *range = (pm_range_node_t *) right; + pm_parser_err(parser, range->operator_loc.start, range->operator_loc.end, PM_ERR_UNEXPECTED_RANGE_OPERATOR); + } + + argument = UP(pm_range_node_create(parser, NULL, &operator, right)); + } else { + pm_parser_scope_forwarding_all_check(parser, &parser->previous); + if (parsed_first_argument && terminator == PM_TOKEN_EOF) { + pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORWARDING_UNBOUND); + } + + argument = UP(pm_forwarding_arguments_node_create(parser, &parser->previous)); + parse_arguments_append(parser, arguments, argument); + pm_node_flag_set(UP(arguments->arguments), PM_ARGUMENTS_NODE_FLAGS_CONTAINS_FORWARDING); + arguments->has_forwarding = true; + parsed_forwarding_arguments = true; + break; + } + } + } + PRISM_FALLTHROUGH + default: { + if (argument == NULL) { + argument = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, !parsed_first_argument, true, PM_ERR_EXPECT_ARGUMENT, (uint16_t) (depth + 1)); + } + + bool contains_keywords = false; + bool contains_keyword_splat = false; + + if (argument_allowed_for_bare_hash(parser, argument)){ + if (parsed_bare_hash) { + pm_parser_err_previous(parser, PM_ERR_ARGUMENT_BARE_HASH); + } + + pm_token_t operator; + if (parser->previous.type == PM_TOKEN_EQUAL_GREATER) { + operator = parser->previous; + } else { + operator = not_provided(parser); + } + + pm_keyword_hash_node_t *bare_hash = pm_keyword_hash_node_create(parser); + contains_keywords = true; + + // Create the set of static literals for this hash. + pm_static_literals_t hash_keys = { 0 }; + pm_hash_key_static_literals_add(parser, &hash_keys, argument); + + // Finish parsing the one we are part way through. 
+ pm_node_t *value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_HASH_VALUE, (uint16_t) (depth + 1)); + argument = UP(pm_assoc_node_create(parser, argument, &operator, value)); + + pm_keyword_hash_node_elements_append(bare_hash, argument); + argument = UP(bare_hash); + + // Then parse more if we have a comma + if (accept1(parser, PM_TOKEN_COMMA) && ( + token_begins_expression_p(parser->current.type) || + match2(parser, PM_TOKEN_USTAR_STAR, PM_TOKEN_LABEL) + )) { + contains_keyword_splat = parse_assocs(parser, &hash_keys, UP(bare_hash), (uint16_t) (depth + 1)); + } + + pm_static_literals_free(&hash_keys); + parsed_bare_hash = true; + } + + parse_arguments_append(parser, arguments, argument); + + pm_node_flags_t flags = 0; + if (contains_keywords) flags |= PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS; + if (contains_keyword_splat) flags |= PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORD_SPLAT; + pm_node_flag_set(UP(arguments->arguments), flags); + + break; + } + } + + parsed_first_argument = true; + + // If parsing the argument failed, we need to stop parsing arguments. + if (PM_NODE_TYPE_P(argument, PM_MISSING_NODE) || parser->recovering) break; + + // If the terminator of these arguments is not EOF, then we have a + // specific token we're looking for. In that case we can accept a + // newline here because it is not functioning as a statement terminator. + bool accepted_newline = false; + if (terminator != PM_TOKEN_EOF) { + accepted_newline = accept1(parser, PM_TOKEN_NEWLINE); + } + + if (parser->previous.type == PM_TOKEN_COMMA && parsed_bare_hash) { + // If we previously were on a comma and we just parsed a bare hash, + // then we want to continue parsing arguments. This is because the + // comma was grabbed up by the hash parser. + } else if (accept1(parser, PM_TOKEN_COMMA)) { + // If there was a comma, then we need to check if we also accepted a + // newline. If we did, then this is a syntax error. 
+ if (accepted_newline) { + pm_parser_err_previous(parser, PM_ERR_INVALID_COMMA); + } + + // If this is a command call and an argument takes a block, + // there can be no further arguments. For example, + // `foo(bar 1 do end, 2)` should be rejected. + if (PM_NODE_TYPE_P(argument, PM_CALL_NODE)) { + pm_call_node_t *call = (pm_call_node_t *) argument; + if (call->opening_loc.start == NULL && call->arguments != NULL && call->block != NULL) { + pm_parser_err_previous(parser, PM_ERR_INVALID_COMMA); + break; + } + } + } else { + // If there is no comma at the end of the argument list then we're + // done parsing arguments and can break out of this loop. + break; + } + + // If we hit the terminator, then that means we have a trailing comma so + // we can accept that output as well. + if (match1(parser, terminator)) break; + } +} + +/** + * Required parameters on method, block, and lambda declarations can be + * destructured using parentheses. This looks like: + * + * def foo((bar, baz)) + * end + * + * + * It can recurse infinitely down, and splats are allowed to group arguments. + */ +static pm_multi_target_node_t * +parse_required_destructured_parameter(pm_parser_t *parser) { + expect1(parser, PM_TOKEN_PARENTHESIS_LEFT, PM_ERR_EXPECT_LPAREN_REQ_PARAMETER); + + pm_multi_target_node_t *node = pm_multi_target_node_create(parser); + pm_multi_target_node_opening_set(node, &parser->previous); + + do { + pm_node_t *param; + + // If we get here then we have a trailing comma, which isn't allowed in + // the grammar. In other places, multi targets _do_ allow trailing + // commas, so here we'll assume this is a mistake of the user not + // knowing it's not allowed here. 
+ if (node->lefts.size > 0 && match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + param = UP(pm_implicit_rest_node_create(parser, &parser->previous)); + pm_multi_target_node_targets_append(parser, node, param); + pm_parser_err_current(parser, PM_ERR_PARAMETER_WILD_LOOSE_COMMA); + break; + } + + if (match1(parser, PM_TOKEN_PARENTHESIS_LEFT)) { + param = UP(parse_required_destructured_parameter(parser)); + } else if (accept1(parser, PM_TOKEN_USTAR)) { + pm_token_t star = parser->previous; + pm_node_t *value = NULL; + + if (accept1(parser, PM_TOKEN_IDENTIFIER)) { + pm_token_t name = parser->previous; + value = UP(pm_required_parameter_node_create(parser, &name)); + if (pm_parser_parameter_name_check(parser, &name)) { + pm_node_flag_set_repeated_parameter(value); + } + pm_parser_local_add_token(parser, &name, 1); + } + + param = UP(pm_splat_node_create(parser, &star, value)); + } else { + expect1(parser, PM_TOKEN_IDENTIFIER, PM_ERR_EXPECT_IDENT_REQ_PARAMETER); + pm_token_t name = parser->previous; + + param = UP(pm_required_parameter_node_create(parser, &name)); + if (pm_parser_parameter_name_check(parser, &name)) { + pm_node_flag_set_repeated_parameter(param); + } + pm_parser_local_add_token(parser, &name, 1); + } + + pm_multi_target_node_targets_append(parser, node, param); + } while (accept1(parser, PM_TOKEN_COMMA)); + + accept1(parser, PM_TOKEN_NEWLINE); + expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN_REQ_PARAMETER); + pm_multi_target_node_closing_set(node, &parser->previous); + + return node; +} + +/** + * This represents the different order states we can be in when parsing + * method parameters. 
+ */ +typedef enum { + PM_PARAMETERS_NO_CHANGE = 0, // Extra state for tokens that should not change the state + PM_PARAMETERS_ORDER_NOTHING_AFTER = 1, + PM_PARAMETERS_ORDER_KEYWORDS_REST, + PM_PARAMETERS_ORDER_KEYWORDS, + PM_PARAMETERS_ORDER_REST, + PM_PARAMETERS_ORDER_AFTER_OPTIONAL, + PM_PARAMETERS_ORDER_OPTIONAL, + PM_PARAMETERS_ORDER_NAMED, + PM_PARAMETERS_ORDER_NONE, +} pm_parameters_order_t; + +/** + * This maps the type of the token that begins a parameter to the ordering + * state for that parameter. Token types that are not listed here are + * zero-initialized, which is PM_PARAMETERS_NO_CHANGE. + */ +static pm_parameters_order_t parameters_ordering[PM_TOKEN_MAXIMUM] = { + [0] = PM_PARAMETERS_NO_CHANGE, + [PM_TOKEN_UAMPERSAND] = PM_PARAMETERS_ORDER_NOTHING_AFTER, + [PM_TOKEN_AMPERSAND] = PM_PARAMETERS_ORDER_NOTHING_AFTER, + [PM_TOKEN_UDOT_DOT_DOT] = PM_PARAMETERS_ORDER_NOTHING_AFTER, + [PM_TOKEN_IDENTIFIER] = PM_PARAMETERS_ORDER_NAMED, + [PM_TOKEN_PARENTHESIS_LEFT] = PM_PARAMETERS_ORDER_NAMED, + [PM_TOKEN_EQUAL] = PM_PARAMETERS_ORDER_OPTIONAL, + [PM_TOKEN_LABEL] = PM_PARAMETERS_ORDER_KEYWORDS, + [PM_TOKEN_USTAR] = PM_PARAMETERS_ORDER_AFTER_OPTIONAL, + [PM_TOKEN_STAR] = PM_PARAMETERS_ORDER_AFTER_OPTIONAL, + [PM_TOKEN_USTAR_STAR] = PM_PARAMETERS_ORDER_KEYWORDS_REST, + [PM_TOKEN_STAR_STAR] = PM_PARAMETERS_ORDER_KEYWORDS_REST +}; + +/** + * Check if current parameter follows valid parameters ordering. If not it adds + * an error to the list without stopping the parsing, otherwise sets the + * parameters state to the one corresponding to the current parameter. + * + * It returns true if it was successful, and false otherwise. + * + * After a few special cases are handled, states are compared numerically: a + * state that is greater than the current one is reported as out of order, and + * a state that is smaller becomes the new current state. + */ +static bool +update_parameter_state(pm_parser_t *parser, pm_token_t *token, pm_parameters_order_t *current) { + pm_parameters_order_t state = parameters_ordering[token->type]; + if (state == PM_PARAMETERS_NO_CHANGE) return true; + + // If we see another ordered argument after an optional argument + // we only continue parsing ordered arguments until we stop seeing ordered arguments. 
+ if (*current == PM_PARAMETERS_ORDER_OPTIONAL && state == PM_PARAMETERS_ORDER_NAMED) { + *current = PM_PARAMETERS_ORDER_AFTER_OPTIONAL; + return true; + } else if (*current == PM_PARAMETERS_ORDER_AFTER_OPTIONAL && state == PM_PARAMETERS_ORDER_NAMED) { + return true; + } + + if (token->type == PM_TOKEN_USTAR && *current == PM_PARAMETERS_ORDER_AFTER_OPTIONAL) { + pm_parser_err_token(parser, token, PM_ERR_PARAMETER_STAR); + return false; + } else if (token->type == PM_TOKEN_UDOT_DOT_DOT && (*current >= PM_PARAMETERS_ORDER_KEYWORDS_REST && *current <= PM_PARAMETERS_ORDER_AFTER_OPTIONAL)) { + pm_parser_err_token(parser, token, *current == PM_PARAMETERS_ORDER_AFTER_OPTIONAL ? PM_ERR_PARAMETER_FORWARDING_AFTER_REST : PM_ERR_PARAMETER_ORDER); + return false; + } else if (*current == PM_PARAMETERS_ORDER_NOTHING_AFTER || state > *current) { + // We know what transition we failed on, so we can provide a better error here. + pm_parser_err_token(parser, token, PM_ERR_PARAMETER_ORDER); + return false; + } + + if (state < *current) *current = state; + return true; +} + +/** + * Parse a list of parameters on a method definition. 
+ */ +static pm_parameters_node_t * +parse_parameters( + pm_parser_t *parser, + pm_binding_power_t binding_power, + bool uses_parentheses, + bool allows_trailing_comma, + bool allows_forwarding_parameters, + bool accepts_blocks_in_defaults, + bool in_block, + uint16_t depth +) { + pm_do_loop_stack_push(parser, false); + + pm_parameters_node_t *params = pm_parameters_node_create(parser); + pm_parameters_order_t order = PM_PARAMETERS_ORDER_NONE; + + while (true) { + bool parsing = true; + + switch (parser->current.type) { + case PM_TOKEN_PARENTHESIS_LEFT: { + update_parameter_state(parser, &parser->current, &order); + pm_node_t *param = UP(parse_required_destructured_parameter(parser)); + + if (order > PM_PARAMETERS_ORDER_AFTER_OPTIONAL) { + pm_parameters_node_requireds_append(params, param); + } else { + pm_parameters_node_posts_append(params, param); + } + break; + } + case PM_TOKEN_UAMPERSAND: + case PM_TOKEN_AMPERSAND: { + update_parameter_state(parser, &parser->current, &order); + parser_lex(parser); + + pm_token_t operator = parser->previous; + pm_token_t name; + + bool repeated = false; + if (accept1(parser, PM_TOKEN_IDENTIFIER)) { + name = parser->previous; + repeated = pm_parser_parameter_name_check(parser, &name); + pm_parser_local_add_token(parser, &name, 1); + } else { + name = not_provided(parser); + parser->current_scope->parameters |= PM_SCOPE_PARAMETERS_FORWARDING_BLOCK; + } + + pm_block_parameter_node_t *param = pm_block_parameter_node_create(parser, &name, &operator); + if (repeated) { + pm_node_flag_set_repeated_parameter(UP(param)); + } + if (params->block == NULL) { + pm_parameters_node_block_set(params, param); + } else { + pm_parser_err_node(parser, UP(param), PM_ERR_PARAMETER_BLOCK_MULTI); + pm_parameters_node_posts_append(params, UP(param)); + } + + break; + } + case PM_TOKEN_UDOT_DOT_DOT: { + if (!allows_forwarding_parameters) { + pm_parser_err_current(parser, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES); + } + + bool succeeded = 
update_parameter_state(parser, &parser->current, &order); + parser_lex(parser); + + parser->current_scope->parameters |= PM_SCOPE_PARAMETERS_FORWARDING_ALL; + pm_forwarding_parameter_node_t *param = pm_forwarding_parameter_node_create(parser, &parser->previous); + + if (params->keyword_rest != NULL) { + // If we already have a keyword rest parameter, then we replace it with the + // forwarding parameter and move the keyword rest parameter to the posts list. + pm_node_t *keyword_rest = params->keyword_rest; + pm_parameters_node_posts_append(params, keyword_rest); + if (succeeded) pm_parser_err_previous(parser, PM_ERR_PARAMETER_UNEXPECTED_FWD); + params->keyword_rest = NULL; + } + + pm_parameters_node_keyword_rest_set(params, UP(param)); + break; + } + case PM_TOKEN_CLASS_VARIABLE: + case PM_TOKEN_IDENTIFIER: + case PM_TOKEN_CONSTANT: + case PM_TOKEN_INSTANCE_VARIABLE: + case PM_TOKEN_GLOBAL_VARIABLE: + case PM_TOKEN_METHOD_NAME: { + parser_lex(parser); + switch (parser->previous.type) { + case PM_TOKEN_CONSTANT: + pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORMAL_CONSTANT); + break; + case PM_TOKEN_INSTANCE_VARIABLE: + pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORMAL_IVAR); + break; + case PM_TOKEN_GLOBAL_VARIABLE: + pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORMAL_GLOBAL); + break; + case PM_TOKEN_CLASS_VARIABLE: + pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORMAL_CLASS); + break; + case PM_TOKEN_METHOD_NAME: + pm_parser_err_previous(parser, PM_ERR_PARAMETER_METHOD_NAME); + break; + default: break; + } + + if (parser->current.type == PM_TOKEN_EQUAL) { + update_parameter_state(parser, &parser->current, &order); + } else { + update_parameter_state(parser, &parser->previous, &order); + } + + pm_token_t name = parser->previous; + bool repeated = pm_parser_parameter_name_check(parser, &name); + pm_parser_local_add_token(parser, &name, 1); + + if (match1(parser, PM_TOKEN_EQUAL)) { + pm_token_t operator = parser->current; + context_push(parser, 
PM_CONTEXT_DEFAULT_PARAMS); + parser_lex(parser); + + pm_constant_id_t name_id = pm_parser_constant_id_token(parser, &name); + uint32_t reads = parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? pm_locals_reads(&parser->current_scope->locals, name_id) : 0; + + if (accepts_blocks_in_defaults) pm_accepts_block_stack_push(parser, true); + pm_node_t *value = parse_value_expression(parser, binding_power, false, false, PM_ERR_PARAMETER_NO_DEFAULT, (uint16_t) (depth + 1)); + if (accepts_blocks_in_defaults) pm_accepts_block_stack_pop(parser); + + pm_optional_parameter_node_t *param = pm_optional_parameter_node_create(parser, &name, &operator, value); + + if (repeated) { + pm_node_flag_set_repeated_parameter(UP(param)); + } + pm_parameters_node_optionals_append(params, param); + + // If the value of the parameter increased the number of + // reads of that parameter, then we need to warn that we + // have a circular definition. + if ((parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3) && (pm_locals_reads(&parser->current_scope->locals, name_id) != reads)) { + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, name, PM_ERR_PARAMETER_CIRCULAR); + } + + context_pop(parser); + + // If parsing the value of the parameter resulted in error recovery, + // then we can put a missing node in its place and stop parsing the + // parameters entirely now. 
+ if (parser->recovering) { + parsing = false; + break; + } + } else if (order > PM_PARAMETERS_ORDER_AFTER_OPTIONAL) { + pm_required_parameter_node_t *param = pm_required_parameter_node_create(parser, &name); + if (repeated) { + pm_node_flag_set_repeated_parameter(UP(param)); + } + pm_parameters_node_requireds_append(params, UP(param)); + } else { + pm_required_parameter_node_t *param = pm_required_parameter_node_create(parser, &name); + if (repeated) { + pm_node_flag_set_repeated_parameter(UP(param)); + } + pm_parameters_node_posts_append(params, UP(param)); + } + + break; + } + case PM_TOKEN_LABEL: { + if (!uses_parentheses && !in_block) parser->in_keyword_arg = true; + update_parameter_state(parser, &parser->current, &order); + + context_push(parser, PM_CONTEXT_DEFAULT_PARAMS); + parser_lex(parser); + + pm_token_t name = parser->previous; + pm_token_t local = name; + local.end -= 1; + + if (parser->encoding_changed ? parser->encoding->isupper_char(local.start, local.end - local.start) : pm_encoding_utf_8_isupper_char(local.start, local.end - local.start)) { + pm_parser_err(parser, local.start, local.end, PM_ERR_ARGUMENT_FORMAL_CONSTANT); + } else if (local.end[-1] == '!' 
|| local.end[-1] == '?') { + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, local, PM_ERR_INVALID_LOCAL_VARIABLE_WRITE); + } + + bool repeated = pm_parser_parameter_name_check(parser, &local); + pm_parser_local_add_token(parser, &local, 1); + + switch (parser->current.type) { + case PM_TOKEN_COMMA: + case PM_TOKEN_PARENTHESIS_RIGHT: + case PM_TOKEN_PIPE: { + context_pop(parser); + + pm_node_t *param = UP(pm_required_keyword_parameter_node_create(parser, &name)); + if (repeated) { + pm_node_flag_set_repeated_parameter(param); + } + + pm_parameters_node_keywords_append(params, param); + break; + } + case PM_TOKEN_SEMICOLON: + case PM_TOKEN_NEWLINE: { + context_pop(parser); + + if (uses_parentheses) { + parsing = false; + break; + } + + pm_node_t *param = UP(pm_required_keyword_parameter_node_create(parser, &name)); + if (repeated) { + pm_node_flag_set_repeated_parameter(param); + } + + pm_parameters_node_keywords_append(params, param); + break; + } + default: { + pm_node_t *param; + + if (token_begins_expression_p(parser->current.type)) { + pm_constant_id_t name_id = pm_parser_constant_id_token(parser, &local); + uint32_t reads = parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? 
pm_locals_reads(&parser->current_scope->locals, name_id) : 0; + + if (accepts_blocks_in_defaults) pm_accepts_block_stack_push(parser, true); + pm_node_t *value = parse_value_expression(parser, binding_power, false, false, PM_ERR_PARAMETER_NO_DEFAULT_KW, (uint16_t) (depth + 1)); + if (accepts_blocks_in_defaults) pm_accepts_block_stack_pop(parser); + + if (parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 && (pm_locals_reads(&parser->current_scope->locals, name_id) != reads)) { + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, local, PM_ERR_PARAMETER_CIRCULAR); + } + + param = UP(pm_optional_keyword_parameter_node_create(parser, &name, value)); + } + else { + param = UP(pm_required_keyword_parameter_node_create(parser, &name)); + } + + if (repeated) { + pm_node_flag_set_repeated_parameter(param); + } + + context_pop(parser); + pm_parameters_node_keywords_append(params, param); + + // If parsing the value of the parameter resulted in error recovery, + // then we can put a missing node in its place and stop parsing the + // parameters entirely now. 
+ if (parser->recovering) { + parsing = false; + break; + } + } + } + + parser->in_keyword_arg = false; + break; + } + case PM_TOKEN_USTAR: + case PM_TOKEN_STAR: { + update_parameter_state(parser, &parser->current, &order); + parser_lex(parser); + + pm_token_t operator = parser->previous; + pm_token_t name; + bool repeated = false; + + if (accept1(parser, PM_TOKEN_IDENTIFIER)) { + name = parser->previous; + repeated = pm_parser_parameter_name_check(parser, &name); + pm_parser_local_add_token(parser, &name, 1); + } else { + name = not_provided(parser); + parser->current_scope->parameters |= PM_SCOPE_PARAMETERS_FORWARDING_POSITIONALS; + } + + pm_node_t *param = UP(pm_rest_parameter_node_create(parser, &operator, &name)); + if (repeated) { + pm_node_flag_set_repeated_parameter(param); + } + + if (params->rest == NULL) { + pm_parameters_node_rest_set(params, param); + } else { + pm_parser_err_node(parser, param, PM_ERR_PARAMETER_SPLAT_MULTI); + pm_parameters_node_posts_append(params, param); + } + + break; + } + case PM_TOKEN_STAR_STAR: + case PM_TOKEN_USTAR_STAR: { + pm_parameters_order_t previous_order = order; + update_parameter_state(parser, &parser->current, &order); + parser_lex(parser); + + pm_token_t operator = parser->previous; + pm_node_t *param; + + if (accept1(parser, PM_TOKEN_KEYWORD_NIL)) { + if (previous_order <= PM_PARAMETERS_ORDER_KEYWORDS) { + pm_parser_err_previous(parser, PM_ERR_PARAMETER_UNEXPECTED_NO_KW); + } + + param = UP(pm_no_keywords_parameter_node_create(parser, &operator, &parser->previous)); + } else { + pm_token_t name; + + bool repeated = false; + if (accept1(parser, PM_TOKEN_IDENTIFIER)) { + name = parser->previous; + repeated = pm_parser_parameter_name_check(parser, &name); + pm_parser_local_add_token(parser, &name, 1); + } else { + name = not_provided(parser); + parser->current_scope->parameters |= PM_SCOPE_PARAMETERS_FORWARDING_KEYWORDS; + } + + param = UP(pm_keyword_rest_parameter_node_create(parser, &operator, &name)); + if 
(repeated) { + pm_node_flag_set_repeated_parameter(param); + } + } + + if (params->keyword_rest == NULL) { + pm_parameters_node_keyword_rest_set(params, param); + } else { + pm_parser_err_node(parser, param, PM_ERR_PARAMETER_ASSOC_SPLAT_MULTI); + pm_parameters_node_posts_append(params, param); + } + + break; + } + default: + if (parser->previous.type == PM_TOKEN_COMMA) { + if (allows_trailing_comma && order >= PM_PARAMETERS_ORDER_NAMED) { + // If we get here, then we have a trailing comma in a + // block parameter list. + pm_node_t *param = UP(pm_implicit_rest_node_create(parser, &parser->previous)); + + if (params->rest == NULL) { + pm_parameters_node_rest_set(params, param); + } else { + pm_parser_err_node(parser, UP(param), PM_ERR_PARAMETER_SPLAT_MULTI); + pm_parameters_node_posts_append(params, UP(param)); + } + } else { + pm_parser_err_previous(parser, PM_ERR_PARAMETER_WILD_LOOSE_COMMA); + } + } + + parsing = false; + break; + } + + // If we hit some kind of issue while parsing the parameter, this would + // have been set to false. In that case, we need to break out of the + // loop. + if (!parsing) break; + + bool accepted_newline = false; + if (uses_parentheses) { + accepted_newline = accept1(parser, PM_TOKEN_NEWLINE); + } + + if (accept1(parser, PM_TOKEN_COMMA)) { + // If there was a comma, but we also accepted a newline, then this + // is a syntax error. + if (accepted_newline) { + pm_parser_err_previous(parser, PM_ERR_INVALID_COMMA); + } + } else { + // If there was no comma, then we're done parsing parameters. + break; + } + } + + pm_do_loop_stack_pop(parser); + + // If we don't have any parameters, return `NULL` instead of an empty `ParametersNode`. + if (params->base.location.start == params->base.location.end) { + pm_node_destroy(parser, UP(params)); + return NULL; + } + + return params; +} + +/** + * Accepts a parser and returns the index of the last newline in the file that was + * recorded before the current token within the newline list. 
+ */ +static size_t +token_newline_index(const pm_parser_t *parser) { + if (parser->heredoc_end == NULL) { + // This is the common case. In this case we can use the index of the + // most recently recorded newline in the newline list. + return parser->newline_list.size - 1; + } else { + // This is unlikely. This is the case that we have already parsed the + // start of a heredoc, so we cannot rely on looking at the previous + // offset of the newline list, and instead must go through the whole + // process of a binary search for the line number. + return (size_t) pm_newline_list_line(&parser->newline_list, parser->current.start, 0); + } +} + +/** + * Accepts a parser, a newline index, and a token and returns the column. The + * important piece of this is that it expands tabs out to the next tab stop. + * Every other byte counts as one column. If break_on_non_space is true and a + * byte other than a space or a tab is found before the token, then this + * returns -1 instead. + */ +static int64_t +token_column(const pm_parser_t *parser, size_t newline_index, const pm_token_t *token, bool break_on_non_space) { + const uint8_t *cursor = parser->start + parser->newline_list.offsets[newline_index]; + const uint8_t *end = token->start; + + // Skip over the BOM if it is present. + if ( + newline_index == 0 && + parser->start[0] == 0xef && + parser->start[1] == 0xbb && + parser->start[2] == 0xbf + ) cursor += 3; + + int64_t column = 0; + for (; cursor < end; cursor++) { + switch (*cursor) { + case '\t': + column = ((column / PM_TAB_WHITESPACE_SIZE) + 1) * PM_TAB_WHITESPACE_SIZE; + break; + case ' ': + column++; + break; + default: + column++; + if (break_on_non_space) return -1; + break; + } + } + + return column; +} + +/** + * Accepts a parser, the newline index and the token of the opening of a + * construct, and two flags. The closing token is read from the current token + * of the parser. This function warns if the indentation of the two tokens + * does not match. + */ +static void +parser_warn_indentation_mismatch(pm_parser_t *parser, size_t opening_newline_index, const pm_token_t *opening_token, bool if_after_else, bool allow_indent) { + // If these warnings are disabled (unlikely), then we can just return. 
+ if (!parser->warn_mismatched_indentation) return; + + // If the tokens are on the same line, we do not warn. + size_t closing_newline_index = token_newline_index(parser); + if (opening_newline_index == closing_newline_index) return; + + // If the opening token has anything other than spaces or tabs before it, + // then we do not warn. This is unless we are matching up an `if`/`end` pair + // and the `if` immediately follows an `else` keyword. + int64_t opening_column = token_column(parser, opening_newline_index, opening_token, !if_after_else); + if (!if_after_else && (opening_column == -1)) return; + + // Get a reference to the closing token off the current parser. This assumes + // that the caller has placed this in the correct position. + pm_token_t *closing_token = &parser->current; + + // If the tokens are at the same indentation, we do not warn. + int64_t closing_column = token_column(parser, closing_newline_index, closing_token, true); + if ((closing_column == -1) || (opening_column == closing_column)) return; + + // If the closing column is greater than the opening column and we are + // allowing indentation, then we do not warn. + if (allow_indent && (closing_column > opening_column)) return; + + // Otherwise, add a warning. + PM_PARSER_WARN_FORMAT( + parser, + closing_token->start, + closing_token->end, + PM_WARN_INDENTATION_MISMATCH, + (int) (closing_token->end - closing_token->start), + (const char *) closing_token->start, + (int) (opening_token->end - opening_token->start), + (const char *) opening_token->start, + ((int32_t) opening_newline_index) + parser->start_line + ); +} + +typedef enum { + PM_RESCUES_BEGIN = 1, + PM_RESCUES_BLOCK, + PM_RESCUES_CLASS, + PM_RESCUES_DEF, + PM_RESCUES_LAMBDA, + PM_RESCUES_MODULE, + PM_RESCUES_SCLASS +} pm_rescues_type_t; + +/** + * Parse any number of rescue clauses. This will form a linked list of if + * nodes pointing to each other from the top. 
+ */ +static inline void +parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_t *opening, pm_begin_node_t *parent_node, pm_rescues_type_t type, uint16_t depth) { + pm_rescue_node_t *current = NULL; + + while (match1(parser, PM_TOKEN_KEYWORD_RESCUE)) { + if (opening != NULL) parser_warn_indentation_mismatch(parser, opening_newline_index, opening, false, false); + parser_lex(parser); + + pm_rescue_node_t *rescue = pm_rescue_node_create(parser, &parser->previous); + + switch (parser->current.type) { + case PM_TOKEN_EQUAL_GREATER: { + // Here we have an immediate => after the rescue keyword, in which case + // we're going to have an empty list of exceptions to rescue (which + // implies StandardError). + parser_lex(parser); + pm_rescue_node_operator_set(rescue, &parser->previous); + + pm_node_t *reference = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_RESCUE_VARIABLE, (uint16_t) (depth + 1)); + reference = parse_target(parser, reference, false, false); + + pm_rescue_node_reference_set(rescue, reference); + break; + } + case PM_TOKEN_NEWLINE: + case PM_TOKEN_SEMICOLON: + case PM_TOKEN_KEYWORD_THEN: + // Here we have a terminator for the rescue keyword, in which + // case we're going to just continue on. + break; + default: { + if (token_begins_expression_p(parser->current.type) || match1(parser, PM_TOKEN_USTAR)) { + // Here we have something that could be an exception expression, so + // we'll attempt to parse it here and any others delimited by commas. + + do { + pm_node_t *expression = parse_starred_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_RESCUE_EXPRESSION, (uint16_t) (depth + 1)); + pm_rescue_node_exceptions_append(rescue, expression); + + // If we hit a newline, then this is the end of the rescue expression. We + // can continue on to parse the statements. 
+ if (match3(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_TOKEN_KEYWORD_THEN)) break; + + // If we hit a `=>` then we're going to parse the exception variable. Once + // we've done that, we'll break out of the loop and parse the statements. + if (accept1(parser, PM_TOKEN_EQUAL_GREATER)) { + pm_rescue_node_operator_set(rescue, &parser->previous); + + pm_node_t *reference = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_RESCUE_VARIABLE, (uint16_t) (depth + 1)); + reference = parse_target(parser, reference, false, false); + + pm_rescue_node_reference_set(rescue, reference); + break; + } + } while (accept1(parser, PM_TOKEN_COMMA)); + } + } + } + + if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) { + rescue->then_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(&parser->previous); + } + } else { + expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_RESCUE_TERM); + rescue->then_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(&parser->previous); + } + + if (!match3(parser, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + pm_context_t context; + + switch (type) { + case PM_RESCUES_BEGIN: context = PM_CONTEXT_BEGIN_RESCUE; break; + case PM_RESCUES_BLOCK: context = PM_CONTEXT_BLOCK_RESCUE; break; + case PM_RESCUES_CLASS: context = PM_CONTEXT_CLASS_RESCUE; break; + case PM_RESCUES_DEF: context = PM_CONTEXT_DEF_RESCUE; break; + case PM_RESCUES_LAMBDA: context = PM_CONTEXT_LAMBDA_RESCUE; break; + case PM_RESCUES_MODULE: context = PM_CONTEXT_MODULE_RESCUE; break; + case PM_RESCUES_SCLASS: context = PM_CONTEXT_SCLASS_RESCUE; break; + default: assert(false && "unreachable"); context = PM_CONTEXT_BEGIN_RESCUE; break; + } + + pm_statements_node_t *statements = parse_statements(parser, context, (uint16_t) (depth + 1)); + if (statements != NULL) pm_rescue_node_statements_set(rescue, statements); + + pm_accepts_block_stack_pop(parser); + 
accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + } + + if (current == NULL) { + pm_begin_node_rescue_clause_set(parent_node, rescue); + } else { + pm_rescue_node_subsequent_set(current, rescue); + } + + current = rescue; + } + + // The end node locations on rescue nodes will not be set correctly + // since we won't know the end until we've found all subsequent + // clauses. This sets the end location on all rescues once we know it. + if (current != NULL) { + const uint8_t *end_to_set = current->base.location.end; + pm_rescue_node_t *clause = parent_node->rescue_clause; + + while (clause != NULL) { + clause->base.location.end = end_to_set; + clause = clause->subsequent; + } + } + + pm_token_t else_keyword; + if (match1(parser, PM_TOKEN_KEYWORD_ELSE)) { + if (opening != NULL) parser_warn_indentation_mismatch(parser, opening_newline_index, opening, false, false); + opening_newline_index = token_newline_index(parser); + + else_keyword = parser->current; + opening = &else_keyword; + + parser_lex(parser); + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + + pm_statements_node_t *else_statements = NULL; + if (!match2(parser, PM_TOKEN_KEYWORD_END, PM_TOKEN_KEYWORD_ENSURE)) { + pm_accepts_block_stack_push(parser, true); + pm_context_t context; + + switch (type) { + case PM_RESCUES_BEGIN: context = PM_CONTEXT_BEGIN_ELSE; break; + case PM_RESCUES_BLOCK: context = PM_CONTEXT_BLOCK_ELSE; break; + case PM_RESCUES_CLASS: context = PM_CONTEXT_CLASS_ELSE; break; + case PM_RESCUES_DEF: context = PM_CONTEXT_DEF_ELSE; break; + case PM_RESCUES_LAMBDA: context = PM_CONTEXT_LAMBDA_ELSE; break; + case PM_RESCUES_MODULE: context = PM_CONTEXT_MODULE_ELSE; break; + case PM_RESCUES_SCLASS: context = PM_CONTEXT_SCLASS_ELSE; break; + default: assert(false && "unreachable"); context = PM_CONTEXT_BEGIN_ELSE; break; + } + + else_statements = parse_statements(parser, context, (uint16_t) (depth + 1)); + pm_accepts_block_stack_pop(parser); + + accept2(parser, PM_TOKEN_NEWLINE, 
PM_TOKEN_SEMICOLON); + } + + pm_else_node_t *else_clause = pm_else_node_create(parser, &else_keyword, else_statements, &parser->current); + pm_begin_node_else_clause_set(parent_node, else_clause); + + // If we don't have a `current` rescue node, then this is a dangling + // else, and it's an error. + if (current == NULL) pm_parser_err_node(parser, UP(else_clause), PM_ERR_BEGIN_LONELY_ELSE); + } + + if (match1(parser, PM_TOKEN_KEYWORD_ENSURE)) { + if (opening != NULL) parser_warn_indentation_mismatch(parser, opening_newline_index, opening, false, false); + pm_token_t ensure_keyword = parser->current; + + parser_lex(parser); + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + + pm_statements_node_t *ensure_statements = NULL; + if (!match1(parser, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + pm_context_t context; + + switch (type) { + case PM_RESCUES_BEGIN: context = PM_CONTEXT_BEGIN_ENSURE; break; + case PM_RESCUES_BLOCK: context = PM_CONTEXT_BLOCK_ENSURE; break; + case PM_RESCUES_CLASS: context = PM_CONTEXT_CLASS_ENSURE; break; + case PM_RESCUES_DEF: context = PM_CONTEXT_DEF_ENSURE; break; + case PM_RESCUES_LAMBDA: context = PM_CONTEXT_LAMBDA_ENSURE; break; + case PM_RESCUES_MODULE: context = PM_CONTEXT_MODULE_ENSURE; break; + case PM_RESCUES_SCLASS: context = PM_CONTEXT_SCLASS_ENSURE; break; + default: assert(false && "unreachable"); context = PM_CONTEXT_BEGIN_RESCUE; break; + } + + ensure_statements = parse_statements(parser, context, (uint16_t) (depth + 1)); + pm_accepts_block_stack_pop(parser); + + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + } + + pm_ensure_node_t *ensure_clause = pm_ensure_node_create(parser, &ensure_keyword, ensure_statements, &parser->current); + pm_begin_node_ensure_clause_set(parent_node, ensure_clause); + } + + if (match1(parser, PM_TOKEN_KEYWORD_END)) { + if (opening != NULL) parser_warn_indentation_mismatch(parser, opening_newline_index, opening, false, false); + 
pm_begin_node_end_keyword_set(parent_node, &parser->current); + } else { + pm_token_t end_keyword = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + pm_begin_node_end_keyword_set(parent_node, &end_keyword); + } +} + +/** + * Parse a set of rescue clauses with an implicit begin (for example when on a + * class, module, def, etc.). + */ +static pm_begin_node_t * +parse_rescues_implicit_begin(pm_parser_t *parser, size_t opening_newline_index, const pm_token_t *opening, const uint8_t *start, pm_statements_node_t *statements, pm_rescues_type_t type, uint16_t depth) { + pm_token_t begin_keyword = not_provided(parser); + pm_begin_node_t *node = pm_begin_node_create(parser, &begin_keyword, statements); + + parse_rescues(parser, opening_newline_index, opening, node, type, (uint16_t) (depth + 1)); + node->base.location.start = start; + + return node; +} + +/** + * Parse a list of parameters and local on a block definition. + */ +static pm_block_parameters_node_t * +parse_block_parameters( + pm_parser_t *parser, + bool allows_trailing_comma, + const pm_token_t *opening, + bool is_lambda_literal, + bool accepts_blocks_in_defaults, + uint16_t depth +) { + pm_parameters_node_t *parameters = NULL; + if (!match1(parser, PM_TOKEN_SEMICOLON)) { + if (!is_lambda_literal) { + context_push(parser, PM_CONTEXT_BLOCK_PARAMETERS); + } + parameters = parse_parameters( + parser, + is_lambda_literal ? 
PM_BINDING_POWER_DEFINED : PM_BINDING_POWER_INDEX, + false, + allows_trailing_comma, + false, + accepts_blocks_in_defaults, + true, + (uint16_t) (depth + 1) + ); + if (!is_lambda_literal) { + context_pop(parser); + } + } + + pm_block_parameters_node_t *block_parameters = pm_block_parameters_node_create(parser, parameters, opening); + if ((opening->type != PM_TOKEN_NOT_PROVIDED)) { + accept1(parser, PM_TOKEN_NEWLINE); + + if (accept1(parser, PM_TOKEN_SEMICOLON)) { + do { + switch (parser->current.type) { + case PM_TOKEN_CONSTANT: + pm_parser_err_current(parser, PM_ERR_ARGUMENT_FORMAL_CONSTANT); + parser_lex(parser); + break; + case PM_TOKEN_INSTANCE_VARIABLE: + pm_parser_err_current(parser, PM_ERR_ARGUMENT_FORMAL_IVAR); + parser_lex(parser); + break; + case PM_TOKEN_GLOBAL_VARIABLE: + pm_parser_err_current(parser, PM_ERR_ARGUMENT_FORMAL_GLOBAL); + parser_lex(parser); + break; + case PM_TOKEN_CLASS_VARIABLE: + pm_parser_err_current(parser, PM_ERR_ARGUMENT_FORMAL_CLASS); + parser_lex(parser); + break; + default: + expect1(parser, PM_TOKEN_IDENTIFIER, PM_ERR_BLOCK_PARAM_LOCAL_VARIABLE); + break; + } + + bool repeated = pm_parser_parameter_name_check(parser, &parser->previous); + pm_parser_local_add_token(parser, &parser->previous, 1); + + pm_block_local_variable_node_t *local = pm_block_local_variable_node_create(parser, &parser->previous); + if (repeated) pm_node_flag_set_repeated_parameter(UP(local)); + + pm_block_parameters_node_append_local(block_parameters, local); + } while (accept1(parser, PM_TOKEN_COMMA)); + } + } + + return block_parameters; +} + +/** + * Return true if any of the visible scopes to the current context are using + * numbered parameters. 
+ */ +static bool +outer_scope_using_numbered_parameters_p(pm_parser_t *parser) { + for (pm_scope_t *scope = parser->current_scope->previous; scope != NULL && !scope->closed; scope = scope->previous) { + if (scope->parameters & PM_SCOPE_PARAMETERS_NUMBERED_FOUND) return true; + } + + return false; +} + +/** + * These are the names of the various numbered parameters. We have them here so + * that when we insert them into the constant pool we can use a constant string + * and not have to allocate. + */ +static const char * const pm_numbered_parameter_names[] = { + "_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9" +}; + +/** + * Return the node that should be used in the parameters field of a block-like + * (block or lambda) node, depending on the kind of parameters that were + * declared in the current scope. + */ +static pm_node_t * +parse_blocklike_parameters(pm_parser_t *parser, pm_node_t *parameters, const pm_token_t *opening, const pm_token_t *closing) { + pm_node_list_t *implicit_parameters = &parser->current_scope->implicit_parameters; + + // If we have ordinary parameters, then we will return them as the set of + // parameters. + if (parameters != NULL) { + // If we also have implicit parameters, then this is an error. + if (implicit_parameters->size > 0) { + pm_node_t *node = implicit_parameters->nodes[0]; + + if (PM_NODE_TYPE_P(node, PM_LOCAL_VARIABLE_READ_NODE)) { + pm_parser_err_node(parser, node, PM_ERR_NUMBERED_PARAMETER_ORDINARY); + } else if (PM_NODE_TYPE_P(node, PM_IT_LOCAL_VARIABLE_READ_NODE)) { + pm_parser_err_node(parser, node, PM_ERR_IT_NOT_ALLOWED_ORDINARY); + } else { + assert(false && "unreachable"); + } + } + + return parameters; + } + + // If we don't have any implicit parameters, then the set of parameters is + // NULL. + if (implicit_parameters->size == 0) { + return NULL; + } + + // If we don't have ordinary parameters, then we now must validate our set + // of implicit parameters. 
We can only have numbered parameters or it, but + // they cannot be mixed. + uint8_t numbered_parameter = 0; + bool it_parameter = false; + + for (size_t index = 0; index < implicit_parameters->size; index++) { + pm_node_t *node = implicit_parameters->nodes[index]; + + if (PM_NODE_TYPE_P(node, PM_LOCAL_VARIABLE_READ_NODE)) { + if (it_parameter) { + pm_parser_err_node(parser, node, PM_ERR_NUMBERED_PARAMETER_IT); + } else if (outer_scope_using_numbered_parameters_p(parser)) { + pm_parser_err_node(parser, node, PM_ERR_NUMBERED_PARAMETER_OUTER_BLOCK); + } else if (parser->current_scope->parameters & PM_SCOPE_PARAMETERS_NUMBERED_INNER) { + pm_parser_err_node(parser, node, PM_ERR_NUMBERED_PARAMETER_INNER_BLOCK); + } else if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) { + numbered_parameter = MAX(numbered_parameter, (uint8_t) (node->location.start[1] - '0')); + } else { + assert(false && "unreachable"); + } + } else if (PM_NODE_TYPE_P(node, PM_IT_LOCAL_VARIABLE_READ_NODE)) { + if (numbered_parameter > 0) { + pm_parser_err_node(parser, node, PM_ERR_IT_NOT_ALLOWED_NUMBERED); + } else { + it_parameter = true; + } + } + } + + if (numbered_parameter > 0) { + // Go through the parent scopes and mark them as being disallowed from + // using numbered parameters because this inner scope is using them. + for (pm_scope_t *scope = parser->current_scope->previous; scope != NULL && !scope->closed; scope = scope->previous) { + scope->parameters |= PM_SCOPE_PARAMETERS_NUMBERED_INNER; + } + + const pm_location_t location = { .start = opening->start, .end = closing->end }; + return UP(pm_numbered_parameters_node_create(parser, &location, numbered_parameter)); + } + + if (it_parameter) { + return UP(pm_it_parameters_node_create(parser, opening, closing)); + } + + return NULL; +} + +/** + * Parse a block. 
+ */ +static pm_block_node_t * +parse_block(pm_parser_t *parser, uint16_t depth) { + pm_token_t opening = parser->previous; + accept1(parser, PM_TOKEN_NEWLINE); + + pm_accepts_block_stack_push(parser, true); + pm_parser_scope_push(parser, false); + + pm_block_parameters_node_t *block_parameters = NULL; + + if (accept1(parser, PM_TOKEN_PIPE)) { + pm_token_t block_parameters_opening = parser->previous; + if (match1(parser, PM_TOKEN_PIPE)) { + block_parameters = pm_block_parameters_node_create(parser, NULL, &block_parameters_opening); + parser->command_start = true; + parser_lex(parser); + } else { + block_parameters = parse_block_parameters(parser, true, &block_parameters_opening, false, true, (uint16_t) (depth + 1)); + accept1(parser, PM_TOKEN_NEWLINE); + parser->command_start = true; + expect1(parser, PM_TOKEN_PIPE, PM_ERR_BLOCK_PARAM_PIPE_TERM); + } + + pm_block_parameters_node_closing_set(block_parameters, &parser->previous); + } + + accept1(parser, PM_TOKEN_NEWLINE); + pm_node_t *statements = NULL; + + if (opening.type == PM_TOKEN_BRACE_LEFT) { + if (!match1(parser, PM_TOKEN_BRACE_RIGHT)) { + statements = UP(parse_statements(parser, PM_CONTEXT_BLOCK_BRACES, (uint16_t) (depth + 1))); + } + + expect1_opening(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_BLOCK_TERM_BRACE, &opening); + } else { + if (!match1(parser, PM_TOKEN_KEYWORD_END)) { + if (!match3(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_ENSURE)) { + pm_accepts_block_stack_push(parser, true); + statements = UP(parse_statements(parser, PM_CONTEXT_BLOCK_KEYWORDS, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { + assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); + statements = UP(parse_rescues_implicit_begin(parser, 0, NULL, opening.start, (pm_statements_node_t *) statements, PM_RESCUES_BLOCK, (uint16_t) (depth + 1))); + } + } + + expect1_opening(parser, 
PM_TOKEN_KEYWORD_END, PM_ERR_BLOCK_TERM_END, &opening); + } + + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, pm_parser_scope_toplevel_p(parser)); + pm_node_t *parameters = parse_blocklike_parameters(parser, UP(block_parameters), &opening, &parser->previous); + + pm_parser_scope_pop(parser); + pm_accepts_block_stack_pop(parser); + + return pm_block_node_create(parser, &locals, &opening, parameters, statements, &parser->previous); +} + +/** + * Parse a list of arguments and their surrounding parentheses if they are + * present. It returns true if it found any pieces of arguments (parentheses, + * arguments, or blocks). + */ +static bool +parse_arguments_list(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_block, bool accepts_command_call, uint16_t depth) { + bool found = false; + + if (accept1(parser, PM_TOKEN_PARENTHESIS_LEFT)) { + found |= true; + arguments->opening_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); + + if (accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + arguments->closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); + } else { + pm_accepts_block_stack_push(parser, true); + parse_arguments(parser, arguments, accepts_block, PM_TOKEN_PARENTHESIS_RIGHT, (uint16_t) (depth + 1)); + + if (!accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_ARGUMENT_TERM_PAREN, pm_token_type_human(parser->current.type)); + parser->previous.start = parser->previous.end; + parser->previous.type = PM_TOKEN_MISSING; + } + + pm_accepts_block_stack_pop(parser); + arguments->closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); + } + } else if (accepts_command_call && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR, PM_TOKEN_UAMPERSAND)) && !match1(parser, PM_TOKEN_BRACE_LEFT)) { + found |= true; + pm_accepts_block_stack_push(parser, false); + + // If we get here, then the subsequent token cannot 
be used as an infix + // operator. In this case we assume the subsequent token is part of an + // argument to this method call. + parse_arguments(parser, arguments, accepts_block, PM_TOKEN_EOF, (uint16_t) (depth + 1)); + + // If we have done with the arguments and still not consumed the comma, + // then we have a trailing comma where we need to check whether it is + // allowed or not. + if (parser->previous.type == PM_TOKEN_COMMA && !match1(parser, PM_TOKEN_SEMICOLON)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_EXPECT_ARGUMENT, pm_token_type_human(parser->current.type)); + } + + pm_accepts_block_stack_pop(parser); + } + + // If we're at the end of the arguments, we can now check if there is a block + // node that starts with a {. If there is, then we can parse it and add it to + // the arguments. + if (accepts_block) { + pm_block_node_t *block = NULL; + + if (accept1(parser, PM_TOKEN_BRACE_LEFT)) { + found |= true; + block = parse_block(parser, (uint16_t) (depth + 1)); + pm_arguments_validate_block(parser, arguments, block); + } else if (pm_accepts_block_stack_p(parser) && accept1(parser, PM_TOKEN_KEYWORD_DO)) { + found |= true; + block = parse_block(parser, (uint16_t) (depth + 1)); + } + + if (block != NULL) { + if (arguments->block == NULL && !arguments->has_forwarding) { + arguments->block = UP(block); + } else { + pm_parser_err_node(parser, UP(block), PM_ERR_ARGUMENT_BLOCK_MULTI); + + if (arguments->block != NULL) { + if (arguments->arguments == NULL) { + arguments->arguments = pm_arguments_node_create(parser); + } + pm_arguments_node_arguments_append(arguments->arguments, arguments->block); + } + arguments->block = UP(block); + } + } + } + + return found; +} + +/** + * Check that the return is allowed in the current context. If it isn't, add an + * error to the parser. 
+ */ +static void +parse_return(pm_parser_t *parser, pm_node_t *node) { + bool in_sclass = false; + for (pm_context_node_t *context_node = parser->current_context; context_node != NULL; context_node = context_node->prev) { + switch (context_node->context) { + case PM_CONTEXT_BEGIN_ELSE: + case PM_CONTEXT_BEGIN_ENSURE: + case PM_CONTEXT_BEGIN_RESCUE: + case PM_CONTEXT_BEGIN: + case PM_CONTEXT_CASE_IN: + case PM_CONTEXT_CASE_WHEN: + case PM_CONTEXT_DEFAULT_PARAMS: + case PM_CONTEXT_DEFINED: + case PM_CONTEXT_ELSE: + case PM_CONTEXT_ELSIF: + case PM_CONTEXT_EMBEXPR: + case PM_CONTEXT_FOR_INDEX: + case PM_CONTEXT_FOR: + case PM_CONTEXT_IF: + case PM_CONTEXT_LOOP_PREDICATE: + case PM_CONTEXT_MAIN: + case PM_CONTEXT_MULTI_TARGET: + case PM_CONTEXT_PARENS: + case PM_CONTEXT_POSTEXE: + case PM_CONTEXT_PREDICATE: + case PM_CONTEXT_PREEXE: + case PM_CONTEXT_RESCUE_MODIFIER: + case PM_CONTEXT_TERNARY: + case PM_CONTEXT_UNLESS: + case PM_CONTEXT_UNTIL: + case PM_CONTEXT_WHILE: + // Keep iterating up the lists of contexts, because returns can + // see through these. + continue; + case PM_CONTEXT_SCLASS_ELSE: + case PM_CONTEXT_SCLASS_ENSURE: + case PM_CONTEXT_SCLASS_RESCUE: + case PM_CONTEXT_SCLASS: + in_sclass = true; + continue; + case PM_CONTEXT_CLASS_ELSE: + case PM_CONTEXT_CLASS_ENSURE: + case PM_CONTEXT_CLASS_RESCUE: + case PM_CONTEXT_CLASS: + case PM_CONTEXT_MODULE_ELSE: + case PM_CONTEXT_MODULE_ENSURE: + case PM_CONTEXT_MODULE_RESCUE: + case PM_CONTEXT_MODULE: + // These contexts are invalid for a return. 
+ pm_parser_err_node(parser, node, PM_ERR_RETURN_INVALID); + return; + case PM_CONTEXT_BLOCK_BRACES: + case PM_CONTEXT_BLOCK_ELSE: + case PM_CONTEXT_BLOCK_ENSURE: + case PM_CONTEXT_BLOCK_KEYWORDS: + case PM_CONTEXT_BLOCK_RESCUE: + case PM_CONTEXT_BLOCK_PARAMETERS: + case PM_CONTEXT_DEF_ELSE: + case PM_CONTEXT_DEF_ENSURE: + case PM_CONTEXT_DEF_PARAMS: + case PM_CONTEXT_DEF_RESCUE: + case PM_CONTEXT_DEF: + case PM_CONTEXT_LAMBDA_BRACES: + case PM_CONTEXT_LAMBDA_DO_END: + case PM_CONTEXT_LAMBDA_ELSE: + case PM_CONTEXT_LAMBDA_ENSURE: + case PM_CONTEXT_LAMBDA_RESCUE: + // These contexts are valid for a return, and we should not + // continue to loop. + return; + case PM_CONTEXT_NONE: + // This case should never happen. + assert(false && "unreachable"); + break; + } + } + if (in_sclass && parser->version >= PM_OPTIONS_VERSION_CRUBY_3_4) { + pm_parser_err_node(parser, node, PM_ERR_RETURN_INVALID); + } +} + +/** + * Check that the block exit (next, break, redo) is allowed in the current + * context. If it isn't, add an error to the parser. + */ +static void +parse_block_exit(pm_parser_t *parser, pm_node_t *node) { + for (pm_context_node_t *context_node = parser->current_context; context_node != NULL; context_node = context_node->prev) { + switch (context_node->context) { + case PM_CONTEXT_BLOCK_BRACES: + case PM_CONTEXT_BLOCK_KEYWORDS: + case PM_CONTEXT_BLOCK_ELSE: + case PM_CONTEXT_BLOCK_ENSURE: + case PM_CONTEXT_BLOCK_PARAMETERS: + case PM_CONTEXT_BLOCK_RESCUE: + case PM_CONTEXT_DEFINED: + case PM_CONTEXT_FOR: + case PM_CONTEXT_LAMBDA_BRACES: + case PM_CONTEXT_LAMBDA_DO_END: + case PM_CONTEXT_LAMBDA_ELSE: + case PM_CONTEXT_LAMBDA_ENSURE: + case PM_CONTEXT_LAMBDA_RESCUE: + case PM_CONTEXT_LOOP_PREDICATE: + case PM_CONTEXT_POSTEXE: + case PM_CONTEXT_UNTIL: + case PM_CONTEXT_WHILE: + // These are the good cases. We're allowed to have a block exit + // in these contexts. 
+ return; + case PM_CONTEXT_DEF: + case PM_CONTEXT_DEF_PARAMS: + case PM_CONTEXT_DEF_ELSE: + case PM_CONTEXT_DEF_ENSURE: + case PM_CONTEXT_DEF_RESCUE: + case PM_CONTEXT_MAIN: + case PM_CONTEXT_PREEXE: + case PM_CONTEXT_SCLASS: + case PM_CONTEXT_SCLASS_ELSE: + case PM_CONTEXT_SCLASS_ENSURE: + case PM_CONTEXT_SCLASS_RESCUE: + // These are the bad cases. We're not allowed to have a block + // exit in these contexts. + // + // If we get here, then we're about to mark this block exit + // as invalid. However, it could later _become_ valid if we + // find a trailing while/until on the expression. In this + // case instead of adding the error here, we'll add the + // block exit to the list of exits for the expression, and + // the node parsing will handle validating it instead. + assert(parser->current_block_exits != NULL); + pm_node_list_append(parser->current_block_exits, node); + return; + case PM_CONTEXT_BEGIN_ELSE: + case PM_CONTEXT_BEGIN_ENSURE: + case PM_CONTEXT_BEGIN_RESCUE: + case PM_CONTEXT_BEGIN: + case PM_CONTEXT_CASE_IN: + case PM_CONTEXT_CASE_WHEN: + case PM_CONTEXT_CLASS_ELSE: + case PM_CONTEXT_CLASS_ENSURE: + case PM_CONTEXT_CLASS_RESCUE: + case PM_CONTEXT_CLASS: + case PM_CONTEXT_DEFAULT_PARAMS: + case PM_CONTEXT_ELSE: + case PM_CONTEXT_ELSIF: + case PM_CONTEXT_EMBEXPR: + case PM_CONTEXT_FOR_INDEX: + case PM_CONTEXT_IF: + case PM_CONTEXT_MODULE_ELSE: + case PM_CONTEXT_MODULE_ENSURE: + case PM_CONTEXT_MODULE_RESCUE: + case PM_CONTEXT_MODULE: + case PM_CONTEXT_MULTI_TARGET: + case PM_CONTEXT_PARENS: + case PM_CONTEXT_PREDICATE: + case PM_CONTEXT_RESCUE_MODIFIER: + case PM_CONTEXT_TERNARY: + case PM_CONTEXT_UNLESS: + // In these contexts we should continue walking up the list of + // contexts. + break; + case PM_CONTEXT_NONE: + // This case should never happen. + assert(false && "unreachable"); + break; + } + } +} + +/** + * When we hit an expression that could contain block exits, we need to stash + * the previous set and create a new one. 
+ */ +static pm_node_list_t * +push_block_exits(pm_parser_t *parser, pm_node_list_t *current_block_exits) { + pm_node_list_t *previous_block_exits = parser->current_block_exits; + parser->current_block_exits = current_block_exits; + return previous_block_exits; +} + +/** + * If we did not match a trailing while/until and this was the last chance to do + * so, then all of the block exits in the list are invalid and we need to add an + * error for each of them. + */ +static void +flush_block_exits(pm_parser_t *parser, pm_node_list_t *previous_block_exits) { + pm_node_t *block_exit; + PM_NODE_LIST_FOREACH(parser->current_block_exits, index, block_exit) { + const char *type; + + switch (PM_NODE_TYPE(block_exit)) { + case PM_BREAK_NODE: type = "break"; break; + case PM_NEXT_NODE: type = "next"; break; + case PM_REDO_NODE: type = "redo"; break; + default: assert(false && "unreachable"); type = ""; break; + } + + PM_PARSER_ERR_NODE_FORMAT(parser, block_exit, PM_ERR_INVALID_BLOCK_EXIT, type); + } + + parser->current_block_exits = previous_block_exits; +} + +/** + * Pop the current level of block exits from the parser, and add errors to the + * parser if any of them are deemed to be invalid. + */ +static void +pop_block_exits(pm_parser_t *parser, pm_node_list_t *previous_block_exits) { + if (match2(parser, PM_TOKEN_KEYWORD_WHILE_MODIFIER, PM_TOKEN_KEYWORD_UNTIL_MODIFIER)) { + // If we matched a trailing while/until, then all of the block exits in + // the contained list are valid. In this case we do not need to do + // anything. + parser->current_block_exits = previous_block_exits; + } else if (previous_block_exits != NULL) { + // If we did not matching a trailing while/until, then all of the block + // exits contained in the list are invalid for this specific context. + // However, they could still become valid in a higher level context if + // there is another list above this one. In this case we'll push all of + // the block exits up to the previous list. 
+ pm_node_list_concat(previous_block_exits, parser->current_block_exits); + parser->current_block_exits = previous_block_exits; + } else { + // If we did not match a trailing while/until and this was the last + // chance to do so, then all of the block exits in the list are invalid + // and we need to add an error for each of them. + flush_block_exits(parser, previous_block_exits); + } +} + +static inline pm_node_t * +parse_predicate(pm_parser_t *parser, pm_binding_power_t binding_power, pm_context_t context, pm_token_t *then_keyword, uint16_t depth) { + context_push(parser, PM_CONTEXT_PREDICATE); + pm_diagnostic_id_t error_id = context == PM_CONTEXT_IF ? PM_ERR_CONDITIONAL_IF_PREDICATE : PM_ERR_CONDITIONAL_UNLESS_PREDICATE; + pm_node_t *predicate = parse_value_expression(parser, binding_power, true, false, error_id, (uint16_t) (depth + 1)); + + // Predicates are closed by a term, a "then", or a term and then a "then". + bool predicate_closed = accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + + if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) { + predicate_closed = true; + *then_keyword = parser->previous; + } + + if (!predicate_closed) { + pm_parser_err_current(parser, PM_ERR_CONDITIONAL_PREDICATE_TERM); + } + + context_pop(parser); + return predicate; +} + +static inline pm_node_t * +parse_conditional(pm_parser_t *parser, pm_context_t context, size_t opening_newline_index, bool if_after_else, uint16_t depth) { + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + pm_token_t keyword = parser->previous; + pm_token_t then_keyword = not_provided(parser); + + pm_node_t *predicate = parse_predicate(parser, PM_BINDING_POWER_MODIFIER, context, &then_keyword, (uint16_t) (depth + 1)); + pm_statements_node_t *statements = NULL; + + if (!match3(parser, PM_TOKEN_KEYWORD_ELSIF, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = 
parse_statements(parser, context, (uint16_t) (depth + 1)); + pm_accepts_block_stack_pop(parser); + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + } + + pm_token_t end_keyword = not_provided(parser); + pm_node_t *parent = NULL; + + switch (context) { + case PM_CONTEXT_IF: + parent = UP(pm_if_node_create(parser, &keyword, predicate, &then_keyword, statements, NULL, &end_keyword)); + break; + case PM_CONTEXT_UNLESS: + parent = UP(pm_unless_node_create(parser, &keyword, predicate, &then_keyword, statements)); + break; + default: + assert(false && "unreachable"); + break; + } + + pm_node_t *current = parent; + + // Parse any number of elsif clauses. This will form a linked list of if + // nodes pointing to each other from the top. + if (context == PM_CONTEXT_IF) { + while (match1(parser, PM_TOKEN_KEYWORD_ELSIF)) { + if (parser_end_of_line_p(parser)) { + PM_PARSER_WARN_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_WARN_KEYWORD_EOL); + } + + parser_warn_indentation_mismatch(parser, opening_newline_index, &keyword, false, false); + pm_token_t elsif_keyword = parser->current; + parser_lex(parser); + + pm_node_t *predicate = parse_predicate(parser, PM_BINDING_POWER_MODIFIER, PM_CONTEXT_ELSIF, &then_keyword, (uint16_t) (depth + 1)); + pm_accepts_block_stack_push(parser, true); + + pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_ELSIF, (uint16_t) (depth + 1)); + pm_accepts_block_stack_pop(parser); + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + + pm_node_t *elsif = UP(pm_if_node_create(parser, &elsif_keyword, predicate, &then_keyword, statements, NULL, &end_keyword)); + ((pm_if_node_t *) current)->subsequent = elsif; + current = elsif; + } + } + + if (match1(parser, PM_TOKEN_KEYWORD_ELSE)) { + parser_warn_indentation_mismatch(parser, opening_newline_index, &keyword, false, false); + opening_newline_index = token_newline_index(parser); + + parser_lex(parser); + pm_token_t else_keyword = parser->previous; + + 
pm_accepts_block_stack_push(parser, true); + pm_statements_node_t *else_statements = parse_statements(parser, PM_CONTEXT_ELSE, (uint16_t) (depth + 1)); + pm_accepts_block_stack_pop(parser); + + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + parser_warn_indentation_mismatch(parser, opening_newline_index, &else_keyword, false, false); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CONDITIONAL_TERM_ELSE, &keyword); + + pm_else_node_t *else_node = pm_else_node_create(parser, &else_keyword, else_statements, &parser->previous); + + switch (context) { + case PM_CONTEXT_IF: + ((pm_if_node_t *) current)->subsequent = UP(else_node); + break; + case PM_CONTEXT_UNLESS: + ((pm_unless_node_t *) parent)->else_clause = else_node; + break; + default: + assert(false && "unreachable"); + break; + } + } else { + parser_warn_indentation_mismatch(parser, opening_newline_index, &keyword, if_after_else, false); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CONDITIONAL_TERM, &keyword); + } + + // Set the appropriate end location for all of the nodes in the subtree. 
+ switch (context) { + case PM_CONTEXT_IF: { + pm_node_t *current = parent; + bool recursing = true; + + while (recursing) { + switch (PM_NODE_TYPE(current)) { + case PM_IF_NODE: + pm_if_node_end_keyword_loc_set((pm_if_node_t *) current, &parser->previous); + current = ((pm_if_node_t *) current)->subsequent; + recursing = current != NULL; + break; + case PM_ELSE_NODE: + pm_else_node_end_keyword_loc_set((pm_else_node_t *) current, &parser->previous); + recursing = false; + break; + default: { + recursing = false; + break; + } + } + } + break; + } + case PM_CONTEXT_UNLESS: + pm_unless_node_end_keyword_loc_set((pm_unless_node_t *) parent, &parser->previous); + break; + default: + assert(false && "unreachable"); + break; + } + + pop_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + return parent; +} + +/** + * This macro allows you to define a case statement for all of the keywords. + * It's meant to be used in a switch statement. + */ +#define PM_CASE_KEYWORD PM_TOKEN_KEYWORD___ENCODING__: case PM_TOKEN_KEYWORD___FILE__: case PM_TOKEN_KEYWORD___LINE__: \ + case PM_TOKEN_KEYWORD_ALIAS: case PM_TOKEN_KEYWORD_AND: case PM_TOKEN_KEYWORD_BEGIN: case PM_TOKEN_KEYWORD_BEGIN_UPCASE: \ + case PM_TOKEN_KEYWORD_BREAK: case PM_TOKEN_KEYWORD_CASE: case PM_TOKEN_KEYWORD_CLASS: case PM_TOKEN_KEYWORD_DEF: \ + case PM_TOKEN_KEYWORD_DEFINED: case PM_TOKEN_KEYWORD_DO: case PM_TOKEN_KEYWORD_DO_LOOP: case PM_TOKEN_KEYWORD_ELSE: \ + case PM_TOKEN_KEYWORD_ELSIF: case PM_TOKEN_KEYWORD_END: case PM_TOKEN_KEYWORD_END_UPCASE: case PM_TOKEN_KEYWORD_ENSURE: \ + case PM_TOKEN_KEYWORD_FALSE: case PM_TOKEN_KEYWORD_FOR: case PM_TOKEN_KEYWORD_IF: case PM_TOKEN_KEYWORD_IN: \ + case PM_TOKEN_KEYWORD_MODULE: case PM_TOKEN_KEYWORD_NEXT: case PM_TOKEN_KEYWORD_NIL: case PM_TOKEN_KEYWORD_NOT: \ + case PM_TOKEN_KEYWORD_OR: case PM_TOKEN_KEYWORD_REDO: case PM_TOKEN_KEYWORD_RESCUE: case PM_TOKEN_KEYWORD_RETRY: \ + case PM_TOKEN_KEYWORD_RETURN: case PM_TOKEN_KEYWORD_SELF: case 
PM_TOKEN_KEYWORD_SUPER: case PM_TOKEN_KEYWORD_THEN: \ + case PM_TOKEN_KEYWORD_TRUE: case PM_TOKEN_KEYWORD_UNDEF: case PM_TOKEN_KEYWORD_UNLESS: case PM_TOKEN_KEYWORD_UNTIL: \ + case PM_TOKEN_KEYWORD_WHEN: case PM_TOKEN_KEYWORD_WHILE: case PM_TOKEN_KEYWORD_YIELD + +/** + * This macro allows you to define a case statement for all of the operators. + * It's meant to be used in a switch statement. + */ +#define PM_CASE_OPERATOR PM_TOKEN_AMPERSAND: case PM_TOKEN_BACKTICK: case PM_TOKEN_BANG_EQUAL: \ + case PM_TOKEN_BANG_TILDE: case PM_TOKEN_BANG: case PM_TOKEN_BRACKET_LEFT_RIGHT_EQUAL: \ + case PM_TOKEN_BRACKET_LEFT_RIGHT: case PM_TOKEN_CARET: case PM_TOKEN_EQUAL_EQUAL_EQUAL: case PM_TOKEN_EQUAL_EQUAL: \ + case PM_TOKEN_EQUAL_TILDE: case PM_TOKEN_GREATER_EQUAL: case PM_TOKEN_GREATER_GREATER: case PM_TOKEN_GREATER: \ + case PM_TOKEN_LESS_EQUAL_GREATER: case PM_TOKEN_LESS_EQUAL: case PM_TOKEN_LESS_LESS: case PM_TOKEN_LESS: \ + case PM_TOKEN_MINUS: case PM_TOKEN_PERCENT: case PM_TOKEN_PIPE: case PM_TOKEN_PLUS: case PM_TOKEN_SLASH: \ + case PM_TOKEN_STAR_STAR: case PM_TOKEN_STAR: case PM_TOKEN_TILDE: case PM_TOKEN_UAMPERSAND: case PM_TOKEN_UMINUS: \ + case PM_TOKEN_UMINUS_NUM: case PM_TOKEN_UPLUS: case PM_TOKEN_USTAR: case PM_TOKEN_USTAR_STAR + +/** + * This macro allows you to define a case statement for all of the token types + * that represent the beginning of nodes that are "primitives" in a pattern + * matching expression. 
 */
#define PM_CASE_PRIMITIVE PM_TOKEN_INTEGER: case PM_TOKEN_INTEGER_IMAGINARY: case PM_TOKEN_INTEGER_RATIONAL: \
    case PM_TOKEN_INTEGER_RATIONAL_IMAGINARY: case PM_TOKEN_FLOAT: case PM_TOKEN_FLOAT_IMAGINARY: \
    case PM_TOKEN_FLOAT_RATIONAL: case PM_TOKEN_FLOAT_RATIONAL_IMAGINARY: case PM_TOKEN_SYMBOL_BEGIN: \
    case PM_TOKEN_REGEXP_BEGIN: case PM_TOKEN_BACKTICK: case PM_TOKEN_PERCENT_LOWER_X: case PM_TOKEN_PERCENT_LOWER_I: \
    case PM_TOKEN_PERCENT_LOWER_W: case PM_TOKEN_PERCENT_UPPER_I: case PM_TOKEN_PERCENT_UPPER_W: \
    case PM_TOKEN_STRING_BEGIN: case PM_TOKEN_KEYWORD_NIL: case PM_TOKEN_KEYWORD_SELF: case PM_TOKEN_KEYWORD_TRUE: \
    case PM_TOKEN_KEYWORD_FALSE: case PM_TOKEN_KEYWORD___FILE__: case PM_TOKEN_KEYWORD___LINE__: \
    case PM_TOKEN_KEYWORD___ENCODING__: case PM_TOKEN_MINUS_GREATER: case PM_TOKEN_HEREDOC_START: \
    case PM_TOKEN_UMINUS_NUM: case PM_TOKEN_CHARACTER_LITERAL

/**
 * This macro allows you to define a case statement for all of the token types
 * that could begin a parameter. Like the other PM_CASE_* macros it expands to
 * a `TOKEN: case TOKEN: ... case TOKEN` list, so the leading `case` and the
 * trailing `:` are written at the use site.
 */
#define PM_CASE_PARAMETER PM_TOKEN_UAMPERSAND: case PM_TOKEN_AMPERSAND: case PM_TOKEN_UDOT_DOT_DOT: \
    case PM_TOKEN_IDENTIFIER: case PM_TOKEN_LABEL: case PM_TOKEN_USTAR: case PM_TOKEN_STAR: case PM_TOKEN_STAR_STAR: \
    case PM_TOKEN_USTAR_STAR: case PM_TOKEN_CONSTANT: case PM_TOKEN_INSTANCE_VARIABLE: case PM_TOKEN_GLOBAL_VARIABLE: \
    case PM_TOKEN_CLASS_VARIABLE

/**
 * This macro allows you to define a case statement for all of the nodes that
 * can be transformed into write targets.
 */
#define PM_CASE_WRITABLE PM_CLASS_VARIABLE_READ_NODE: case PM_CONSTANT_PATH_NODE: \
    case PM_CONSTANT_READ_NODE: case PM_GLOBAL_VARIABLE_READ_NODE: case PM_LOCAL_VARIABLE_READ_NODE: \
    case PM_INSTANCE_VARIABLE_READ_NODE: case PM_MULTI_TARGET_NODE: case PM_BACK_REFERENCE_READ_NODE: \
    case PM_NUMBERED_REFERENCE_READ_NODE: case PM_IT_LOCAL_VARIABLE_READ_NODE

// Assert here that the flags are the same so that we can safely switch the type
// of the node without having to move the flags.
PM_STATIC_ASSERT(__LINE__, ((int) PM_STRING_FLAGS_FORCED_UTF8_ENCODING) == ((int) PM_ENCODING_FLAGS_FORCED_UTF8_ENCODING), "Expected the flags to match.");

/**
 * If the encoding was explicitly set through the lexing process, then we need
 * to potentially mark the string's flags to indicate how to encode it.
 *
 * Returns PM_STRING_FLAGS_FORCED_UTF8_ENCODING, 
 * PM_STRING_FLAGS_FORCED_BINARY_ENCODING, or 0 when no flag applies.
 */
static inline pm_node_flags_t
parse_unescaped_encoding(const pm_parser_t *parser) {
    if (parser->explicit_encoding != NULL) {
        if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
            // If there's an explicit encoding and it's using a UTF-8 escape
            // sequence, then mark the string as UTF-8.
            return PM_STRING_FLAGS_FORCED_UTF8_ENCODING;
        } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
            // If there's a non-UTF-8 escape sequence being used, then the
            // string uses the source encoding, unless the source is marked as
            // US-ASCII. In that case the string is forced as ASCII-8BIT in
            // order to keep the string valid.
            return PM_STRING_FLAGS_FORCED_BINARY_ENCODING;
        }
    }
    return 0;
}

/**
 * Parse a node that is part of a string. If the subsequent tokens cannot be
 * parsed as a string part, then NULL is returned (after consuming the
 * offending token and reporting PM_ERR_CANNOT_PARSE_STRING_PART).
 */
static pm_node_t *
parse_string_part(pm_parser_t *parser, uint16_t depth) {
    switch (parser->current.type) {
        // Here the lexer has returned to us plain string content. In this case
        // we'll create a string node that has no opening or closing and return that
        // as the part. These kinds of parts look like:
        //
        //     "aaa #{bbb} #@ccc ddd"
        //      ^^^^      ^     ^^^^
        case PM_TOKEN_STRING_CONTENT: {
            pm_token_t opening = not_provided(parser);
            pm_token_t closing = not_provided(parser);

            pm_node_t *node = UP(pm_string_node_create_current_string(parser, &opening, &parser->current, &closing));
            pm_node_flag_set(node, parse_unescaped_encoding(parser));

            parser_lex(parser);
            return node;
        }
        // Here the lexer has returned the beginning of an embedded expression. In
        // that case we'll parse the inner statements and return that as the part.
        // These kinds of parts look like:
        //
        //     "aaa #{bbb} #@ccc ddd"
        //          ^^^^^^
        case PM_TOKEN_EMBEXPR_BEGIN: {
            // Ruby disallows seeing encoding around interpolation in strings,
            // even though it is known at parse time.
            parser->explicit_encoding = NULL;

            // Save the lex state and brace nesting so that they can be
            // restored once the embedded statements have been parsed.
            pm_lex_state_t state = parser->lex_state;
            int brace_nesting = parser->brace_nesting;

            parser->brace_nesting = 0;
            lex_state_set(parser, PM_LEX_STATE_BEG);
            parser_lex(parser);

            pm_token_t opening = parser->previous;
            pm_statements_node_t *statements = NULL;

            if (!match1(parser, PM_TOKEN_EMBEXPR_END)) {
                pm_accepts_block_stack_push(parser, true);
                statements = parse_statements(parser, PM_CONTEXT_EMBEXPR, (uint16_t) (depth + 1));
                pm_accepts_block_stack_pop(parser);
            }

            parser->brace_nesting = brace_nesting;
            lex_state_set(parser, state);

            expect1(parser, PM_TOKEN_EMBEXPR_END, PM_ERR_EMBEXPR_END);
            pm_token_t closing = parser->previous;

            // If this set of embedded statements only contains a single
            // statement, then Ruby does not consider it as a possible statement
            // that could emit a line event.
            if (statements != NULL && statements->body.size == 1) {
                pm_node_flag_unset(statements->body.nodes[0], PM_NODE_FLAG_NEWLINE);
            }

            return UP(pm_embedded_statements_node_create(parser, &opening, statements, &closing));
        }

        // Here the lexer has returned the beginning of an embedded variable.
        // In that case we'll parse the variable and create an appropriate node
        // for it and then return that node. These kinds of parts look like:
        //
        //     "aaa #{bbb} #@ccc ddd"
        //                 ^^^^^
        case PM_TOKEN_EMBVAR: {
            // Ruby disallows seeing encoding around interpolation in strings,
            // even though it is known at parse time.
            parser->explicit_encoding = NULL;

            lex_state_set(parser, PM_LEX_STATE_BEG);
            parser_lex(parser);

            pm_token_t operator = parser->previous;
            pm_node_t *variable;

            switch (parser->current.type) {
                // In this case a back reference is being interpolated. We'll
                // create a back reference read node.
                case PM_TOKEN_BACK_REFERENCE:
                    parser_lex(parser);
                    variable = UP(pm_back_reference_read_node_create(parser, &parser->previous));
                    break;
                // In this case an nth reference is being interpolated. We'll
                // create a numbered reference read node.
                case PM_TOKEN_NUMBERED_REFERENCE:
                    parser_lex(parser);
                    variable = UP(pm_numbered_reference_read_node_create(parser, &parser->previous));
                    break;
                // In this case a global variable is being interpolated. We'll
                // create a global variable read node.
                case PM_TOKEN_GLOBAL_VARIABLE:
                    parser_lex(parser);
                    variable = UP(pm_global_variable_read_node_create(parser, &parser->previous));
                    break;
                // In this case an instance variable is being interpolated.
                // We'll create an instance variable read node.
                case PM_TOKEN_INSTANCE_VARIABLE:
                    parser_lex(parser);
                    variable = UP(pm_instance_variable_read_node_create(parser, &parser->previous));
                    break;
                // In this case a class variable is being interpolated. We'll
                // create a class variable read node.
                case PM_TOKEN_CLASS_VARIABLE:
                    parser_lex(parser);
                    variable = UP(pm_class_variable_read_node_create(parser, &parser->previous));
                    break;
                // We can hit here if we got an invalid token. In that case
                // we'll not attempt to lex this token and instead just return a
                // missing node.
                default:
                    expect1(parser, PM_TOKEN_IDENTIFIER, PM_ERR_EMBVAR_INVALID);
                    variable = UP(pm_missing_node_create(parser, parser->current.start, parser->current.end));
                    break;
            }

            return UP(pm_embedded_variable_node_create(parser, &operator, variable));
        }
        default:
            parser_lex(parser);
            pm_parser_err_previous(parser, PM_ERR_CANNOT_PARSE_STRING_PART);
            return NULL;
    }
}

/**
 * When creating a symbol, unary operators that cannot be binary operators
 * automatically drop trailing `@` characters. This happens at the parser level,
 * such that `~@` is parsed as `~` and `!@` is parsed as `!`. We do that here.
 *
 * Returns the end of the operator name: one byte before name->end when the
 * token is a tilde or bang ending in `@`, and name->end otherwise.
 */
static const uint8_t *
parse_operator_symbol_name(const pm_token_t *name) {
    switch (name->type) {
        case PM_TOKEN_TILDE:
        case PM_TOKEN_BANG:
            if (name->end[-1] == '@') return name->end - 1;
            PRISM_FALLTHROUGH
        default:
            return name->end;
    }
}

/**
 * Create a symbol node whose value is the operator in the current token, then
 * consume that token. If next_state is not PM_LEX_STATE_NONE, it is installed
 * as the lex state before the token is consumed. The resulting symbol is
 * flagged as forced US-ASCII.
 */
static pm_node_t *
parse_operator_symbol(pm_parser_t *parser, const pm_token_t *opening, pm_lex_state_t next_state) {
    pm_token_t closing = not_provided(parser);
    pm_symbol_node_t *symbol = pm_symbol_node_create(parser, opening, &parser->current, &closing);

    // The end of the name must be computed before lexing, while the operator
    // is still the current token.
    const uint8_t *end = parse_operator_symbol_name(&parser->current);

    if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state);
    parser_lex(parser);

    pm_string_shared_init(&symbol->unescaped, parser->previous.start, end);
    pm_node_flag_set(UP(symbol), PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING);

    return UP(symbol);
}

/**
 * Parse a symbol node. This function will get called immediately after finding
 * a symbol opening token. This handles parsing bare symbols and interpolated
 * symbols.
 */
static pm_node_t *
parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_state, uint16_t depth) {
    // The symbol opening token has already been consumed by the caller.
    const pm_token_t opening = parser->previous;

    // If the lexer was not in string mode, this is a bare symbol such as
    // :foo, :+, or :@bar.
    if (lex_mode->mode != PM_LEX_STRING) {
        if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state);

        switch (parser->current.type) {
            case PM_CASE_OPERATOR:
                return parse_operator_symbol(parser, &opening, next_state == PM_LEX_STATE_NONE ? PM_LEX_STATE_ENDFN : next_state);
            case PM_TOKEN_IDENTIFIER:
            case PM_TOKEN_CONSTANT:
            case PM_TOKEN_INSTANCE_VARIABLE:
            case PM_TOKEN_METHOD_NAME:
            case PM_TOKEN_CLASS_VARIABLE:
            case PM_TOKEN_GLOBAL_VARIABLE:
            case PM_TOKEN_NUMBERED_REFERENCE:
            case PM_TOKEN_BACK_REFERENCE:
            case PM_CASE_KEYWORD:
                parser_lex(parser);
                break;
            default:
                expect2(parser, PM_TOKEN_IDENTIFIER, PM_TOKEN_METHOD_NAME, PM_ERR_SYMBOL_INVALID);
                break;
        }

        pm_token_t closing = not_provided(parser);
        pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);

        pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
        pm_node_flag_set(UP(symbol), parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false));

        return UP(symbol);
    }

    if (lex_mode->as.string.interpolation) {
        // If we have the end of the symbol, then we can return an empty symbol.
        if (match1(parser, PM_TOKEN_STRING_END)) {
            if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state);
            parser_lex(parser);

            pm_token_t content = not_provided(parser);
            pm_token_t closing = parser->previous;
            return UP(pm_symbol_node_create(parser, &opening, &content, &closing));
        }

        // Now we can parse the first part of the symbol.
        pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1));

        // If we got a string part, then it's possible that we could transform
        // what looks like an interpolated symbol into a regular symbol.
        if (part && PM_NODE_TYPE_P(part, PM_STRING_NODE) && match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
            if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state);
            expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_INTERPOLATED);

            return UP(pm_string_node_to_symbol_node(parser, (pm_string_node_t *) part, &opening, &parser->previous));
        }

        // The opening token is passed as a placeholder for the closing token
        // here; the real closing location is set once it has been parsed.
        pm_interpolated_symbol_node_t *symbol = pm_interpolated_symbol_node_create(parser, &opening, NULL, &opening);
        if (part) pm_interpolated_symbol_node_append(symbol, part);

        while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
            if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) != NULL) {
                pm_interpolated_symbol_node_append(symbol, part);
            }
        }

        if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state);
        if (match1(parser, PM_TOKEN_EOF)) {
            pm_parser_err_token(parser, &opening, PM_ERR_SYMBOL_TERM_INTERPOLATED);
        } else {
            expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_INTERPOLATED);
        }

        pm_interpolated_symbol_node_closing_loc_set(symbol, &parser->previous);
        return UP(symbol);
    }

    // From here on the symbol is a string-mode symbol without interpolation.
    pm_token_t content;
    pm_string_t unescaped;

    if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
        content = parser->current;
        unescaped = parser->current_string;
        parser_lex(parser);

        // If we have two string contents in a row, then the content of this
        // symbol is split because of heredoc contents. This looks like:
        //
        // <<A; :'a
        // A
        // b'
        //
        // In this case, the best way we have to represent this is as an
        // interpolated symbol node, so that's what we'll do here.
        if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
            pm_interpolated_symbol_node_t *symbol = pm_interpolated_symbol_node_create(parser, &opening, NULL, &opening);
            pm_token_t bounds = not_provided(parser);

            pm_node_t *part = UP(pm_string_node_create_unescaped(parser, &bounds, &content, &bounds, &unescaped));
            pm_interpolated_symbol_node_append(symbol, part);

            part = UP(pm_string_node_create_unescaped(parser, &bounds, &parser->current, &bounds, &parser->current_string));
            pm_interpolated_symbol_node_append(symbol, part);

            if (next_state != PM_LEX_STATE_NONE) {
                lex_state_set(parser, next_state);
            }

            parser_lex(parser);
            expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);

            pm_interpolated_symbol_node_closing_loc_set(symbol, &parser->previous);
            return UP(symbol);
        }
    } else {
        // There is no content token, so synthesize an empty one located at
        // the end of the previous token.
        content = (pm_token_t) { .type = PM_TOKEN_STRING_CONTENT, .start = parser->previous.end, .end = parser->previous.end };
        pm_string_shared_init(&unescaped, content.start, content.end);
    }

    if (next_state != PM_LEX_STATE_NONE) {
        lex_state_set(parser, next_state);
    }

    if (match1(parser, PM_TOKEN_EOF)) {
        pm_parser_err_token(parser, &opening, PM_ERR_SYMBOL_TERM_DYNAMIC);
    } else {
        expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);
    }

    return UP(pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, false)));
}

/**
 * Parse an argument to undef which can either be a bare word, a symbol, a
 * constant, or an interpolated symbol.
+ */ +static inline pm_node_t * +parse_undef_argument(pm_parser_t *parser, uint16_t depth) { + switch (parser->current.type) { + case PM_CASE_OPERATOR: { + const pm_token_t opening = not_provided(parser); + return parse_operator_symbol(parser, &opening, PM_LEX_STATE_NONE); + } + case PM_CASE_KEYWORD: + case PM_TOKEN_CONSTANT: + case PM_TOKEN_IDENTIFIER: + case PM_TOKEN_METHOD_NAME: { + parser_lex(parser); + + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing); + + pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end); + pm_node_flag_set(UP(symbol), parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false)); + + return UP(symbol); + } + case PM_TOKEN_SYMBOL_BEGIN: { + pm_lex_mode_t lex_mode = *parser->lex_modes.current; + parser_lex(parser); + + return parse_symbol(parser, &lex_mode, PM_LEX_STATE_NONE, (uint16_t) (depth + 1)); + } + default: + pm_parser_err_current(parser, PM_ERR_UNDEF_ARGUMENT); + return UP(pm_missing_node_create(parser, parser->current.start, parser->current.end)); + } +} + +/** + * Parse an argument to alias which can either be a bare word, a symbol, an + * interpolated symbol or a global variable. If this is the first argument, then + * we need to set the lex state to PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM + * between the first and second arguments. + */ +static inline pm_node_t * +parse_alias_argument(pm_parser_t *parser, bool first, uint16_t depth) { + switch (parser->current.type) { + case PM_CASE_OPERATOR: { + const pm_token_t opening = not_provided(parser); + return parse_operator_symbol(parser, &opening, first ? 
PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM : PM_LEX_STATE_NONE); + } + case PM_CASE_KEYWORD: + case PM_TOKEN_CONSTANT: + case PM_TOKEN_IDENTIFIER: + case PM_TOKEN_METHOD_NAME: { + if (first) lex_state_set(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM); + parser_lex(parser); + + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing); + + pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end); + pm_node_flag_set(UP(symbol), parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false)); + + return UP(symbol); + } + case PM_TOKEN_SYMBOL_BEGIN: { + pm_lex_mode_t lex_mode = *parser->lex_modes.current; + parser_lex(parser); + + return parse_symbol(parser, &lex_mode, first ? PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM : PM_LEX_STATE_NONE, (uint16_t) (depth + 1)); + } + case PM_TOKEN_BACK_REFERENCE: + parser_lex(parser); + return UP(pm_back_reference_read_node_create(parser, &parser->previous)); + case PM_TOKEN_NUMBERED_REFERENCE: + parser_lex(parser); + return UP(pm_numbered_reference_read_node_create(parser, &parser->previous)); + case PM_TOKEN_GLOBAL_VARIABLE: + parser_lex(parser); + return UP(pm_global_variable_read_node_create(parser, &parser->previous)); + default: + pm_parser_err_current(parser, PM_ERR_ALIAS_ARGUMENT); + return UP(pm_missing_node_create(parser, parser->current.start, parser->current.end)); + } +} + +/** + * Parse an identifier into either a local variable read. If the local variable + * is not found, it returns NULL instead. 
+ */ +static pm_node_t * +parse_variable(pm_parser_t *parser) { + pm_constant_id_t name_id = pm_parser_constant_id_token(parser, &parser->previous); + int depth; + bool is_numbered_param = pm_token_is_numbered_parameter(parser->previous.start, parser->previous.end); + + if (!is_numbered_param && ((depth = pm_parser_local_depth_constant_id(parser, name_id)) != -1)) { + return UP(pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, (uint32_t) depth, false)); + } + + pm_scope_t *current_scope = parser->current_scope; + if (!current_scope->closed && !(current_scope->parameters & PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED)) { + if (is_numbered_param) { + // When you use a numbered parameter, it implies the existence of + // all of the locals that exist before it. For example, referencing + // _2 means that _1 must exist. Therefore here we loop through all + // of the possibilities and add them into the constant pool. + uint8_t maximum = (uint8_t) (parser->previous.start[1] - '0'); + for (uint8_t number = 1; number <= maximum; number++) { + pm_parser_local_add_constant(parser, pm_numbered_parameter_names[number - 1], 2); + } + + if (!match1(parser, PM_TOKEN_EQUAL)) { + parser->current_scope->parameters |= PM_SCOPE_PARAMETERS_NUMBERED_FOUND; + } + + pm_node_t *node = UP(pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, 0, false)); + pm_node_list_append(¤t_scope->implicit_parameters, node); + + return node; + } else if ((parser->version >= PM_OPTIONS_VERSION_CRUBY_3_4) && pm_token_is_it(parser->previous.start, parser->previous.end)) { + pm_node_t *node = UP(pm_it_local_variable_read_node_create(parser, &parser->previous)); + pm_node_list_append(¤t_scope->implicit_parameters, node); + + return node; + } + } + + return NULL; +} + +/** + * Parse an identifier into either a local variable read or a call. 
 */
static pm_node_t *
parse_variable_call(pm_parser_t *parser) {
    pm_node_flags_t flags = 0;

    // An identifier that is followed by an opening parenthesis, or that ends
    // in `!` or `?`, is never treated as a variable.
    if (!match1(parser, PM_TOKEN_PARENTHESIS_LEFT) && (parser->previous.end[-1] != '!') && (parser->previous.end[-1] != '?')) {
        pm_node_t *node = parse_variable(parser);
        if (node != NULL) return node;
        flags |= PM_CALL_NODE_FLAGS_VARIABLE_CALL;
    }

    pm_call_node_t *node = pm_call_node_variable_call_create(parser, &parser->previous);
    pm_node_flag_set(UP(node), flags);

    return UP(node);
}

/**
 * Parse the method definition name based on the current token available on the
 * parser. If it does not match a valid method definition name, then a missing
 * token is returned.
 */
static inline pm_token_t
parse_method_definition_name(pm_parser_t *parser) {
    switch (parser->current.type) {
        case PM_CASE_KEYWORD:
        case PM_TOKEN_CONSTANT:
        case PM_TOKEN_METHOD_NAME:
            parser_lex(parser);
            return parser->previous;
        case PM_TOKEN_IDENTIFIER:
            pm_refute_numbered_parameter(parser, parser->current.start, parser->current.end);
            parser_lex(parser);
            return parser->previous;
        case PM_CASE_OPERATOR:
            lex_state_set(parser, PM_LEX_STATE_ENDFN);
            parser_lex(parser);
            return parser->previous;
        default:
            // The invalid token is reported but not consumed; the returned
            // missing token spans the same bytes.
            PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_DEF_NAME, pm_token_type_human(parser->current.type));
            return (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->current.start, .end = parser->current.end };
    }
}

/**
 * Remove up to common_whitespace columns of leading inline whitespace from
 * the given string, in place.
 */
static void
parse_heredoc_dedent_string(pm_string_t *string, size_t common_whitespace) {
    // Get a reference to the string struct that is being held by the string
    // node. This is the value we're going to actually manipulate.
    pm_string_ensure_owned(string);

    // Now get the bounds of the existing string. We'll use this as a
    // destination to move bytes into. We'll also use it for bounds checking
    // since we don't require that these strings be null terminated.
    size_t dest_length = pm_string_length(string);
    const uint8_t *source_cursor = (uint8_t *) string->source;
    const uint8_t *source_end = source_cursor + dest_length;

    // We keep track of how many columns of whitespace have been trimmed so
    // far. A tab advances this count to the next multiple of
    // PM_TAB_WHITESPACE_SIZE; any other whitespace byte counts as one column.
    size_t trimmed_whitespace = 0;

    // While we haven't reached the amount of common whitespace that we need to
    // trim and we haven't reached the end of the string, we'll keep trimming
    // whitespace. Trimming in this context means skipping over these bytes such
    // that they aren't copied into the new string.
    while ((source_cursor < source_end) && pm_char_is_inline_whitespace(*source_cursor) && trimmed_whitespace < common_whitespace) {
        if (*source_cursor == '\t') {
            trimmed_whitespace = (trimmed_whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
            // A tab that would take us past the common whitespace is kept.
            if (trimmed_whitespace > common_whitespace) break;
        } else {
            trimmed_whitespace++;
        }

        source_cursor++;
        dest_length--;
    }

    // Shift the remaining bytes down to the start of the buffer. The regions
    // overlap, hence memmove.
    memmove((uint8_t *) string->source, source_cursor, (size_t) (source_end - source_cursor));
    string->length = dest_length;
}

/**
 * Take a heredoc node that is indented by a ~ and trim the leading whitespace.
 */
static void
parse_heredoc_dedent(pm_parser_t *parser, pm_node_list_t *nodes, size_t common_whitespace) {
    // The next node should be dedented if it's the first node in the list or if
    // it follows a string node.
    bool dedent_next = true;

    // Iterate over all nodes, and trim whitespace accordingly. We're going to
    // keep around two indices: a read and a write. If we end up trimming all of
    // the whitespace from a node, then we'll drop it from the list entirely.
    size_t write_index = 0;

    pm_node_t *node;
    PM_NODE_LIST_FOREACH(nodes, read_index, node) {
        // We're not manipulating child nodes that aren't strings. In this case
        // we'll skip past it and indicate that the subsequent node should not
        // be dedented.
        if (!PM_NODE_TYPE_P(node, PM_STRING_NODE)) {
            nodes->nodes[write_index++] = node;
            dedent_next = false;
            continue;
        }

        pm_string_node_t *string_node = ((pm_string_node_t *) node);
        if (dedent_next) {
            parse_heredoc_dedent_string(&string_node->unescaped, common_whitespace);
        }

        // String nodes that are empty (after any dedenting) are destroyed
        // rather than kept in the list.
        if (string_node->unescaped.length == 0) {
            pm_node_destroy(parser, node);
        } else {
            nodes->nodes[write_index++] = node;
        }

        // We always dedent the next node if it follows a string node.
        dedent_next = true;
    }

    nodes->size = write_index;
}

/**
 * Return a string content token at a particular location that is empty.
 */
static pm_token_t
parse_strings_empty_content(const uint8_t *location) {
    return (pm_token_t) { .type = PM_TOKEN_STRING_CONTENT, .start = location, .end = location };
}

/**
 * Parse a set of strings that could be concatenated together.
 */
static inline pm_node_t *
parse_strings(pm_parser_t *parser, pm_node_t *current, bool accepts_label, uint16_t depth) {
    assert(parser->current.type == PM_TOKEN_STRING_BEGIN);
    bool concating = false;

    while (match1(parser, PM_TOKEN_STRING_BEGIN)) {
        pm_node_t *node = NULL;

        // Here we have found a string literal. We'll parse it and add it to
        // the list of strings.
        const pm_lex_mode_t *lex_mode = parser->lex_modes.current;
        assert(lex_mode->mode == PM_LEX_STRING);
        bool lex_interpolation = lex_mode->as.string.interpolation;
        bool label_allowed = lex_mode->as.string.label_allowed && accepts_label;

        pm_token_t opening = parser->current;
        parser_lex(parser);

        if (match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
            expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF);
            // If we get here, then we have an end immediately after a
            // start. In that case we'll create an empty content token and
            // return an uninterpolated string.
+ pm_token_t content = parse_strings_empty_content(parser->previous.start); + pm_string_node_t *string = pm_string_node_create(parser, &opening, &content, &parser->previous); + + pm_string_shared_init(&string->unescaped, content.start, content.end); + node = UP(string); + } else if (accept1(parser, PM_TOKEN_LABEL_END)) { + // If we get here, then we have an end of a label immediately + // after a start. In that case we'll create an empty symbol + // node. + pm_token_t content = parse_strings_empty_content(parser->previous.start); + pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &content, &parser->previous); + + pm_string_shared_init(&symbol->unescaped, content.start, content.end); + node = UP(symbol); + + if (!label_allowed) pm_parser_err_node(parser, node, PM_ERR_UNEXPECTED_LABEL); + } else if (!lex_interpolation) { + // If we don't accept interpolation then we expect the string to + // start with a single string content node. + pm_string_t unescaped; + pm_token_t content; + + if (match1(parser, PM_TOKEN_EOF)) { + unescaped = PM_STRING_EMPTY; + content = not_provided(parser); + } else { + unescaped = parser->current_string; + expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_EXPECT_STRING_CONTENT); + content = parser->previous; + } + + // It is unfortunately possible to have multiple string content + // nodes in a row in the case that there's heredoc content in + // the middle of the string, like this cursed example: + // + // <<-END+'b + // a + // END + // c'+'d' + // + // In that case we need to switch to an interpolated string to + // be able to contain all of the parts. 
+ if (match1(parser, PM_TOKEN_STRING_CONTENT)) { + pm_node_list_t parts = { 0 }; + + pm_token_t delimiters = not_provided(parser); + pm_node_t *part = UP(pm_string_node_create_unescaped(parser, &delimiters, &content, &delimiters, &unescaped)); + pm_node_list_append(&parts, part); + + do { + part = UP(pm_string_node_create_current_string(parser, &delimiters, &parser->current, &delimiters)); + pm_node_list_append(&parts, part); + parser_lex(parser); + } while (match1(parser, PM_TOKEN_STRING_CONTENT)); + + expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF); + node = UP(pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous)); + + pm_node_list_free(&parts); + } else if (accept1(parser, PM_TOKEN_LABEL_END)) { + node = UP(pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, true))); + if (!label_allowed) pm_parser_err_node(parser, node, PM_ERR_UNEXPECTED_LABEL); + } else if (match1(parser, PM_TOKEN_EOF)) { + pm_parser_err_token(parser, &opening, PM_ERR_STRING_LITERAL_EOF); + node = UP(pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped)); + } else if (accept1(parser, PM_TOKEN_STRING_END)) { + node = UP(pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped)); + } else { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_STRING_LITERAL_TERM, pm_token_type_human(parser->previous.type)); + parser->previous.start = parser->previous.end; + parser->previous.type = PM_TOKEN_MISSING; + node = UP(pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped)); + } + } else if (match1(parser, PM_TOKEN_STRING_CONTENT)) { + // In this case we've hit string content so we know the string + // at least has something in it. 
We'll need to check if the + // following token is the end (in which case we can return a + // plain string) or if it's not then it has interpolation. + pm_token_t content = parser->current; + pm_string_t unescaped = parser->current_string; + parser_lex(parser); + + if (match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { + node = UP(pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped)); + pm_node_flag_set(node, parse_unescaped_encoding(parser)); + + // Kind of odd behavior, but basically if we have an + // unterminated string and it ends in a newline, we back up one + // character so that the error message is on the last line of + // content in the string. + if (!accept1(parser, PM_TOKEN_STRING_END)) { + const uint8_t *location = parser->previous.end; + if (location > parser->start && location[-1] == '\n') location--; + pm_parser_err(parser, location, location, PM_ERR_STRING_LITERAL_EOF); + + parser->previous.start = parser->previous.end; + parser->previous.type = PM_TOKEN_MISSING; + } + } else if (accept1(parser, PM_TOKEN_LABEL_END)) { + node = UP(pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, true))); + if (!label_allowed) pm_parser_err_node(parser, node, PM_ERR_UNEXPECTED_LABEL); + } else { + // If we get here, then we have interpolation so we'll need + // to create a string or symbol node with interpolation. 
+ pm_node_list_t parts = { 0 }; + pm_token_t string_opening = not_provided(parser); + pm_token_t string_closing = not_provided(parser); + + pm_node_t *part = UP(pm_string_node_create_unescaped(parser, &string_opening, &parser->previous, &string_closing, &unescaped)); + pm_node_flag_set(part, parse_unescaped_encoding(parser)); + pm_node_list_append(&parts, part); + + while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) { + if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) != NULL) { + pm_node_list_append(&parts, part); + } + } + + if (accept1(parser, PM_TOKEN_LABEL_END)) { + node = UP(pm_interpolated_symbol_node_create(parser, &opening, &parts, &parser->previous)); + if (!label_allowed) pm_parser_err_node(parser, node, PM_ERR_UNEXPECTED_LABEL); + } else if (match1(parser, PM_TOKEN_EOF)) { + pm_parser_err_token(parser, &opening, PM_ERR_STRING_INTERPOLATED_TERM); + node = UP(pm_interpolated_string_node_create(parser, &opening, &parts, &parser->current)); + } else { + expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_INTERPOLATED_TERM); + node = UP(pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous)); + } + + pm_node_list_free(&parts); + } + } else { + // If we get here, then the first part of the string is not plain + // string content, in which case we need to parse the string as an + // interpolated string. 
+ pm_node_list_t parts = { 0 }; + pm_node_t *part; + + while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) { + if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) != NULL) { + pm_node_list_append(&parts, part); + } + } + + if (accept1(parser, PM_TOKEN_LABEL_END)) { + node = UP(pm_interpolated_symbol_node_create(parser, &opening, &parts, &parser->previous)); + if (!label_allowed) pm_parser_err_node(parser, node, PM_ERR_UNEXPECTED_LABEL); + } else if (match1(parser, PM_TOKEN_EOF)) { + pm_parser_err_token(parser, &opening, PM_ERR_STRING_INTERPOLATED_TERM); + node = UP(pm_interpolated_string_node_create(parser, &opening, &parts, &parser->current)); + } else { + expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_INTERPOLATED_TERM); + node = UP(pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous)); + } + + pm_node_list_free(&parts); + } + + if (current == NULL) { + // If the node we just parsed is a symbol node, then we can't + // concatenate it with anything else, so we can now return that + // node. + if (PM_NODE_TYPE_P(node, PM_SYMBOL_NODE) || PM_NODE_TYPE_P(node, PM_INTERPOLATED_SYMBOL_NODE)) { + return node; + } + + // If we don't already have a node, then it's fine and we can just + // set the result to be the node we just parsed. + current = node; + } else { + // Otherwise we need to check the type of the node we just parsed. + // If it cannot be concatenated with the previous node, then we'll + // need to add a syntax error. + if (!PM_NODE_TYPE_P(node, PM_STRING_NODE) && !PM_NODE_TYPE_P(node, PM_INTERPOLATED_STRING_NODE)) { + pm_parser_err_node(parser, node, PM_ERR_STRING_CONCATENATION); + } + + // If we haven't already created our container for concatenation, + // we'll do that now. 
+ if (!concating) { + if (!PM_NODE_TYPE_P(current, PM_STRING_NODE) && !PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { + pm_parser_err_node(parser, current, PM_ERR_STRING_CONCATENATION); + } + + concating = true; + pm_token_t bounds = not_provided(parser); + + pm_interpolated_string_node_t *container = pm_interpolated_string_node_create(parser, &bounds, NULL, &bounds); + pm_interpolated_string_node_append(container, current); + current = UP(container); + } + + pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, node); + } + } + + return current; +} + +#define PM_PARSE_PATTERN_SINGLE 0 +#define PM_PARSE_PATTERN_TOP 1 +#define PM_PARSE_PATTERN_MULTI 2 + +static pm_node_t * +parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flags, pm_diagnostic_id_t diag_id, uint16_t depth); + +/** + * Add the newly created local to the list of captures for this pattern matching + * expression. If it is duplicated from a previous local, then we'll need to add + * an error to the parser. + */ +static void +parse_pattern_capture(pm_parser_t *parser, pm_constant_id_list_t *captures, pm_constant_id_t capture, const pm_location_t *location) { + // Skip this capture if it starts with an underscore. + if (peek_at(parser, location->start) == '_') return; + + if (pm_constant_id_list_includes(captures, capture)) { + pm_parser_err(parser, location->start, location->end, PM_ERR_PATTERN_CAPTURE_DUPLICATE); + } else { + pm_constant_id_list_append(captures, capture); + } +} + +/** + * Accept any number of constants joined by :: delimiters. + */ +static pm_node_t * +parse_pattern_constant_path(pm_parser_t *parser, pm_constant_id_list_t *captures, pm_node_t *node, uint16_t depth) { + // Now, if there are any :: operators that follow, parse them as constant + // path nodes. 
+ while (accept1(parser, PM_TOKEN_COLON_COLON)) { + pm_token_t delimiter = parser->previous; + expect1(parser, PM_TOKEN_CONSTANT, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT); + node = UP(pm_constant_path_node_create(parser, node, &delimiter, &parser->previous)); + } + + // If there is a [ or ( that follows, then this is part of a larger pattern + // expression. We'll parse the inner pattern here, then modify the returned + // inner pattern with our constant path attached. + if (!match2(parser, PM_TOKEN_BRACKET_LEFT, PM_TOKEN_PARENTHESIS_LEFT)) { + return node; + } + + pm_token_t opening; + pm_token_t closing; + pm_node_t *inner = NULL; + + if (accept1(parser, PM_TOKEN_BRACKET_LEFT)) { + opening = parser->previous; + accept1(parser, PM_TOKEN_NEWLINE); + + if (!accept1(parser, PM_TOKEN_BRACKET_RIGHT)) { + inner = parse_pattern(parser, captures, PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_BRACKET, (uint16_t) (depth + 1)); + accept1(parser, PM_TOKEN_NEWLINE); + expect1_opening(parser, PM_TOKEN_BRACKET_RIGHT, PM_ERR_PATTERN_TERM_BRACKET, &opening); + } + + closing = parser->previous; + } else { + parser_lex(parser); + opening = parser->previous; + accept1(parser, PM_TOKEN_NEWLINE); + + if (!accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + inner = parse_pattern(parser, captures, PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_PAREN, (uint16_t) (depth + 1)); + accept1(parser, PM_TOKEN_NEWLINE); + expect1_opening(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_PATTERN_TERM_PAREN, &opening); + } + + closing = parser->previous; + } + + if (!inner) { + // If there was no inner pattern, then we have something like Foo() or + // Foo[]. In that case we'll create an array pattern with no requireds. + return UP(pm_array_pattern_node_constant_create(parser, node, &opening, &closing)); + } + + // Now that we have the inner pattern, check to see if it's an array, find, + // or hash pattern. 
If it is, then we'll attach our constant path to it if + // it doesn't already have a constant. If it's not one of those node types + // or it does have a constant, then we'll create an array pattern. + switch (PM_NODE_TYPE(inner)) { + case PM_ARRAY_PATTERN_NODE: { + pm_array_pattern_node_t *pattern_node = (pm_array_pattern_node_t *) inner; + + if (pattern_node->constant == NULL && pattern_node->opening_loc.start == NULL) { + pattern_node->base.location.start = node->location.start; + pattern_node->base.location.end = closing.end; + + pattern_node->constant = node; + pattern_node->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); + pattern_node->closing_loc = PM_LOCATION_TOKEN_VALUE(&closing); + + return UP(pattern_node); + } + + break; + } + case PM_FIND_PATTERN_NODE: { + pm_find_pattern_node_t *pattern_node = (pm_find_pattern_node_t *) inner; + + if (pattern_node->constant == NULL && pattern_node->opening_loc.start == NULL) { + pattern_node->base.location.start = node->location.start; + pattern_node->base.location.end = closing.end; + + pattern_node->constant = node; + pattern_node->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); + pattern_node->closing_loc = PM_LOCATION_TOKEN_VALUE(&closing); + + return UP(pattern_node); + } + + break; + } + case PM_HASH_PATTERN_NODE: { + pm_hash_pattern_node_t *pattern_node = (pm_hash_pattern_node_t *) inner; + + if (pattern_node->constant == NULL && pattern_node->opening_loc.start == NULL) { + pattern_node->base.location.start = node->location.start; + pattern_node->base.location.end = closing.end; + + pattern_node->constant = node; + pattern_node->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); + pattern_node->closing_loc = PM_LOCATION_TOKEN_VALUE(&closing); + + return UP(pattern_node); + } + + break; + } + default: + break; + } + + // If we got here, then we didn't return one of the inner patterns by + // attaching its constant. In this case we'll create an array pattern and + // attach our constant to it. 
+ pm_array_pattern_node_t *pattern_node = pm_array_pattern_node_constant_create(parser, node, &opening, &closing); + pm_array_pattern_node_requireds_append(pattern_node, inner); + return UP(pattern_node); +} + +/** + * Parse a rest pattern. + */ +static pm_splat_node_t * +parse_pattern_rest(pm_parser_t *parser, pm_constant_id_list_t *captures) { + assert(parser->previous.type == PM_TOKEN_USTAR); + pm_token_t operator = parser->previous; + pm_node_t *name = NULL; + + // Rest patterns don't necessarily have a name associated with them. So we + // will check for that here. If they do, then we'll add it to the local + // table since this pattern will cause it to become a local variable. + if (accept1(parser, PM_TOKEN_IDENTIFIER)) { + pm_token_t identifier = parser->previous; + pm_constant_id_t constant_id = pm_parser_constant_id_token(parser, &identifier); + + int depth; + if ((depth = pm_parser_local_depth_constant_id(parser, constant_id)) == -1) { + pm_parser_local_add(parser, constant_id, identifier.start, identifier.end, 0); + } + + parse_pattern_capture(parser, captures, constant_id, &PM_LOCATION_TOKEN_VALUE(&identifier)); + name = UP(pm_local_variable_target_node_create( + parser, + &PM_LOCATION_TOKEN_VALUE(&identifier), + constant_id, + (uint32_t) (depth == -1 ? 0 : depth) + )); + } + + // Finally we can return the created node. + return pm_splat_node_create(parser, &operator, name); +} + +/** + * Parse a keyword rest node. 
+ */ +static pm_node_t * +parse_pattern_keyword_rest(pm_parser_t *parser, pm_constant_id_list_t *captures) { + assert(parser->current.type == PM_TOKEN_USTAR_STAR); + parser_lex(parser); + + pm_token_t operator = parser->previous; + pm_node_t *value = NULL; + + if (accept1(parser, PM_TOKEN_KEYWORD_NIL)) { + return UP(pm_no_keywords_parameter_node_create(parser, &operator, &parser->previous)); + } + + if (accept1(parser, PM_TOKEN_IDENTIFIER)) { + pm_constant_id_t constant_id = pm_parser_constant_id_token(parser, &parser->previous); + + int depth; + if ((depth = pm_parser_local_depth_constant_id(parser, constant_id)) == -1) { + pm_parser_local_add(parser, constant_id, parser->previous.start, parser->previous.end, 0); + } + + parse_pattern_capture(parser, captures, constant_id, &PM_LOCATION_TOKEN_VALUE(&parser->previous)); + value = UP(pm_local_variable_target_node_create( + parser, + &PM_LOCATION_TOKEN_VALUE(&parser->previous), + constant_id, + (uint32_t) (depth == -1 ? 0 : depth) + )); + } + + return UP(pm_assoc_splat_node_create(parser, value, &operator)); +} + +/** + * Check that the slice of the source given by the bounds parameters constitutes + * a valid local variable name. + */ +static bool +pm_slice_is_valid_local(const pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { + ptrdiff_t length = end - start; + if (length == 0) return false; + + // First ensure that it starts with a valid identifier starting character. + size_t width = char_is_identifier_start(parser, start, end - start); + if (width == 0) return false; + + // Next, ensure that it's not an uppercase character. + if (parser->encoding_changed) { + if (parser->encoding->isupper_char(start, length)) return false; + } else { + if (pm_encoding_utf_8_isupper_char(start, length)) return false; + } + + // Next, iterate through all of the bytes of the string to ensure that they + // are all valid identifier characters. 
+ const uint8_t *cursor = start + width; + while ((width = char_is_identifier(parser, cursor, end - cursor))) cursor += width; + return cursor == end; +} + +/** + * Create an implicit node for the value of a hash pattern that has omitted the + * value. This will use an implicit local variable target. + */ +static pm_node_t * +parse_pattern_hash_implicit_value(pm_parser_t *parser, pm_constant_id_list_t *captures, pm_symbol_node_t *key) { + const pm_location_t *value_loc = &((pm_symbol_node_t *) key)->value_loc; + + pm_constant_id_t constant_id = pm_parser_constant_id_location(parser, value_loc->start, value_loc->end); + int depth = -1; + + if (pm_slice_is_valid_local(parser, value_loc->start, value_loc->end)) { + depth = pm_parser_local_depth_constant_id(parser, constant_id); + } else { + pm_parser_err(parser, key->base.location.start, key->base.location.end, PM_ERR_PATTERN_HASH_KEY_LOCALS); + + if ((value_loc->end > value_loc->start) && ((value_loc->end[-1] == '!') || (value_loc->end[-1] == '?'))) { + PM_PARSER_ERR_LOCATION_FORMAT(parser, value_loc, PM_ERR_INVALID_LOCAL_VARIABLE_WRITE, (int) (value_loc->end - value_loc->start), (const char *) value_loc->start); + } + } + + if (depth == -1) { + pm_parser_local_add(parser, constant_id, value_loc->start, value_loc->end, 0); + } + + parse_pattern_capture(parser, captures, constant_id, value_loc); + pm_local_variable_target_node_t *target = pm_local_variable_target_node_create( + parser, + value_loc, + constant_id, + (uint32_t) (depth == -1 ? 0 : depth) + ); + + return UP(pm_implicit_node_create(parser, UP(target))); +} + +/** + * Add a node to the list of keys for a hash pattern, and if it is a duplicate + * then add an error to the parser. 
+ */ +static void +parse_pattern_hash_key(pm_parser_t *parser, pm_static_literals_t *keys, pm_node_t *node) { + if (pm_static_literals_add(&parser->newline_list, parser->start_line, keys, node, true) != NULL) { + pm_parser_err_node(parser, node, PM_ERR_PATTERN_HASH_KEY_DUPLICATE); + } +} + +/** + * Parse a hash pattern. + */ +static pm_hash_pattern_node_t * +parse_pattern_hash(pm_parser_t *parser, pm_constant_id_list_t *captures, pm_node_t *first_node, uint16_t depth) { + pm_node_list_t assocs = { 0 }; + pm_static_literals_t keys = { 0 }; + pm_node_t *rest = NULL; + + switch (PM_NODE_TYPE(first_node)) { + case PM_ASSOC_SPLAT_NODE: + case PM_NO_KEYWORDS_PARAMETER_NODE: + rest = first_node; + break; + case PM_SYMBOL_NODE: { + if (pm_symbol_node_label_p(first_node)) { + parse_pattern_hash_key(parser, &keys, first_node); + pm_node_t *value; + + if (match8(parser, PM_TOKEN_COMMA, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_TOKEN_EOF)) { + // Otherwise, we will create an implicit local variable + // target for the value. + value = parse_pattern_hash_implicit_value(parser, captures, (pm_symbol_node_t *) first_node); + } else { + // Here we have a value for the first assoc in the list, so + // we will parse it now. + value = parse_pattern(parser, captures, PM_PARSE_PATTERN_SINGLE, PM_ERR_PATTERN_EXPRESSION_AFTER_KEY, (uint16_t) (depth + 1)); + } + + pm_token_t operator = not_provided(parser); + pm_node_t *assoc = UP(pm_assoc_node_create(parser, first_node, &operator, value)); + + pm_node_list_append(&assocs, assoc); + break; + } + } + PRISM_FALLTHROUGH + default: { + // If we get anything else, then this is an error. For this we'll + // create a missing node for the value and create an assoc node for + // the first node in the list. + pm_diagnostic_id_t diag_id = PM_NODE_TYPE_P(first_node, PM_INTERPOLATED_SYMBOL_NODE) ? 
PM_ERR_PATTERN_HASH_KEY_INTERPOLATED : PM_ERR_PATTERN_HASH_KEY_LABEL; + pm_parser_err_node(parser, first_node, diag_id); + + pm_token_t operator = not_provided(parser); + pm_node_t *value = UP(pm_missing_node_create(parser, first_node->location.start, first_node->location.end)); + pm_node_t *assoc = UP(pm_assoc_node_create(parser, first_node, &operator, value)); + + pm_node_list_append(&assocs, assoc); + break; + } + } + + // If there are any other assocs, then we'll parse them now. + while (accept1(parser, PM_TOKEN_COMMA)) { + // Here we need to break to support trailing commas. + if (match7(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_TOKEN_EOF)) { + // Trailing commas are not allowed to follow a rest pattern. + if (rest != NULL) { + pm_parser_err_token(parser, &parser->current, PM_ERR_PATTERN_EXPRESSION_AFTER_REST); + } + + break; + } + + if (match1(parser, PM_TOKEN_USTAR_STAR)) { + pm_node_t *assoc = parse_pattern_keyword_rest(parser, captures); + + if (rest == NULL) { + rest = assoc; + } else { + pm_parser_err_node(parser, assoc, PM_ERR_PATTERN_EXPRESSION_AFTER_REST); + pm_node_list_append(&assocs, assoc); + } + } else { + pm_node_t *key; + + if (match1(parser, PM_TOKEN_STRING_BEGIN)) { + key = parse_strings(parser, NULL, true, (uint16_t) (depth + 1)); + + if (PM_NODE_TYPE_P(key, PM_INTERPOLATED_SYMBOL_NODE)) { + pm_parser_err_node(parser, key, PM_ERR_PATTERN_HASH_KEY_INTERPOLATED); + } else if (!pm_symbol_node_label_p(key)) { + pm_parser_err_node(parser, key, PM_ERR_PATTERN_LABEL_AFTER_COMMA); + } + } else { + expect1(parser, PM_TOKEN_LABEL, PM_ERR_PATTERN_LABEL_AFTER_COMMA); + key = UP(pm_symbol_node_label_create(parser, &parser->previous)); + } + + parse_pattern_hash_key(parser, &keys, key); + pm_node_t *value = NULL; + + if (match7(parser, PM_TOKEN_COMMA, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, 
PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + if (PM_NODE_TYPE_P(key, PM_SYMBOL_NODE)) { + value = parse_pattern_hash_implicit_value(parser, captures, (pm_symbol_node_t *) key); + } else { + value = UP(pm_missing_node_create(parser, key->location.end, key->location.end)); + } + } else { + value = parse_pattern(parser, captures, PM_PARSE_PATTERN_SINGLE, PM_ERR_PATTERN_EXPRESSION_AFTER_KEY, (uint16_t) (depth + 1)); + } + + pm_token_t operator = not_provided(parser); + pm_node_t *assoc = UP(pm_assoc_node_create(parser, key, &operator, value)); + + if (rest != NULL) { + pm_parser_err_node(parser, assoc, PM_ERR_PATTERN_EXPRESSION_AFTER_REST); + } + + pm_node_list_append(&assocs, assoc); + } + } + + pm_hash_pattern_node_t *node = pm_hash_pattern_node_node_list_create(parser, &assocs, rest); + xfree(assocs.nodes); + + pm_static_literals_free(&keys); + return node; +} + +/** + * Parse a pattern expression primitive. + */ +static pm_node_t * +parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm_diagnostic_id_t diag_id, uint16_t depth) { + switch (parser->current.type) { + case PM_TOKEN_IDENTIFIER: + case PM_TOKEN_METHOD_NAME: { + parser_lex(parser); + pm_constant_id_t constant_id = pm_parser_constant_id_token(parser, &parser->previous); + + int depth; + if ((depth = pm_parser_local_depth_constant_id(parser, constant_id)) == -1) { + pm_parser_local_add(parser, constant_id, parser->previous.start, parser->previous.end, 0); + } + + parse_pattern_capture(parser, captures, constant_id, &PM_LOCATION_TOKEN_VALUE(&parser->previous)); + return UP(pm_local_variable_target_node_create( + parser, + &PM_LOCATION_TOKEN_VALUE(&parser->previous), + constant_id, + (uint32_t) (depth == -1 ? 0 : depth) + )); + } + case PM_TOKEN_BRACKET_LEFT_ARRAY: { + pm_token_t opening = parser->current; + parser_lex(parser); + + if (accept1(parser, PM_TOKEN_BRACKET_RIGHT)) { + // If we have an empty array pattern, then we'll just return a new + // array pattern node. 
+ return UP(pm_array_pattern_node_empty_create(parser, &opening, &parser->previous)); + } + + // Otherwise, we'll parse the inner pattern, then deal with it depending + // on the type it returns. + pm_node_t *inner = parse_pattern(parser, captures, PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_BRACKET, (uint16_t) (depth + 1)); + + accept1(parser, PM_TOKEN_NEWLINE); + expect1_opening(parser, PM_TOKEN_BRACKET_RIGHT, PM_ERR_PATTERN_TERM_BRACKET, &opening); + pm_token_t closing = parser->previous; + + switch (PM_NODE_TYPE(inner)) { + case PM_ARRAY_PATTERN_NODE: { + pm_array_pattern_node_t *pattern_node = (pm_array_pattern_node_t *) inner; + if (pattern_node->opening_loc.start == NULL) { + pattern_node->base.location.start = opening.start; + pattern_node->base.location.end = closing.end; + + pattern_node->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); + pattern_node->closing_loc = PM_LOCATION_TOKEN_VALUE(&closing); + + return UP(pattern_node); + } + + break; + } + case PM_FIND_PATTERN_NODE: { + pm_find_pattern_node_t *pattern_node = (pm_find_pattern_node_t *) inner; + if (pattern_node->opening_loc.start == NULL) { + pattern_node->base.location.start = opening.start; + pattern_node->base.location.end = closing.end; + + pattern_node->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); + pattern_node->closing_loc = PM_LOCATION_TOKEN_VALUE(&closing); + + return UP(pattern_node); + } + + break; + } + default: + break; + } + + pm_array_pattern_node_t *node = pm_array_pattern_node_empty_create(parser, &opening, &closing); + pm_array_pattern_node_requireds_append(node, inner); + return UP(node); + } + case PM_TOKEN_BRACE_LEFT: { + bool previous_pattern_matching_newlines = parser->pattern_matching_newlines; + parser->pattern_matching_newlines = false; + + pm_hash_pattern_node_t *node; + pm_token_t opening = parser->current; + parser_lex(parser); + + if (accept1(parser, PM_TOKEN_BRACE_RIGHT)) { + // If we have an empty hash pattern, then we'll just return a new hash + // 
pattern node. + node = pm_hash_pattern_node_empty_create(parser, &opening, &parser->previous); + } else { + pm_node_t *first_node; + + switch (parser->current.type) { + case PM_TOKEN_LABEL: + parser_lex(parser); + first_node = UP(pm_symbol_node_label_create(parser, &parser->previous)); + break; + case PM_TOKEN_USTAR_STAR: + first_node = parse_pattern_keyword_rest(parser, captures); + break; + case PM_TOKEN_STRING_BEGIN: + first_node = parse_expression(parser, PM_BINDING_POWER_MAX, false, true, PM_ERR_PATTERN_HASH_KEY_LABEL, (uint16_t) (depth + 1)); + break; + default: { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_PATTERN_HASH_KEY, pm_token_type_human(parser->current.type)); + parser_lex(parser); + + first_node = UP(pm_missing_node_create(parser, parser->previous.start, parser->previous.end)); + break; + } + } + + node = parse_pattern_hash(parser, captures, first_node, (uint16_t) (depth + 1)); + + accept1(parser, PM_TOKEN_NEWLINE); + expect1_opening(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_PATTERN_TERM_BRACE, &opening); + pm_token_t closing = parser->previous; + + node->base.location.start = opening.start; + node->base.location.end = closing.end; + + node->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); + node->closing_loc = PM_LOCATION_TOKEN_VALUE(&closing); + } + + parser->pattern_matching_newlines = previous_pattern_matching_newlines; + return UP(node); + } + case PM_TOKEN_UDOT_DOT: + case PM_TOKEN_UDOT_DOT_DOT: { + pm_token_t operator = parser->current; + parser_lex(parser); + + // Since we have a unary range operator, we need to parse the subsequent + // expression as the right side of the range. 
+ switch (parser->current.type) { + case PM_CASE_PRIMITIVE: { + pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_MAX, false, false, PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE, (uint16_t) (depth + 1)); + return UP(pm_range_node_create(parser, NULL, &operator, right)); + } + default: { + pm_parser_err_token(parser, &operator, PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE); + pm_node_t *right = UP(pm_missing_node_create(parser, operator.start, operator.end)); + return UP(pm_range_node_create(parser, NULL, &operator, right)); + } + } + } + case PM_CASE_PRIMITIVE: { + pm_node_t *node = parse_expression(parser, PM_BINDING_POWER_MAX, false, true, diag_id, (uint16_t) (depth + 1)); + + // If we found a label, we need to immediately return to the caller. + if (pm_symbol_node_label_p(node)) return node; + + // Call nodes (arithmetic operations) are not allowed in patterns + if (PM_NODE_TYPE(node) == PM_CALL_NODE) { + pm_parser_err_node(parser, node, diag_id); + pm_missing_node_t *missing_node = pm_missing_node_create(parser, node->location.start, node->location.end); + + pm_node_unreference(parser, node); + pm_node_destroy(parser, node); + return UP(missing_node); + } + + // Now that we have a primitive, we need to check if it's part of a range. + if (accept2(parser, PM_TOKEN_DOT_DOT, PM_TOKEN_DOT_DOT_DOT)) { + pm_token_t operator = parser->previous; + + // Now that we have the operator, we need to check if this is followed + // by another expression. If it is, then we will create a full range + // node. Otherwise, we'll create an endless range. 
+ switch (parser->current.type) { + case PM_CASE_PRIMITIVE: { + pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_MAX, false, false, PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE, (uint16_t) (depth + 1)); + return UP(pm_range_node_create(parser, node, &operator, right)); + } + default: + return UP(pm_range_node_create(parser, node, &operator, NULL)); + } + } + + return node; + } + case PM_TOKEN_CARET: { + parser_lex(parser); + pm_token_t operator = parser->previous; + + // At this point we have a pin operator. We need to check the subsequent + // expression to determine if it's a variable or an expression. + switch (parser->current.type) { + case PM_TOKEN_IDENTIFIER: { + parser_lex(parser); + pm_node_t *variable = UP(parse_variable(parser)); + + if (variable == NULL) { + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->previous, PM_ERR_NO_LOCAL_VARIABLE); + variable = UP(pm_local_variable_read_node_missing_create(parser, &parser->previous, 0)); + } + + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); + } + case PM_TOKEN_INSTANCE_VARIABLE: { + parser_lex(parser); + pm_node_t *variable = UP(pm_instance_variable_read_node_create(parser, &parser->previous)); + + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); + } + case PM_TOKEN_CLASS_VARIABLE: { + parser_lex(parser); + pm_node_t *variable = UP(pm_class_variable_read_node_create(parser, &parser->previous)); + + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); + } + case PM_TOKEN_GLOBAL_VARIABLE: { + parser_lex(parser); + pm_node_t *variable = UP(pm_global_variable_read_node_create(parser, &parser->previous)); + + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); + } + case PM_TOKEN_NUMBERED_REFERENCE: { + parser_lex(parser); + pm_node_t *variable = UP(pm_numbered_reference_read_node_create(parser, &parser->previous)); + + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); + } + case 
PM_TOKEN_BACK_REFERENCE: { + parser_lex(parser); + pm_node_t *variable = UP(pm_back_reference_read_node_create(parser, &parser->previous)); + + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); + } + case PM_TOKEN_PARENTHESIS_LEFT: { + bool previous_pattern_matching_newlines = parser->pattern_matching_newlines; + parser->pattern_matching_newlines = false; + + pm_token_t lparen = parser->current; + parser_lex(parser); + + pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_STATEMENT, true, false, PM_ERR_PATTERN_EXPRESSION_AFTER_PIN, (uint16_t) (depth + 1)); + parser->pattern_matching_newlines = previous_pattern_matching_newlines; + + accept1(parser, PM_TOKEN_NEWLINE); + expect1_opening(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_PATTERN_TERM_PAREN, &lparen); + return UP(pm_pinned_expression_node_create(parser, expression, &operator, &lparen, &parser->previous)); + } + default: { + // If we get here, then we have a pin operator followed by something + // not understood. We'll create a missing node and return that. 
+ pm_parser_err_token(parser, &operator, PM_ERR_PATTERN_EXPRESSION_AFTER_PIN); + pm_node_t *variable = UP(pm_missing_node_create(parser, operator.start, operator.end)); + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); + } + } + } + case PM_TOKEN_UCOLON_COLON: { + pm_token_t delimiter = parser->current; + parser_lex(parser); + + expect1(parser, PM_TOKEN_CONSTANT, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT); + pm_constant_path_node_t *node = pm_constant_path_node_create(parser, NULL, &delimiter, &parser->previous); + + return parse_pattern_constant_path(parser, captures, UP(node), (uint16_t) (depth + 1)); + } + case PM_TOKEN_CONSTANT: { + pm_token_t constant = parser->current; + parser_lex(parser); + + pm_node_t *node = UP(pm_constant_read_node_create(parser, &constant)); + return parse_pattern_constant_path(parser, captures, node, (uint16_t) (depth + 1)); + } + default: + pm_parser_err_current(parser, diag_id); + return UP(pm_missing_node_create(parser, parser->current.start, parser->current.end)); + } +} + +static bool +parse_pattern_alternation_error_each(const pm_node_t *node, void *data) { + switch (PM_NODE_TYPE(node)) { + case PM_LOCAL_VARIABLE_TARGET_NODE: + pm_parser_err((pm_parser_t *) data, node->location.start, node->location.end, PM_ERR_PATTERN_CAPTURE_IN_ALTERNATIVE); + return false; + default: + return true; + } +} + +/** + * When we get here, we know that we already have a syntax error, because we + * know we have captured a variable and that we are in an alternation. + */ +static void +parse_pattern_alternation_error(pm_parser_t *parser, const pm_node_t *node) { + pm_visit_node(node, parse_pattern_alternation_error_each, parser); +} + +/** + * Parse any number of primitives joined by alternation and ended optionally by + * assignment. 
+ */ +static pm_node_t * +parse_pattern_primitives(pm_parser_t *parser, pm_constant_id_list_t *captures, pm_node_t *first_node, pm_diagnostic_id_t diag_id, uint16_t depth) { + pm_node_t *node = first_node; + bool alternation = false; + + while ((node == NULL) || (alternation = accept1(parser, PM_TOKEN_PIPE))) { + if (alternation && !PM_NODE_TYPE_P(node, PM_ALTERNATION_PATTERN_NODE) && captures->size) { + parse_pattern_alternation_error(parser, node); + } + + switch (parser->current.type) { + case PM_TOKEN_IDENTIFIER: + case PM_TOKEN_BRACKET_LEFT_ARRAY: + case PM_TOKEN_BRACE_LEFT: + case PM_TOKEN_CARET: + case PM_TOKEN_CONSTANT: + case PM_TOKEN_UCOLON_COLON: + case PM_TOKEN_UDOT_DOT: + case PM_TOKEN_UDOT_DOT_DOT: + case PM_CASE_PRIMITIVE: { + if (!alternation) { + node = parse_pattern_primitive(parser, captures, diag_id, (uint16_t) (depth + 1)); + } else { + pm_token_t operator = parser->previous; + pm_node_t *right = parse_pattern_primitive(parser, captures, PM_ERR_PATTERN_EXPRESSION_AFTER_PIPE, (uint16_t) (depth + 1)); + + if (captures->size) parse_pattern_alternation_error(parser, right); + node = UP(pm_alternation_pattern_node_create(parser, node, right, &operator)); + } + + break; + } + case PM_TOKEN_PARENTHESIS_LEFT: + case PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES: { + pm_token_t operator = parser->previous; + pm_token_t opening = parser->current; + parser_lex(parser); + + pm_node_t *body = parse_pattern(parser, captures, PM_PARSE_PATTERN_SINGLE, PM_ERR_PATTERN_EXPRESSION_AFTER_PAREN, (uint16_t) (depth + 1)); + accept1(parser, PM_TOKEN_NEWLINE); + expect1_opening(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_PATTERN_TERM_PAREN, &opening); + pm_node_t *right = UP(pm_parentheses_node_create(parser, &opening, body, &parser->previous, 0)); + + if (!alternation) { + node = right; + } else { + if (captures->size) parse_pattern_alternation_error(parser, right); + node = UP(pm_alternation_pattern_node_create(parser, node, right, &operator)); + } + + break; + } + default: { 
+ pm_parser_err_current(parser, diag_id); + pm_node_t *right = UP(pm_missing_node_create(parser, parser->current.start, parser->current.end)); + + if (!alternation) { + node = right; + } else { + if (captures->size) parse_pattern_alternation_error(parser, right); + node = UP(pm_alternation_pattern_node_create(parser, node, right, &parser->previous)); + } + + break; + } + } + } + + // If we have an =>, then we are assigning this pattern to a variable. + // In this case we should create an assignment node. + while (accept1(parser, PM_TOKEN_EQUAL_GREATER)) { + pm_token_t operator = parser->previous; + expect1(parser, PM_TOKEN_IDENTIFIER, PM_ERR_PATTERN_IDENT_AFTER_HROCKET); + + pm_constant_id_t constant_id = pm_parser_constant_id_token(parser, &parser->previous); + int depth; + + if ((depth = pm_parser_local_depth_constant_id(parser, constant_id)) == -1) { + pm_parser_local_add(parser, constant_id, parser->previous.start, parser->previous.end, 0); + } + + parse_pattern_capture(parser, captures, constant_id, &PM_LOCATION_TOKEN_VALUE(&parser->previous)); + pm_local_variable_target_node_t *target = pm_local_variable_target_node_create( + parser, + &PM_LOCATION_TOKEN_VALUE(&parser->previous), + constant_id, + (uint32_t) (depth == -1 ? 0 : depth) + ); + + node = UP(pm_capture_pattern_node_create(parser, node, target, &operator)); + } + + return node; +} + +/** + * Parse a pattern matching expression. 
 */
static pm_node_t *
parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flags, pm_diagnostic_id_t diag_id, uint16_t depth) {
    pm_node_t *node = NULL;

    // leading_rest is set when the very first element is a `*` rest pattern.
    // trailing_rest is set once a later rest (explicit splat or the implicit
    // rest created for a trailing comma) has been seen in the comma list.
    bool leading_rest = false;
    bool trailing_rest = false;

    switch (parser->current.type) {
        case PM_TOKEN_LABEL: {
            // A label starts a hash pattern without braces. This is reported
            // as an error unless the PM_PARSE_PATTERN_TOP flag is set.
            parser_lex(parser);
            pm_node_t *key = UP(pm_symbol_node_label_create(parser, &parser->previous));
            node = UP(parse_pattern_hash(parser, captures, key, (uint16_t) (depth + 1)));

            if (!(flags & PM_PARSE_PATTERN_TOP)) {
                pm_parser_err_node(parser, node, PM_ERR_PATTERN_HASH_IMPLICIT);
            }

            return node;
        }
        case PM_TOKEN_USTAR_STAR: {
            // A leading `**` is handled as the first element of a hash pattern
            // without braces, with the same restriction as above.
            node = parse_pattern_keyword_rest(parser, captures);
            node = UP(parse_pattern_hash(parser, captures, node, (uint16_t) (depth + 1)));

            if (!(flags & PM_PARSE_PATTERN_TOP)) {
                pm_parser_err_node(parser, node, PM_ERR_PATTERN_HASH_IMPLICIT);
            }

            return node;
        }
        case PM_TOKEN_STRING_BEGIN: {
            // We need special handling for string beginnings because they could
            // be dynamic symbols leading to hash patterns.
            node = parse_pattern_primitive(parser, captures, diag_id, (uint16_t) (depth + 1));

            if (pm_symbol_node_label_p(node)) {
                node = UP(parse_pattern_hash(parser, captures, node, (uint16_t) (depth + 1)));

                if (!(flags & PM_PARSE_PATTERN_TOP)) {
                    pm_parser_err_node(parser, node, PM_ERR_PATTERN_HASH_IMPLICIT);
                }

                return node;
            }

            // Otherwise continue with the already-parsed primitive as the
            // first node so that alternations and captures are picked up.
            node = parse_pattern_primitives(parser, captures, node, diag_id, (uint16_t) (depth + 1));
            break;
        }
        case PM_TOKEN_USTAR: {
            // A leading `*` is only consumed as a rest pattern when the TOP or
            // MULTI flag is set. Otherwise we fall through to the default case
            // without consuming the token.
            if (flags & (PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI)) {
                parser_lex(parser);
                node = UP(parse_pattern_rest(parser, captures));
                leading_rest = true;
                break;
            }
        }
        PRISM_FALLTHROUGH
        default:
            node = parse_pattern_primitives(parser, captures, NULL, diag_id, (uint16_t) (depth + 1));
            break;
    }

    // If we got a dynamic label symbol, then we need to treat it like the
    // beginning of a hash pattern.
    if (pm_symbol_node_label_p(node)) {
        return UP(parse_pattern_hash(parser, captures, node, (uint16_t) (depth + 1)));
    }

    if ((flags & PM_PARSE_PATTERN_MULTI) && match1(parser, PM_TOKEN_COMMA)) {
        // If we have a comma, then we are now parsing either an array pattern
        // or a find pattern. We need to parse all of the patterns, put them
        // into a big list, and then determine which type of node we have.
        pm_node_list_t nodes = { 0 };
        pm_node_list_append(&nodes, node);

        // Gather up all of the patterns into the list.
        while (accept1(parser, PM_TOKEN_COMMA)) {
            // Break early here in case we have a trailing comma. The trailing
            // comma is represented by an implicit rest node built from the
            // comma token itself.
            if (match7(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_SEMICOLON, PM_TOKEN_KEYWORD_AND, PM_TOKEN_KEYWORD_OR)) {
                node = UP(pm_implicit_rest_node_create(parser, &parser->previous));
                pm_node_list_append(&nodes, node);
                trailing_rest = true;
                break;
            }

            if (accept1(parser, PM_TOKEN_USTAR)) {
                node = UP(parse_pattern_rest(parser, captures));

                // If we have already parsed a splat pattern, then this is an
                // error. We will continue to parse the rest of the patterns,
                // but we will indicate it as an error.
                if (trailing_rest) {
                    pm_parser_err_previous(parser, PM_ERR_PATTERN_REST);
                }

                trailing_rest = true;
            } else {
                node = parse_pattern_primitives(parser, captures, NULL, PM_ERR_PATTERN_EXPRESSION_AFTER_COMMA, (uint16_t) (depth + 1));
            }

            pm_node_list_append(&nodes, node);
        }

        // If the first pattern and the last pattern are rest patterns, then we
        // will call this a find pattern, regardless of how many rest patterns
        // are in between because we know we already added the appropriate
        // errors. Otherwise we will create an array pattern.
        if (leading_rest && PM_NODE_TYPE_P(nodes.nodes[nodes.size - 1], PM_SPLAT_NODE)) {
            node = UP(pm_find_pattern_node_create(parser, &nodes));

            // Exactly two elements means there is nothing between the leading
            // and the trailing rest.
            if (nodes.size == 2) {
                pm_parser_err_node(parser, node, PM_ERR_PATTERN_FIND_MISSING_INNER);
            }
        } else {
            node = UP(pm_array_pattern_node_node_list_create(parser, &nodes));

            if (leading_rest && trailing_rest) {
                pm_parser_err_node(parser, node, PM_ERR_PATTERN_ARRAY_MULTIPLE_RESTS);
            }
        }

        // Only the temporary list storage is released here; the nodes
        // themselves are referenced by the pattern node created above.
        xfree(nodes.nodes);
    } else if (leading_rest) {
        // Otherwise, if we parsed a single splat pattern, then we know we have
        // an array pattern, so we can go ahead and create that node.
        node = UP(pm_array_pattern_node_rest_create(parser, node));
    }

    return node;
}

/**
 * Incorporate a negative sign into a numeric node by subtracting 1 character
 * from its start bounds. If it's a compound node, then we will recursively
 * apply this function to its value.
 *
 * Only integer, float, rational, and imaginary nodes are handled; any other
 * node type trips the assertion in the default branch.
 */
static inline void
parse_negative_numeric(pm_node_t *node) {
    switch (PM_NODE_TYPE(node)) {
        case PM_INTEGER_NODE: {
            pm_integer_node_t *cast = (pm_integer_node_t *) node;
            cast->base.location.start--;
            cast->value.negative = true;
            break;
        }
        case PM_FLOAT_NODE: {
            pm_float_node_t *cast = (pm_float_node_t *) node;
            cast->base.location.start--;
            cast->value = -cast->value;
            break;
        }
        case PM_RATIONAL_NODE: {
            // Only the numerator carries the sign.
            pm_rational_node_t *cast = (pm_rational_node_t *) node;
            cast->base.location.start--;
            cast->numerator.negative = true;
            break;
        }
        case PM_IMAGINARY_NODE:
            // Widen the wrapper's location, then negate the wrapped numeric
            // (which widens its own location as well).
            node->location.start--;
            parse_negative_numeric(((pm_imaginary_node_t *) node)->numeric);
            break;
        default:
            assert(false && "unreachable");
            break;
    }
}

/**
 * Append an error to the error list on the parser using the given diagnostic
 * ID. This function is a specialization that handles formatting the specific
 * kind of error that is being appended.
+ */ +static void +pm_parser_err_prefix(pm_parser_t *parser, pm_diagnostic_id_t diag_id) { + switch (diag_id) { + case PM_ERR_HASH_KEY: { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, diag_id, pm_token_type_human(parser->previous.type)); + break; + } + case PM_ERR_HASH_VALUE: + case PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR: { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, pm_token_type_human(parser->current.type)); + break; + } + case PM_ERR_UNARY_RECEIVER: { + const char *human = (parser->current.type == PM_TOKEN_EOF ? "end-of-input" : pm_token_type_human(parser->current.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, diag_id, human, parser->previous.start[0]); + break; + } + case PM_ERR_UNARY_DISALLOWED: + case PM_ERR_EXPECT_ARGUMENT: { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, pm_token_type_human(parser->current.type)); + break; + } + default: + pm_parser_err_previous(parser, diag_id); + break; + } +} + +/** + * Ensures that the current retry token is valid in the current context. + */ +static void +parse_retry(pm_parser_t *parser, const pm_node_t *node) { +#define CONTEXT_NONE 0 +#define CONTEXT_THROUGH_ENSURE 1 +#define CONTEXT_THROUGH_ELSE 2 + + pm_context_node_t *context_node = parser->current_context; + int context = CONTEXT_NONE; + + while (context_node != NULL) { + switch (context_node->context) { + case PM_CONTEXT_BEGIN_RESCUE: + case PM_CONTEXT_BLOCK_RESCUE: + case PM_CONTEXT_CLASS_RESCUE: + case PM_CONTEXT_DEF_RESCUE: + case PM_CONTEXT_LAMBDA_RESCUE: + case PM_CONTEXT_MODULE_RESCUE: + case PM_CONTEXT_SCLASS_RESCUE: + case PM_CONTEXT_DEFINED: + case PM_CONTEXT_RESCUE_MODIFIER: + // These are the good cases. We're allowed to have a retry here. + return; + case PM_CONTEXT_CLASS: + case PM_CONTEXT_DEF: + case PM_CONTEXT_DEF_PARAMS: + case PM_CONTEXT_MAIN: + case PM_CONTEXT_MODULE: + case PM_CONTEXT_PREEXE: + case PM_CONTEXT_SCLASS: + // These are the bad cases. 
We're not allowed to have a retry in + // these contexts. + if (context == CONTEXT_NONE) { + pm_parser_err_node(parser, node, PM_ERR_INVALID_RETRY_WITHOUT_RESCUE); + } else if (context == CONTEXT_THROUGH_ENSURE) { + pm_parser_err_node(parser, node, PM_ERR_INVALID_RETRY_AFTER_ENSURE); + } else if (context == CONTEXT_THROUGH_ELSE) { + pm_parser_err_node(parser, node, PM_ERR_INVALID_RETRY_AFTER_ELSE); + } + return; + case PM_CONTEXT_BEGIN_ELSE: + case PM_CONTEXT_BLOCK_ELSE: + case PM_CONTEXT_CLASS_ELSE: + case PM_CONTEXT_DEF_ELSE: + case PM_CONTEXT_LAMBDA_ELSE: + case PM_CONTEXT_MODULE_ELSE: + case PM_CONTEXT_SCLASS_ELSE: + // These are also bad cases, but with a more specific error + // message indicating the else. + context = CONTEXT_THROUGH_ELSE; + break; + case PM_CONTEXT_BEGIN_ENSURE: + case PM_CONTEXT_BLOCK_ENSURE: + case PM_CONTEXT_CLASS_ENSURE: + case PM_CONTEXT_DEF_ENSURE: + case PM_CONTEXT_LAMBDA_ENSURE: + case PM_CONTEXT_MODULE_ENSURE: + case PM_CONTEXT_SCLASS_ENSURE: + // These are also bad cases, but with a more specific error + // message indicating the ensure. + context = CONTEXT_THROUGH_ENSURE; + break; + case PM_CONTEXT_NONE: + // This case should never happen. 
+ assert(false && "unreachable"); + break; + case PM_CONTEXT_BEGIN: + case PM_CONTEXT_BLOCK_BRACES: + case PM_CONTEXT_BLOCK_KEYWORDS: + case PM_CONTEXT_BLOCK_PARAMETERS: + case PM_CONTEXT_CASE_IN: + case PM_CONTEXT_CASE_WHEN: + case PM_CONTEXT_DEFAULT_PARAMS: + case PM_CONTEXT_ELSE: + case PM_CONTEXT_ELSIF: + case PM_CONTEXT_EMBEXPR: + case PM_CONTEXT_FOR_INDEX: + case PM_CONTEXT_FOR: + case PM_CONTEXT_IF: + case PM_CONTEXT_LAMBDA_BRACES: + case PM_CONTEXT_LAMBDA_DO_END: + case PM_CONTEXT_LOOP_PREDICATE: + case PM_CONTEXT_MULTI_TARGET: + case PM_CONTEXT_PARENS: + case PM_CONTEXT_POSTEXE: + case PM_CONTEXT_PREDICATE: + case PM_CONTEXT_TERNARY: + case PM_CONTEXT_UNLESS: + case PM_CONTEXT_UNTIL: + case PM_CONTEXT_WHILE: + // In these contexts we should continue walking up the list of + // contexts. + break; + } + + context_node = context_node->prev; + } + +#undef CONTEXT_NONE +#undef CONTEXT_ENSURE +#undef CONTEXT_ELSE +} + +/** + * Ensures that the current yield token is valid in the current context. + */ +static void +parse_yield(pm_parser_t *parser, const pm_node_t *node) { + pm_context_node_t *context_node = parser->current_context; + + while (context_node != NULL) { + switch (context_node->context) { + case PM_CONTEXT_DEF: + case PM_CONTEXT_DEF_PARAMS: + case PM_CONTEXT_DEFINED: + case PM_CONTEXT_DEF_ENSURE: + case PM_CONTEXT_DEF_RESCUE: + case PM_CONTEXT_DEF_ELSE: + // These are the good cases. We're allowed to have a block exit + // in these contexts. + return; + case PM_CONTEXT_CLASS: + case PM_CONTEXT_CLASS_ENSURE: + case PM_CONTEXT_CLASS_RESCUE: + case PM_CONTEXT_CLASS_ELSE: + case PM_CONTEXT_MAIN: + case PM_CONTEXT_MODULE: + case PM_CONTEXT_MODULE_ENSURE: + case PM_CONTEXT_MODULE_RESCUE: + case PM_CONTEXT_MODULE_ELSE: + case PM_CONTEXT_SCLASS: + case PM_CONTEXT_SCLASS_RESCUE: + case PM_CONTEXT_SCLASS_ENSURE: + case PM_CONTEXT_SCLASS_ELSE: + // These are the bad cases. We're not allowed to have a retry in + // these contexts. 
+ pm_parser_err_node(parser, node, PM_ERR_INVALID_YIELD); + return; + case PM_CONTEXT_NONE: + // This case should never happen. + assert(false && "unreachable"); + break; + case PM_CONTEXT_BEGIN: + case PM_CONTEXT_BEGIN_ELSE: + case PM_CONTEXT_BEGIN_ENSURE: + case PM_CONTEXT_BEGIN_RESCUE: + case PM_CONTEXT_BLOCK_BRACES: + case PM_CONTEXT_BLOCK_KEYWORDS: + case PM_CONTEXT_BLOCK_ELSE: + case PM_CONTEXT_BLOCK_ENSURE: + case PM_CONTEXT_BLOCK_PARAMETERS: + case PM_CONTEXT_BLOCK_RESCUE: + case PM_CONTEXT_CASE_IN: + case PM_CONTEXT_CASE_WHEN: + case PM_CONTEXT_DEFAULT_PARAMS: + case PM_CONTEXT_ELSE: + case PM_CONTEXT_ELSIF: + case PM_CONTEXT_EMBEXPR: + case PM_CONTEXT_FOR_INDEX: + case PM_CONTEXT_FOR: + case PM_CONTEXT_IF: + case PM_CONTEXT_LAMBDA_BRACES: + case PM_CONTEXT_LAMBDA_DO_END: + case PM_CONTEXT_LAMBDA_ELSE: + case PM_CONTEXT_LAMBDA_ENSURE: + case PM_CONTEXT_LAMBDA_RESCUE: + case PM_CONTEXT_LOOP_PREDICATE: + case PM_CONTEXT_MULTI_TARGET: + case PM_CONTEXT_PARENS: + case PM_CONTEXT_POSTEXE: + case PM_CONTEXT_PREDICATE: + case PM_CONTEXT_PREEXE: + case PM_CONTEXT_RESCUE_MODIFIER: + case PM_CONTEXT_TERNARY: + case PM_CONTEXT_UNLESS: + case PM_CONTEXT_UNTIL: + case PM_CONTEXT_WHILE: + // In these contexts we should continue walking up the list of + // contexts. + break; + } + + context_node = context_node->prev; + } +} + +/** + * This struct is used to pass information between the regular expression parser + * and the error callback. + */ +typedef struct { + /** The parser that we are parsing the regular expression for. */ + pm_parser_t *parser; + + /** The start of the regular expression. */ + const uint8_t *start; + + /** The end of the regular expression. */ + const uint8_t *end; + + /** + * Whether or not the source of the regular expression is shared. This + * impacts the location of error messages, because if it is shared then we + * can use the location directly and if it is not, then we use the bounds of + * the regular expression itself. 
+ */ + bool shared; +} parse_regular_expression_error_data_t; + +/** + * This callback is called when the regular expression parser encounters a + * syntax error. + */ +static void +parse_regular_expression_error(const uint8_t *start, const uint8_t *end, const char *message, void *data) { + parse_regular_expression_error_data_t *callback_data = (parse_regular_expression_error_data_t *) data; + pm_location_t location; + + if (callback_data->shared) { + location = (pm_location_t) { .start = start, .end = end }; + } else { + location = (pm_location_t) { .start = callback_data->start, .end = callback_data->end }; + } + + PM_PARSER_ERR_FORMAT(callback_data->parser, location.start, location.end, PM_ERR_REGEXP_PARSE_ERROR, message); +} + +/** + * Parse the errors for the regular expression and add them to the parser. + */ +static void +parse_regular_expression_errors(pm_parser_t *parser, pm_regular_expression_node_t *node) { + const pm_string_t *unescaped = &node->unescaped; + parse_regular_expression_error_data_t error_data = { + .parser = parser, + .start = node->base.location.start, + .end = node->base.location.end, + .shared = unescaped->type == PM_STRING_SHARED + }; + + pm_regexp_parse(parser, pm_string_source(unescaped), pm_string_length(unescaped), PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED), NULL, NULL, parse_regular_expression_error, &error_data); +} + +/** + * Parse an expression that begins with the previous node that we just lexed. 
+ */ +static inline pm_node_t * +parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, bool accepts_label, pm_diagnostic_id_t diag_id, uint16_t depth) { + switch (parser->current.type) { + case PM_TOKEN_BRACKET_LEFT_ARRAY: { + parser_lex(parser); + + pm_array_node_t *array = pm_array_node_create(parser, &parser->previous); + pm_accepts_block_stack_push(parser, true); + bool parsed_bare_hash = false; + + while (!match2(parser, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_EOF)) { + bool accepted_newline = accept1(parser, PM_TOKEN_NEWLINE); + + // Handle the case where we don't have a comma and we have a + // newline followed by a right bracket. + if (accepted_newline && match1(parser, PM_TOKEN_BRACKET_RIGHT)) { + break; + } + + // Ensure that we have a comma between elements in the array. + if (array->elements.size > 0) { + if (accept1(parser, PM_TOKEN_COMMA)) { + // If there was a comma but we also accepts a newline, + // then this is a syntax error. + if (accepted_newline) { + pm_parser_err_previous(parser, PM_ERR_INVALID_COMMA); + } + } else { + // If there was no comma, then we need to add a syntax + // error. + const uint8_t *location = parser->previous.end; + PM_PARSER_ERR_FORMAT(parser, location, location, PM_ERR_ARRAY_SEPARATOR, pm_token_type_human(parser->current.type)); + + parser->previous.start = location; + parser->previous.type = PM_TOKEN_MISSING; + } + } + + // If we have a right bracket immediately following a comma, + // this is allowed since it's a trailing comma. In this case we + // can break out of the loop. 
+ if (match1(parser, PM_TOKEN_BRACKET_RIGHT)) break; + + pm_node_t *element; + + if (accept1(parser, PM_TOKEN_USTAR)) { + pm_token_t operator = parser->previous; + pm_node_t *expression = NULL; + + if (match3(parser, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_COMMA, PM_TOKEN_EOF)) { + pm_parser_scope_forwarding_positionals_check(parser, &operator); + } else { + expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_ARRAY_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); + } + + element = UP(pm_splat_node_create(parser, &operator, expression)); + } else if (match2(parser, PM_TOKEN_LABEL, PM_TOKEN_USTAR_STAR)) { + if (parsed_bare_hash) { + pm_parser_err_current(parser, PM_ERR_EXPRESSION_BARE_HASH); + } + + element = UP(pm_keyword_hash_node_create(parser)); + pm_static_literals_t hash_keys = { 0 }; + + if (!match8(parser, PM_TOKEN_EOF, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_TOKEN_EOF, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_KEYWORD_DO, PM_TOKEN_PARENTHESIS_RIGHT)) { + parse_assocs(parser, &hash_keys, element, (uint16_t) (depth + 1)); + } + + pm_static_literals_free(&hash_keys); + parsed_bare_hash = true; + } else { + element = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, true, PM_ERR_ARRAY_EXPRESSION, (uint16_t) (depth + 1)); + + if (pm_symbol_node_label_p(element) || accept1(parser, PM_TOKEN_EQUAL_GREATER)) { + if (parsed_bare_hash) { + pm_parser_err_previous(parser, PM_ERR_EXPRESSION_BARE_HASH); + } + + pm_keyword_hash_node_t *hash = pm_keyword_hash_node_create(parser); + pm_static_literals_t hash_keys = { 0 }; + pm_hash_key_static_literals_add(parser, &hash_keys, element); + + pm_token_t operator; + if (parser->previous.type == PM_TOKEN_EQUAL_GREATER) { + operator = parser->previous; + } else { + operator = not_provided(parser); + } + + pm_node_t *value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_HASH_VALUE, (uint16_t) (depth + 1)); + pm_node_t *assoc = 
UP(pm_assoc_node_create(parser, element, &operator, value)); + pm_keyword_hash_node_elements_append(hash, assoc); + + element = UP(hash); + if (accept1(parser, PM_TOKEN_COMMA) && !match1(parser, PM_TOKEN_BRACKET_RIGHT)) { + parse_assocs(parser, &hash_keys, element, (uint16_t) (depth + 1)); + } + + pm_static_literals_free(&hash_keys); + parsed_bare_hash = true; + } + } + + pm_array_node_elements_append(array, element); + if (PM_NODE_TYPE_P(element, PM_MISSING_NODE)) break; + } + + accept1(parser, PM_TOKEN_NEWLINE); + + if (!accept1(parser, PM_TOKEN_BRACKET_RIGHT)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_ARRAY_TERM, pm_token_type_human(parser->current.type)); + parser->previous.start = parser->previous.end; + parser->previous.type = PM_TOKEN_MISSING; + } + + pm_array_node_close_set(array, &parser->previous); + pm_accepts_block_stack_pop(parser); + + return UP(array); + } + case PM_TOKEN_PARENTHESIS_LEFT: + case PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES: { + pm_token_t opening = parser->current; + pm_node_flags_t flags = 0; + + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + parser_lex(parser); + while (true) { + if (accept1(parser, PM_TOKEN_SEMICOLON)) { + flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; + } else if (!accept1(parser, PM_TOKEN_NEWLINE)) { + break; + } + } + + // If this is the end of the file or we match a right parenthesis, then + // we have an empty parentheses node, and we can immediately return. + if (match2(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_EOF)) { + expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); + + pop_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + return UP(pm_parentheses_node_create(parser, &opening, NULL, &parser->previous, flags)); + } + + // Otherwise, we're going to parse the first statement in the list + // of statements within the parentheses. 
+ pm_accepts_block_stack_push(parser, true); + context_push(parser, PM_CONTEXT_PARENS); + pm_node_t *statement = parse_expression(parser, PM_BINDING_POWER_STATEMENT, true, false, PM_ERR_CANNOT_PARSE_EXPRESSION, (uint16_t) (depth + 1)); + context_pop(parser); + + // Determine if this statement is followed by a terminator. In the + // case of a single statement, this is fine. But in the case of + // multiple statements it's required. + bool terminator_found = false; + + if (accept1(parser, PM_TOKEN_SEMICOLON)) { + terminator_found = true; + flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; + } else if (accept1(parser, PM_TOKEN_NEWLINE)) { + terminator_found = true; + } + + if (terminator_found) { + while (true) { + if (accept1(parser, PM_TOKEN_SEMICOLON)) { + flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; + } else if (!accept1(parser, PM_TOKEN_NEWLINE)) { + break; + } + } + } + + // If we hit a right parenthesis, then we're done parsing the + // parentheses node, and we can check which kind of node we should + // return. + if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + if (opening.type == PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES) { + lex_state_set(parser, PM_LEX_STATE_ENDARG); + } + + parser_lex(parser); + pm_accepts_block_stack_pop(parser); + + pop_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE) || PM_NODE_TYPE_P(statement, PM_SPLAT_NODE)) { + // If we have a single statement and are ending on a right + // parenthesis, then we need to check if this is possibly a + // multiple target node. 
+ pm_multi_target_node_t *multi_target; + + if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE) && ((pm_multi_target_node_t *) statement)->lparen_loc.start == NULL) { + multi_target = (pm_multi_target_node_t *) statement; + } else { + multi_target = pm_multi_target_node_create(parser); + pm_multi_target_node_targets_append(parser, multi_target, statement); + } + + pm_location_t lparen_loc = PM_LOCATION_TOKEN_VALUE(&opening); + pm_location_t rparen_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); + + multi_target->lparen_loc = lparen_loc; + multi_target->rparen_loc = rparen_loc; + multi_target->base.location.start = lparen_loc.start; + multi_target->base.location.end = rparen_loc.end; + + pm_node_t *result; + if (match1(parser, PM_TOKEN_COMMA) && (binding_power == PM_BINDING_POWER_STATEMENT)) { + result = parse_targets(parser, UP(multi_target), PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + accept1(parser, PM_TOKEN_NEWLINE); + } else { + result = UP(multi_target); + } + + if (context_p(parser, PM_CONTEXT_MULTI_TARGET)) { + // All set, this is explicitly allowed by the parent + // context. + } else if (context_p(parser, PM_CONTEXT_FOR_INDEX) && match1(parser, PM_TOKEN_KEYWORD_IN)) { + // All set, we're inside a for loop and we're parsing + // multiple targets. + } else if (binding_power != PM_BINDING_POWER_STATEMENT) { + // Multi targets are not allowed when it's not a + // statement level. + pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED); + } else if (!match2(parser, PM_TOKEN_EQUAL, PM_TOKEN_PARENTHESIS_RIGHT)) { + // Multi targets must be followed by an equal sign in + // order to be valid (or a right parenthesis if they are + // nested). + pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED); + } + + return result; + } + + // If we have a single statement and are ending on a right parenthesis + // and we didn't return a multiple assignment node, then we can return a + // regular parentheses node now. 
+ pm_statements_node_t *statements = pm_statements_node_create(parser); + pm_statements_node_body_append(parser, statements, statement, true); + + return UP(pm_parentheses_node_create(parser, &opening, UP(statements), &parser->previous, flags)); + } + + // If we have more than one statement in the set of parentheses, + // then we are going to parse all of them as a list of statements. + // We'll do that here. + context_push(parser, PM_CONTEXT_PARENS); + flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; + + pm_statements_node_t *statements = pm_statements_node_create(parser); + pm_statements_node_body_append(parser, statements, statement, true); + + // If we didn't find a terminator and we didn't find a right + // parenthesis, then this is a syntax error. + if (!terminator_found && !match1(parser, PM_TOKEN_EOF)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type)); + } + + // Parse each statement within the parentheses. + while (true) { + pm_node_t *node = parse_expression(parser, PM_BINDING_POWER_STATEMENT, true, false, PM_ERR_CANNOT_PARSE_EXPRESSION, (uint16_t) (depth + 1)); + pm_statements_node_body_append(parser, statements, node, true); + + // If we're recovering from a syntax error, then we need to stop + // parsing the statements now. + if (parser->recovering) { + // If this is the level of context where the recovery has + // happened, then we can mark the parser as done recovering. + if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) parser->recovering = false; + break; + } + + // If we couldn't parse an expression at all, then we need to + // bail out of the loop. + if (PM_NODE_TYPE_P(node, PM_MISSING_NODE)) break; + + // If we successfully parsed a statement, then we are going to + // need terminator to delimit them. 
+ if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); + if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) break; + } else if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + break; + } else if (!match1(parser, PM_TOKEN_EOF)) { + // If we're at the end of the file, then we're going to add + // an error after this for the ) anyway. + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type)); + } + } + + context_pop(parser); + pm_accepts_block_stack_pop(parser); + expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); + + // When we're parsing multi targets, we allow them to be followed by + // a right parenthesis if they are at the statement level. This is + // only possible if they are the final statement in a parentheses. + // We need to explicitly reject that here. + { + pm_node_t *statement = statements->body.nodes[statements->body.size - 1]; + + if (PM_NODE_TYPE_P(statement, PM_SPLAT_NODE)) { + pm_multi_target_node_t *multi_target = pm_multi_target_node_create(parser); + pm_multi_target_node_targets_append(parser, multi_target, statement); + + statement = UP(multi_target); + statements->body.nodes[statements->body.size - 1] = statement; + } + + if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE)) { + const uint8_t *offset = statement->location.end; + pm_token_t operator = { .type = PM_TOKEN_EQUAL, .start = offset, .end = offset }; + pm_node_t *value = UP(pm_missing_node_create(parser, offset, offset)); + + statement = UP(pm_multi_write_node_create(parser, (pm_multi_target_node_t *) statement, &operator, value)); + statements->body.nodes[statements->body.size - 1] = statement; + + pm_parser_err_node(parser, statement, PM_ERR_WRITE_TARGET_UNEXPECTED); + } + } + + pop_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + pm_void_statements_check(parser, statements, true); + return 
UP(pm_parentheses_node_create(parser, &opening, UP(statements), &parser->previous, flags)); + } + case PM_TOKEN_BRACE_LEFT: { + // If we were passed a current_hash_keys via the parser, then that + // means we're already parsing a hash and we want to share the set + // of hash keys with this inner hash we're about to parse for the + // sake of warnings. We'll set it to NULL after we grab it to make + // sure subsequent expressions don't use it. Effectively this is a + // way of getting around passing it to every call to + // parse_expression. + pm_static_literals_t *current_hash_keys = parser->current_hash_keys; + parser->current_hash_keys = NULL; + + pm_accepts_block_stack_push(parser, true); + parser_lex(parser); + + pm_token_t opening = parser->previous; + pm_hash_node_t *node = pm_hash_node_create(parser, &opening); + + if (!match2(parser, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_EOF)) { + if (current_hash_keys != NULL) { + parse_assocs(parser, current_hash_keys, UP(node), (uint16_t) (depth + 1)); + } else { + pm_static_literals_t hash_keys = { 0 }; + parse_assocs(parser, &hash_keys, UP(node), (uint16_t) (depth + 1)); + pm_static_literals_free(&hash_keys); + } + + accept1(parser, PM_TOKEN_NEWLINE); + } + + pm_accepts_block_stack_pop(parser); + expect1_opening(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_HASH_TERM, &opening); + pm_hash_node_closing_loc_set(node, &parser->previous); + + return UP(node); + } + case PM_TOKEN_CHARACTER_LITERAL: { + pm_token_t closing = not_provided(parser); + pm_node_t *node = UP(pm_string_node_create_current_string( + parser, + &(pm_token_t) { + .type = PM_TOKEN_STRING_BEGIN, + .start = parser->current.start, + .end = parser->current.start + 1 + }, + &(pm_token_t) { + .type = PM_TOKEN_STRING_CONTENT, + .start = parser->current.start + 1, + .end = parser->current.end + }, + &closing + )); + + pm_node_flag_set(node, parse_unescaped_encoding(parser)); + + // Skip past the character literal here, since now we have handled + // parser->explicit_encoding 
correctly. + parser_lex(parser); + + // Characters can be followed by strings in which case they are + // automatically concatenated. + if (match1(parser, PM_TOKEN_STRING_BEGIN)) { + return parse_strings(parser, node, false, (uint16_t) (depth + 1)); + } + + return node; + } + case PM_TOKEN_CLASS_VARIABLE: { + parser_lex(parser); + pm_node_t *node = UP(pm_class_variable_read_node_create(parser, &parser->previous)); + + if (binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { + node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } + + return node; + } + case PM_TOKEN_CONSTANT: { + parser_lex(parser); + pm_token_t constant = parser->previous; + + // If a constant is immediately followed by parentheses, then this is in + // fact a method call, not a constant read. + if ( + match1(parser, PM_TOKEN_PARENTHESIS_LEFT) || + (accepts_command_call && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR))) || + (pm_accepts_block_stack_p(parser) && match1(parser, PM_TOKEN_KEYWORD_DO)) || + match1(parser, PM_TOKEN_BRACE_LEFT) + ) { + pm_arguments_t arguments = { 0 }; + parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1)); + return UP(pm_call_node_fcall_create(parser, &constant, &arguments)); + } + + pm_node_t *node = UP(pm_constant_read_node_create(parser, &parser->previous)); + + if ((binding_power == PM_BINDING_POWER_STATEMENT) && match1(parser, PM_TOKEN_COMMA)) { + // If we get here, then we have a comma immediately following a + // constant, so we're going to parse this as a multiple assignment. 
+ node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } + + return node; + } + case PM_TOKEN_UCOLON_COLON: { + parser_lex(parser); + pm_token_t delimiter = parser->previous; + + expect1(parser, PM_TOKEN_CONSTANT, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT); + pm_node_t *node = UP(pm_constant_path_node_create(parser, NULL, &delimiter, &parser->previous)); + + if ((binding_power == PM_BINDING_POWER_STATEMENT) && match1(parser, PM_TOKEN_COMMA)) { + node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } + + return node; + } + case PM_TOKEN_UDOT_DOT: + case PM_TOKEN_UDOT_DOT_DOT: { + pm_token_t operator = parser->current; + parser_lex(parser); + + pm_node_t *right = parse_expression(parser, pm_binding_powers[operator.type].left, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + + // Unary .. and ... are special because these are non-associative + // operators that can also be unary operators. In this case we need + // to explicitly reject code that has a .. or ... that follows this + // expression. 
+ if (match2(parser, PM_TOKEN_DOT_DOT, PM_TOKEN_DOT_DOT_DOT)) { + pm_parser_err_current(parser, PM_ERR_UNEXPECTED_RANGE_OPERATOR); + } + + return UP(pm_range_node_create(parser, NULL, &operator, right)); + } + case PM_TOKEN_FLOAT: + parser_lex(parser); + return UP(pm_float_node_create(parser, &parser->previous)); + case PM_TOKEN_FLOAT_IMAGINARY: + parser_lex(parser); + return UP(pm_float_node_imaginary_create(parser, &parser->previous)); + case PM_TOKEN_FLOAT_RATIONAL: + parser_lex(parser); + return UP(pm_float_node_rational_create(parser, &parser->previous)); + case PM_TOKEN_FLOAT_RATIONAL_IMAGINARY: + parser_lex(parser); + return UP(pm_float_node_rational_imaginary_create(parser, &parser->previous)); + case PM_TOKEN_NUMBERED_REFERENCE: { + parser_lex(parser); + pm_node_t *node = UP(pm_numbered_reference_read_node_create(parser, &parser->previous)); + + if (binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { + node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } + + return node; + } + case PM_TOKEN_GLOBAL_VARIABLE: { + parser_lex(parser); + pm_node_t *node = UP(pm_global_variable_read_node_create(parser, &parser->previous)); + + if (binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { + node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } + + return node; + } + case PM_TOKEN_BACK_REFERENCE: { + parser_lex(parser); + pm_node_t *node = UP(pm_back_reference_read_node_create(parser, &parser->previous)); + + if (binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { + node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } + + return node; + } + case PM_TOKEN_IDENTIFIER: + case PM_TOKEN_METHOD_NAME: { + parser_lex(parser); + pm_token_t identifier = parser->previous; + pm_node_t *node = parse_variable_call(parser); + + if (PM_NODE_TYPE_P(node, PM_CALL_NODE)) { + 
// If parse_variable_call returned with a call node, then we + // know the identifier is not in the local table. In that case + // we need to check if there are arguments following the + // identifier. + pm_call_node_t *call = (pm_call_node_t *) node; + pm_arguments_t arguments = { 0 }; + + if (parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1))) { + // Since we found arguments, we need to turn off the + // variable call bit in the flags. + pm_node_flag_unset(UP(call), PM_CALL_NODE_FLAGS_VARIABLE_CALL); + + call->opening_loc = arguments.opening_loc; + call->arguments = arguments.arguments; + call->closing_loc = arguments.closing_loc; + call->block = arguments.block; + + const uint8_t *end = pm_arguments_end(&arguments); + if (!end) { + end = call->message_loc.end; + } + call->base.location.end = end; + } + } else { + // Otherwise, we know the identifier is in the local table. This + // can still be a method call if it is followed by arguments or + // a block, so we need to check for that here. + if ( + (accepts_command_call && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR))) || + (pm_accepts_block_stack_p(parser) && match1(parser, PM_TOKEN_KEYWORD_DO)) || + match1(parser, PM_TOKEN_BRACE_LEFT) + ) { + pm_arguments_t arguments = { 0 }; + parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1)); + pm_call_node_t *fcall = pm_call_node_fcall_create(parser, &identifier, &arguments); + + if (PM_NODE_TYPE_P(node, PM_IT_LOCAL_VARIABLE_READ_NODE)) { + // If we're about to convert an 'it' implicit local + // variable read into a method call, we need to remove + // it from the list of implicit local variables. 
+ pm_node_unreference(parser, node); + } else { + // Otherwise, we're about to convert a regular local + // variable read into a method call, in which case we + // need to indicate that this was not a read for the + // purposes of warnings. + assert(PM_NODE_TYPE_P(node, PM_LOCAL_VARIABLE_READ_NODE)); + + if (pm_token_is_numbered_parameter(identifier.start, identifier.end)) { + pm_node_unreference(parser, node); + } else { + pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node; + pm_locals_unread(&pm_parser_scope_find(parser, cast->depth)->locals, cast->name); + } + } + + pm_node_destroy(parser, node); + return UP(fcall); + } + } + + if ((binding_power == PM_BINDING_POWER_STATEMENT) && match1(parser, PM_TOKEN_COMMA)) { + node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } + + return node; + } + case PM_TOKEN_HEREDOC_START: { + // Here we have found a heredoc. We'll parse it and add it to the + // list of strings. + assert(parser->lex_modes.current->mode == PM_LEX_HEREDOC); + pm_heredoc_lex_mode_t lex_mode = parser->lex_modes.current->as.heredoc.base; + + size_t common_whitespace = (size_t) -1; + parser->lex_modes.current->as.heredoc.common_whitespace = &common_whitespace; + + parser_lex(parser); + pm_token_t opening = parser->previous; + + pm_node_t *node; + pm_node_t *part; + + if (match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { + // If we get here, then we have an empty heredoc. We'll create + // an empty content token and return an empty string node. 
+ expect1_heredoc_term(parser, lex_mode.ident_start, lex_mode.ident_length); + pm_token_t content = parse_strings_empty_content(parser->previous.start); + + if (lex_mode.quote == PM_HEREDOC_QUOTE_BACKTICK) { + node = UP(pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->previous, &PM_STRING_EMPTY)); + } else { + node = UP(pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &PM_STRING_EMPTY)); + } + + node->location.end = opening.end; + } else if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) == NULL) { + // If we get here, then we tried to find something in the + // heredoc but couldn't actually parse anything, so we'll just + // return a missing node. + // + // parse_string_part handles its own errors, so there is no need + // for us to add one here. + node = UP(pm_missing_node_create(parser, parser->previous.start, parser->previous.end)); + } else if (PM_NODE_TYPE_P(part, PM_STRING_NODE) && match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { + // If we get here, then the part that we parsed was plain string + // content and we're at the end of the heredoc, so we can return + // just a string node with the heredoc opening and closing as + // its opening and closing. 
+ pm_node_flag_set(part, parse_unescaped_encoding(parser)); + pm_string_node_t *cast = (pm_string_node_t *) part; + + cast->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); + cast->closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->current); + cast->base.location = cast->opening_loc; + + if (lex_mode.quote == PM_HEREDOC_QUOTE_BACKTICK) { + assert(sizeof(pm_string_node_t) == sizeof(pm_x_string_node_t)); + cast->base.type = PM_X_STRING_NODE; + } + + if (lex_mode.indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { + parse_heredoc_dedent_string(&cast->unescaped, common_whitespace); + } + + node = UP(cast); + expect1_heredoc_term(parser, lex_mode.ident_start, lex_mode.ident_length); + } else { + // If we get here, then we have multiple parts in the heredoc, + // so we'll need to create an interpolated string node to hold + // them all. + pm_node_list_t parts = { 0 }; + pm_node_list_append(&parts, part); + + while (!match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { + if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) != NULL) { + pm_node_list_append(&parts, part); + } + } + + // Now that we have all of the parts, create the correct type of + // interpolated node. 
+ if (lex_mode.quote == PM_HEREDOC_QUOTE_BACKTICK) { + pm_interpolated_x_string_node_t *cast = pm_interpolated_xstring_node_create(parser, &opening, &opening); + cast->parts = parts; + + expect1_heredoc_term(parser, lex_mode.ident_start, lex_mode.ident_length); + pm_interpolated_xstring_node_closing_set(cast, &parser->previous); + + cast->base.location = cast->opening_loc; + node = UP(cast); + } else { + pm_interpolated_string_node_t *cast = pm_interpolated_string_node_create(parser, &opening, &parts, &opening); + pm_node_list_free(&parts); + + expect1_heredoc_term(parser, lex_mode.ident_start, lex_mode.ident_length); + pm_interpolated_string_node_closing_set(cast, &parser->previous); + + cast->base.location = cast->opening_loc; + node = UP(cast); + } + + // If this is a heredoc that is indented with a ~, then we need + // to dedent each line by the common leading whitespace. + if (lex_mode.indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { + pm_node_list_t *nodes; + if (lex_mode.quote == PM_HEREDOC_QUOTE_BACKTICK) { + nodes = &((pm_interpolated_x_string_node_t *) node)->parts; + } else { + nodes = &((pm_interpolated_string_node_t *) node)->parts; + } + + parse_heredoc_dedent(parser, nodes, common_whitespace); + } + } + + if (match1(parser, PM_TOKEN_STRING_BEGIN)) { + return parse_strings(parser, node, false, (uint16_t) (depth + 1)); + } + + return node; + } + case PM_TOKEN_INSTANCE_VARIABLE: { + parser_lex(parser); + pm_node_t *node = UP(pm_instance_variable_read_node_create(parser, &parser->previous)); + + if (binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { + node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } + + return node; + } + case PM_TOKEN_INTEGER: { + pm_node_flags_t base = parser->integer_base; + parser_lex(parser); + return UP(pm_integer_node_create(parser, base, &parser->previous)); + } + case PM_TOKEN_INTEGER_IMAGINARY: { + 
pm_node_flags_t base = parser->integer_base; + parser_lex(parser); + return UP(pm_integer_node_imaginary_create(parser, base, &parser->previous)); + } + case PM_TOKEN_INTEGER_RATIONAL: { + pm_node_flags_t base = parser->integer_base; + parser_lex(parser); + return UP(pm_integer_node_rational_create(parser, base, &parser->previous)); + } + case PM_TOKEN_INTEGER_RATIONAL_IMAGINARY: { + pm_node_flags_t base = parser->integer_base; + parser_lex(parser); + return UP(pm_integer_node_rational_imaginary_create(parser, base, &parser->previous)); + } + case PM_TOKEN_KEYWORD___ENCODING__: + parser_lex(parser); + return UP(pm_source_encoding_node_create(parser, &parser->previous)); + case PM_TOKEN_KEYWORD___FILE__: + parser_lex(parser); + return UP(pm_source_file_node_create(parser, &parser->previous)); + case PM_TOKEN_KEYWORD___LINE__: + parser_lex(parser); + return UP(pm_source_line_node_create(parser, &parser->previous)); + case PM_TOKEN_KEYWORD_ALIAS: { + if (binding_power != PM_BINDING_POWER_STATEMENT) { + pm_parser_err_current(parser, PM_ERR_STATEMENT_ALIAS); + } + + parser_lex(parser); + pm_token_t keyword = parser->previous; + + pm_node_t *new_name = parse_alias_argument(parser, true, (uint16_t) (depth + 1)); + pm_node_t *old_name = parse_alias_argument(parser, false, (uint16_t) (depth + 1)); + + switch (PM_NODE_TYPE(new_name)) { + case PM_BACK_REFERENCE_READ_NODE: + case PM_NUMBERED_REFERENCE_READ_NODE: + case PM_GLOBAL_VARIABLE_READ_NODE: { + if (PM_NODE_TYPE_P(old_name, PM_BACK_REFERENCE_READ_NODE) || PM_NODE_TYPE_P(old_name, PM_NUMBERED_REFERENCE_READ_NODE) || PM_NODE_TYPE_P(old_name, PM_GLOBAL_VARIABLE_READ_NODE)) { + if (PM_NODE_TYPE_P(old_name, PM_NUMBERED_REFERENCE_READ_NODE)) { + pm_parser_err_node(parser, old_name, PM_ERR_ALIAS_ARGUMENT_NUMBERED_REFERENCE); + } + } else { + pm_parser_err_node(parser, old_name, PM_ERR_ALIAS_ARGUMENT); + } + + return UP(pm_alias_global_variable_node_create(parser, &keyword, new_name, old_name)); + } + case PM_SYMBOL_NODE: + 
case PM_INTERPOLATED_SYMBOL_NODE: { + if (!PM_NODE_TYPE_P(old_name, PM_SYMBOL_NODE) && !PM_NODE_TYPE_P(old_name, PM_INTERPOLATED_SYMBOL_NODE)) { + pm_parser_err_node(parser, old_name, PM_ERR_ALIAS_ARGUMENT); + } + } + PRISM_FALLTHROUGH + default: + return UP(pm_alias_method_node_create(parser, &keyword, new_name, old_name)); + } + } + case PM_TOKEN_KEYWORD_CASE: { + size_t opening_newline_index = token_newline_index(parser); + parser_lex(parser); + + pm_token_t case_keyword = parser->previous; + pm_node_t *predicate = NULL; + + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); + predicate = NULL; + } else if (match3(parser, PM_TOKEN_KEYWORD_WHEN, PM_TOKEN_KEYWORD_IN, PM_TOKEN_KEYWORD_END)) { + predicate = NULL; + } else if (!token_begins_expression_p(parser->current.type)) { + predicate = NULL; + } else { + predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_CASE_EXPRESSION_AFTER_CASE, (uint16_t) (depth + 1)); + while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); + } + + if (match1(parser, PM_TOKEN_KEYWORD_END)) { + parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, false); + parser_lex(parser); + + pop_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); + return UP(pm_case_node_create(parser, &case_keyword, predicate, &parser->previous)); + } + + // At this point we can create a case node, though we don't yet know + // if it is a case-in or case-when node. 
+ pm_token_t end_keyword = not_provided(parser); + pm_node_t *node; + + if (match1(parser, PM_TOKEN_KEYWORD_WHEN)) { + pm_case_node_t *case_node = pm_case_node_create(parser, &case_keyword, predicate, &end_keyword); + pm_static_literals_t literals = { 0 }; + + // At this point we've seen a when keyword, so we know this is a + // case-when node. We will continue to parse the when nodes + // until we hit the end of the list. + while (match1(parser, PM_TOKEN_KEYWORD_WHEN)) { + parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, true); + parser_lex(parser); + + pm_token_t when_keyword = parser->previous; + pm_when_node_t *when_node = pm_when_node_create(parser, &when_keyword); + + do { + if (accept1(parser, PM_TOKEN_USTAR)) { + pm_token_t operator = parser->previous; + pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); + + pm_splat_node_t *splat_node = pm_splat_node_create(parser, &operator, expression); + pm_when_node_conditions_append(when_node, UP(splat_node)); + + if (PM_NODE_TYPE_P(expression, PM_MISSING_NODE)) break; + } else { + pm_node_t *condition = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_CASE_EXPRESSION_AFTER_WHEN, (uint16_t) (depth + 1)); + pm_when_node_conditions_append(when_node, condition); + + // If we found a missing node, then this is a syntax + // error and we should stop looping. + if (PM_NODE_TYPE_P(condition, PM_MISSING_NODE)) break; + + // If this is a string node, then we need to mark it + // as frozen because when clause strings are frozen. 
+ if (PM_NODE_TYPE_P(condition, PM_STRING_NODE)) { + pm_node_flag_set(condition, PM_STRING_FLAGS_FROZEN | PM_NODE_FLAG_STATIC_LITERAL); + } else if (PM_NODE_TYPE_P(condition, PM_SOURCE_FILE_NODE)) { + pm_node_flag_set(condition, PM_NODE_FLAG_STATIC_LITERAL); + } + + pm_when_clause_static_literals_add(parser, &literals, condition); + } + } while (accept1(parser, PM_TOKEN_COMMA)); + + if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) { + pm_when_node_then_keyword_loc_set(when_node, &parser->previous); + } + } else { + expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_EXPECT_WHEN_DELIMITER); + pm_when_node_then_keyword_loc_set(when_node, &parser->previous); + } + + if (!match3(parser, PM_TOKEN_KEYWORD_WHEN, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_CASE_WHEN, (uint16_t) (depth + 1)); + if (statements != NULL) { + pm_when_node_statements_set(when_node, statements); + } + } + + pm_case_node_condition_append(case_node, UP(when_node)); + } + + // If we didn't parse any conditions (in or when) then we need + // to indicate that we have an error. + if (case_node->conditions.size == 0) { + pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); + } + + pm_static_literals_free(&literals); + node = UP(case_node); + } else { + pm_case_match_node_t *case_node = pm_case_match_node_create(parser, &case_keyword, predicate, &end_keyword); + + // If this is a case-match node (i.e., it is a pattern matching + // case statement) then we must have a predicate. + if (predicate == NULL) { + pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MATCH_MISSING_PREDICATE); + } + + // At this point we expect that we're parsing a case-in node. We + // will continue to parse the in nodes until we hit the end of + // the list. 
+ while (match1(parser, PM_TOKEN_KEYWORD_IN)) { + parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, true); + + bool previous_pattern_matching_newlines = parser->pattern_matching_newlines; + parser->pattern_matching_newlines = true; + + lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); + parser->command_start = false; + parser_lex(parser); + + pm_token_t in_keyword = parser->previous; + + pm_constant_id_list_t captures = { 0 }; + pm_node_t *pattern = parse_pattern(parser, &captures, PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_IN, (uint16_t) (depth + 1)); + + parser->pattern_matching_newlines = previous_pattern_matching_newlines; + pm_constant_id_list_free(&captures); + + // Since we're in the top-level of the case-in node we need + // to check for guard clauses in the form of `if` or + // `unless` statements. + if (accept1(parser, PM_TOKEN_KEYWORD_IF_MODIFIER)) { + pm_token_t keyword = parser->previous; + pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_CONDITIONAL_IF_PREDICATE, (uint16_t) (depth + 1)); + pattern = UP(pm_if_node_modifier_create(parser, pattern, &keyword, predicate)); + } else if (accept1(parser, PM_TOKEN_KEYWORD_UNLESS_MODIFIER)) { + pm_token_t keyword = parser->previous; + pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_CONDITIONAL_UNLESS_PREDICATE, (uint16_t) (depth + 1)); + pattern = UP(pm_unless_node_modifier_create(parser, pattern, &keyword, predicate)); + } + + // Now we need to check for the terminator of the in node's + // pattern. It can be a newline or semicolon optionally + // followed by a `then` keyword. 
+ pm_token_t then_keyword; + if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) { + then_keyword = parser->previous; + } else { + then_keyword = not_provided(parser); + } + } else { + expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_EXPECT_IN_DELIMITER); + then_keyword = parser->previous; + } + + // Now we can actually parse the statements associated with + // the in node. + pm_statements_node_t *statements; + if (match3(parser, PM_TOKEN_KEYWORD_IN, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + statements = NULL; + } else { + statements = parse_statements(parser, PM_CONTEXT_CASE_IN, (uint16_t) (depth + 1)); + } + + // Now that we have the full pattern and statements, we can + // create the node and attach it to the case node. + pm_node_t *condition = UP(pm_in_node_create(parser, pattern, statements, &in_keyword, &then_keyword)); + pm_case_match_node_condition_append(case_node, condition); + } + + // If we didn't parse any conditions (in or when) then we need + // to indicate that we have an error. 
+ if (case_node->conditions.size == 0) { + pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); + } + + node = UP(case_node); + } + + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + if (accept1(parser, PM_TOKEN_KEYWORD_ELSE)) { + pm_token_t else_keyword = parser->previous; + pm_else_node_t *else_node; + + if (!match1(parser, PM_TOKEN_KEYWORD_END)) { + else_node = pm_else_node_create(parser, &else_keyword, parse_statements(parser, PM_CONTEXT_ELSE, (uint16_t) (depth + 1)), &parser->current); + } else { + else_node = pm_else_node_create(parser, &else_keyword, NULL, &parser->current); + } + + if (PM_NODE_TYPE_P(node, PM_CASE_NODE)) { + pm_case_node_else_clause_set((pm_case_node_t *) node, else_node); + } else { + pm_case_match_node_else_clause_set((pm_case_match_node_t *) node, else_node); + } + } + + parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, false); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CASE_TERM, &case_keyword); + + if (PM_NODE_TYPE_P(node, PM_CASE_NODE)) { + pm_case_node_end_keyword_loc_set((pm_case_node_t *) node, &parser->previous); + } else { + pm_case_match_node_end_keyword_loc_set((pm_case_match_node_t *) node, &parser->previous); + } + + pop_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + return node; + } + case PM_TOKEN_KEYWORD_BEGIN: { + size_t opening_newline_index = token_newline_index(parser); + parser_lex(parser); + + pm_token_t begin_keyword = parser->previous; + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + pm_statements_node_t *begin_statements = NULL; + + if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + begin_statements = parse_statements(parser, PM_CONTEXT_BEGIN, 
(uint16_t) (depth + 1)); + pm_accepts_block_stack_pop(parser); + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + } + + pm_begin_node_t *begin_node = pm_begin_node_create(parser, &begin_keyword, begin_statements); + parse_rescues(parser, opening_newline_index, &begin_keyword, begin_node, PM_RESCUES_BEGIN, (uint16_t) (depth + 1)); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_BEGIN_TERM, &begin_keyword); + + begin_node->base.location.end = parser->previous.end; + pm_begin_node_end_keyword_set(begin_node, &parser->previous); + + pop_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + return UP(begin_node); + } + case PM_TOKEN_KEYWORD_BEGIN_UPCASE: { + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + if (binding_power != PM_BINDING_POWER_STATEMENT) { + pm_parser_err_current(parser, PM_ERR_STATEMENT_PREEXE_BEGIN); + } + + parser_lex(parser); + pm_token_t keyword = parser->previous; + + expect1(parser, PM_TOKEN_BRACE_LEFT, PM_ERR_BEGIN_UPCASE_BRACE); + pm_token_t opening = parser->previous; + pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_PREEXE, (uint16_t) (depth + 1)); + + expect1_opening(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_BEGIN_UPCASE_TERM, &opening); + pm_context_t context = parser->current_context->context; + if ((context != PM_CONTEXT_MAIN) && (context != PM_CONTEXT_PREEXE)) { + pm_parser_err_token(parser, &keyword, PM_ERR_BEGIN_UPCASE_TOPLEVEL); + } + + flush_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + return UP(pm_pre_execution_node_create(parser, &keyword, &opening, statements, &parser->previous)); + } + case PM_TOKEN_KEYWORD_BREAK: + case PM_TOKEN_KEYWORD_NEXT: + case PM_TOKEN_KEYWORD_RETURN: { + parser_lex(parser); + + pm_token_t keyword = parser->previous; + pm_arguments_t arguments = { 0 }; + + if ( + token_begins_expression_p(parser->current.type) || + 
match2(parser, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR) + ) { + pm_binding_power_t binding_power = pm_binding_powers[parser->current.type].left; + + if (binding_power == PM_BINDING_POWER_UNSET || binding_power >= PM_BINDING_POWER_RANGE) { + pm_token_t next = parser->current; + parse_arguments(parser, &arguments, false, PM_TOKEN_EOF, (uint16_t) (depth + 1)); + + // Reject `foo && return bar`. + if (!accepts_command_call && arguments.arguments != NULL) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, next, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(next.type)); + } + } + } + + switch (keyword.type) { + case PM_TOKEN_KEYWORD_BREAK: { + pm_node_t *node = UP(pm_break_node_create(parser, &keyword, arguments.arguments)); + if (!parser->partial_script) parse_block_exit(parser, node); + return node; + } + case PM_TOKEN_KEYWORD_NEXT: { + pm_node_t *node = UP(pm_next_node_create(parser, &keyword, arguments.arguments)); + if (!parser->partial_script) parse_block_exit(parser, node); + return node; + } + case PM_TOKEN_KEYWORD_RETURN: { + pm_node_t *node = UP(pm_return_node_create(parser, &keyword, arguments.arguments)); + parse_return(parser, node); + return node; + } + default: + assert(false && "unreachable"); + return UP(pm_missing_node_create(parser, parser->previous.start, parser->previous.end)); + } + } + case PM_TOKEN_KEYWORD_SUPER: { + parser_lex(parser); + + pm_token_t keyword = parser->previous; + pm_arguments_t arguments = { 0 }; + parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1)); + + if ( + arguments.opening_loc.start == NULL && + arguments.arguments == NULL && + ((arguments.block == NULL) || PM_NODE_TYPE_P(arguments.block, PM_BLOCK_NODE)) + ) { + return UP(pm_forwarding_super_node_create(parser, &keyword, &arguments)); + } + + return UP(pm_super_node_create(parser, &keyword, &arguments)); + } + case PM_TOKEN_KEYWORD_YIELD: { + parser_lex(parser); + + pm_token_t keyword = parser->previous; + pm_arguments_t arguments = { 0 }; 
+ parse_arguments_list(parser, &arguments, false, accepts_command_call, (uint16_t) (depth + 1)); + + // It's possible that we've parsed a block argument through our + // call to parse_arguments_list. If we found one, we should mark it + // as invalid and destroy it, as we don't have a place for it on the + // yield node. + if (arguments.block != NULL) { + pm_parser_err_node(parser, arguments.block, PM_ERR_UNEXPECTED_BLOCK_ARGUMENT); + pm_node_unreference(parser, arguments.block); + pm_node_destroy(parser, arguments.block); + arguments.block = NULL; + } + + pm_node_t *node = UP(pm_yield_node_create(parser, &keyword, &arguments.opening_loc, arguments.arguments, &arguments.closing_loc)); + if (!parser->parsing_eval && !parser->partial_script) parse_yield(parser, node); + + return node; + } + case PM_TOKEN_KEYWORD_CLASS: { + size_t opening_newline_index = token_newline_index(parser); + parser_lex(parser); + + pm_token_t class_keyword = parser->previous; + pm_do_loop_stack_push(parser, false); + + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + if (accept1(parser, PM_TOKEN_LESS_LESS)) { + pm_token_t operator = parser->previous; + pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_EXPECT_EXPRESSION_AFTER_LESS_LESS, (uint16_t) (depth + 1)); + + pm_parser_scope_push(parser, true); + if (!match2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_SINGLETON_CLASS_DELIMITER, pm_token_type_human(parser->current.type)); + } + + pm_node_t *statements = NULL; + if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = UP(parse_statements(parser, PM_CONTEXT_SCLASS, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match2(parser, 
PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { + assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); + statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &class_keyword, class_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_SCLASS, (uint16_t) (depth + 1))); + } else { + parser_warn_indentation_mismatch(parser, opening_newline_index, &class_keyword, false, false); + } + + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM, &class_keyword); + + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, false); + + pm_parser_scope_pop(parser); + pm_do_loop_stack_pop(parser); + + flush_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + return UP(pm_singleton_class_node_create(parser, &locals, &class_keyword, &operator, expression, statements, &parser->previous)); + } + + pm_node_t *constant_path = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_CLASS_NAME, (uint16_t) (depth + 1)); + pm_token_t name = parser->previous; + if (name.type != PM_TOKEN_CONSTANT) { + pm_parser_err_token(parser, &name, PM_ERR_CLASS_NAME); + } + + pm_token_t inheritance_operator; + pm_node_t *superclass; + + if (match1(parser, PM_TOKEN_LESS)) { + inheritance_operator = parser->current; + lex_state_set(parser, PM_LEX_STATE_BEG); + + parser->command_start = true; + parser_lex(parser); + + superclass = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_CLASS_SUPERCLASS, (uint16_t) (depth + 1)); + } else { + inheritance_operator = not_provided(parser); + superclass = NULL; + } + + pm_parser_scope_push(parser, true); + + if (inheritance_operator.type != PM_TOKEN_NOT_PROVIDED) { + expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_CLASS_UNEXPECTED_END); + } else { + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + } + pm_node_t *statements = NULL; + + if (!match4(parser, 
PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = UP(parse_statements(parser, PM_CONTEXT_CLASS, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { + assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); + statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &class_keyword, class_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_CLASS, (uint16_t) (depth + 1))); + } else { + parser_warn_indentation_mismatch(parser, opening_newline_index, &class_keyword, false, false); + } + + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM, &class_keyword); + + if (context_def_p(parser)) { + pm_parser_err_token(parser, &class_keyword, PM_ERR_CLASS_IN_METHOD); + } + + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, false); + + pm_parser_scope_pop(parser); + pm_do_loop_stack_pop(parser); + + if (!PM_NODE_TYPE_P(constant_path, PM_CONSTANT_PATH_NODE) && !(PM_NODE_TYPE_P(constant_path, PM_CONSTANT_READ_NODE))) { + pm_parser_err_node(parser, constant_path, PM_ERR_CLASS_NAME); + } + + pop_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + return UP(pm_class_node_create(parser, &locals, &class_keyword, constant_path, &name, &inheritance_operator, superclass, statements, &parser->previous)); + } + case PM_TOKEN_KEYWORD_DEF: { + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + pm_token_t def_keyword = parser->current; + size_t opening_newline_index = token_newline_index(parser); + + pm_node_t *receiver = NULL; + pm_token_t operator = not_provided(parser); + pm_token_t name; + + // This context is necessary for lexing `...` in a bare params + // correctly. 
It must be pushed before lexing the first param, so it + // is here. + context_push(parser, PM_CONTEXT_DEF_PARAMS); + parser_lex(parser); + + // This will be false if the method name is not a valid identifier + // but could be followed by an operator. + bool valid_name = true; + + switch (parser->current.type) { + case PM_CASE_OPERATOR: + pm_parser_scope_push(parser, true); + lex_state_set(parser, PM_LEX_STATE_ENDFN); + parser_lex(parser); + + name = parser->previous; + break; + case PM_TOKEN_IDENTIFIER: { + parser_lex(parser); + + if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) { + receiver = parse_variable_call(parser); + + pm_parser_scope_push(parser, true); + lex_state_set(parser, PM_LEX_STATE_FNAME); + parser_lex(parser); + + operator = parser->previous; + name = parse_method_definition_name(parser); + } else { + pm_refute_numbered_parameter(parser, parser->previous.start, parser->previous.end); + pm_parser_scope_push(parser, true); + + name = parser->previous; + } + + break; + } + case PM_TOKEN_INSTANCE_VARIABLE: + case PM_TOKEN_CLASS_VARIABLE: + case PM_TOKEN_GLOBAL_VARIABLE: + valid_name = false; + PRISM_FALLTHROUGH + case PM_TOKEN_CONSTANT: + case PM_TOKEN_KEYWORD_NIL: + case PM_TOKEN_KEYWORD_SELF: + case PM_TOKEN_KEYWORD_TRUE: + case PM_TOKEN_KEYWORD_FALSE: + case PM_TOKEN_KEYWORD___FILE__: + case PM_TOKEN_KEYWORD___LINE__: + case PM_TOKEN_KEYWORD___ENCODING__: { + pm_parser_scope_push(parser, true); + parser_lex(parser); + + pm_token_t identifier = parser->previous; + + if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) { + lex_state_set(parser, PM_LEX_STATE_FNAME); + parser_lex(parser); + operator = parser->previous; + + switch (identifier.type) { + case PM_TOKEN_CONSTANT: + receiver = UP(pm_constant_read_node_create(parser, &identifier)); + break; + case PM_TOKEN_INSTANCE_VARIABLE: + receiver = UP(pm_instance_variable_read_node_create(parser, &identifier)); + break; + case PM_TOKEN_CLASS_VARIABLE: + receiver = 
UP(pm_class_variable_read_node_create(parser, &identifier)); + break; + case PM_TOKEN_GLOBAL_VARIABLE: + receiver = UP(pm_global_variable_read_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD_NIL: + receiver = UP(pm_nil_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD_SELF: + receiver = UP(pm_self_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD_TRUE: + receiver = UP(pm_true_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD_FALSE: + receiver = UP(pm_false_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD___FILE__: + receiver = UP(pm_source_file_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD___LINE__: + receiver = UP(pm_source_line_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD___ENCODING__: + receiver = UP(pm_source_encoding_node_create(parser, &identifier)); + break; + default: + break; + } + + name = parse_method_definition_name(parser); + } else { + if (!valid_name) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, identifier, PM_ERR_DEF_NAME, pm_token_type_human(identifier.type)); + } + + name = identifier; + } + break; + } + case PM_TOKEN_PARENTHESIS_LEFT: { + // The current context is `PM_CONTEXT_DEF_PARAMS`, however + // the inner expression of this parenthesis should not be + // processed under this context. Thus, the context is popped + // here. 
+ context_pop(parser); + parser_lex(parser); + + pm_token_t lparen = parser->previous; + pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_DEF_RECEIVER, (uint16_t) (depth + 1)); + + accept1(parser, PM_TOKEN_NEWLINE); + expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); + pm_token_t rparen = parser->previous; + + lex_state_set(parser, PM_LEX_STATE_FNAME); + expect2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON, PM_ERR_DEF_RECEIVER_TERM); + + operator = parser->previous; + receiver = UP(pm_parentheses_node_create(parser, &lparen, expression, &rparen, 0)); + + // To push `PM_CONTEXT_DEF_PARAMS` again is for the same + // reason as described the above. + pm_parser_scope_push(parser, true); + context_push(parser, PM_CONTEXT_DEF_PARAMS); + name = parse_method_definition_name(parser); + break; + } + default: + pm_parser_scope_push(parser, true); + name = parse_method_definition_name(parser); + break; + } + + pm_token_t lparen; + pm_token_t rparen; + pm_parameters_node_t *params; + + bool accept_endless_def = true; + switch (parser->current.type) { + case PM_TOKEN_PARENTHESIS_LEFT: { + parser_lex(parser); + lparen = parser->previous; + + if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + params = NULL; + } else { + params = parse_parameters(parser, PM_BINDING_POWER_DEFINED, true, false, true, true, false, (uint16_t) (depth + 1)); + } + + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->command_start = true; + + context_pop(parser); + if (!accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_DEF_PARAMS_TERM_PAREN, pm_token_type_human(parser->current.type)); + parser->previous.start = parser->previous.end; + parser->previous.type = PM_TOKEN_MISSING; + } + + rparen = parser->previous; + break; + } + case PM_CASE_PARAMETER: { + // If we're about to lex a label, we need to add the label + // state to make sure the next newline is ignored. 
+ if (parser->current.type == PM_TOKEN_LABEL) { + lex_state_set(parser, parser->lex_state | PM_LEX_STATE_LABEL); + } + + lparen = not_provided(parser); + rparen = not_provided(parser); + params = parse_parameters(parser, PM_BINDING_POWER_DEFINED, false, false, true, true, false, (uint16_t) (depth + 1)); + + // Reject `def * = 1` and similar. We have to specifically check + // for them because they create ambiguity with optional arguments. + accept_endless_def = false; + + context_pop(parser); + break; + } + default: { + lparen = not_provided(parser); + rparen = not_provided(parser); + params = NULL; + + context_pop(parser); + break; + } + } + + pm_node_t *statements = NULL; + pm_token_t equal; + pm_token_t end_keyword; + + if (accept1(parser, PM_TOKEN_EQUAL)) { + if (token_is_setter_name(&name)) { + pm_parser_err_token(parser, &name, PM_ERR_DEF_ENDLESS_SETTER); + } + if (!accept_endless_def) { + pm_parser_err_previous(parser, PM_ERR_DEF_ENDLESS_PARAMETERS); + } + if ( + parser->current_context->context == PM_CONTEXT_DEFAULT_PARAMS && + parser->current_context->prev->context == PM_CONTEXT_BLOCK_PARAMETERS + ) { + PM_PARSER_ERR_FORMAT(parser, def_keyword.start, parser->previous.end, PM_ERR_UNEXPECTED_PARAMETER_DEFAULT_VALUE, "endless method definition"); + } + equal = parser->previous; + + context_push(parser, PM_CONTEXT_DEF); + pm_do_loop_stack_push(parser, false); + statements = UP(pm_statements_node_create(parser)); + + bool allow_command_call; + if (parser->version >= PM_OPTIONS_VERSION_CRUBY_4_0) { + allow_command_call = accepts_command_call; + } else { + // Allow `def foo = puts "Hello"` but not `private def foo = puts "Hello"` + allow_command_call = binding_power == PM_BINDING_POWER_ASSIGNMENT || binding_power < PM_BINDING_POWER_COMPOSITION; + } + + pm_node_t *statement = parse_expression(parser, PM_BINDING_POWER_DEFINED + 1, allow_command_call, false, PM_ERR_DEF_ENDLESS, (uint16_t) (depth + 1)); + + if (accept1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) { + 
context_push(parser, PM_CONTEXT_RESCUE_MODIFIER); + + pm_token_t rescue_keyword = parser->previous; + pm_node_t *value = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, false, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); + context_pop(parser); + + statement = UP(pm_rescue_modifier_node_create(parser, statement, &rescue_keyword, value)); + } + + pm_statements_node_body_append(parser, (pm_statements_node_t *) statements, statement, false); + pm_do_loop_stack_pop(parser); + context_pop(parser); + end_keyword = not_provided(parser); + } else { + equal = not_provided(parser); + + if (lparen.type == PM_TOKEN_NOT_PROVIDED) { + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->command_start = true; + expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_DEF_PARAMS_TERM); + } else { + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + } + + pm_accepts_block_stack_push(parser, true); + pm_do_loop_stack_push(parser, false); + + if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = UP(parse_statements(parser, PM_CONTEXT_DEF, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match3(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE)) { + assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); + statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &def_keyword, def_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_DEF, (uint16_t) (depth + 1))); + } else { + parser_warn_indentation_mismatch(parser, opening_newline_index, &def_keyword, false, false); + } + + pm_accepts_block_stack_pop(parser); + pm_do_loop_stack_pop(parser); + + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_DEF_TERM, &def_keyword); + end_keyword = parser->previous; + } + + pm_constant_id_list_t locals; + 
pm_locals_order(parser, &parser->current_scope->locals, &locals, false); + pm_parser_scope_pop(parser); + + /** + * If the final character is `@` as is the case when defining + * methods to override the unary operators, we should ignore + * the @ in the same way we do for symbols. + */ + pm_constant_id_t name_id = pm_parser_constant_id_location(parser, name.start, parse_operator_symbol_name(&name)); + + flush_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + return UP(pm_def_node_create( + parser, + name_id, + &name, + receiver, + params, + statements, + &locals, + &def_keyword, + &operator, + &lparen, + &rparen, + &equal, + &end_keyword + )); + } + case PM_TOKEN_KEYWORD_DEFINED: { + parser_lex(parser); + pm_token_t keyword = parser->previous; + + pm_token_t lparen; + pm_token_t rparen; + pm_node_t *expression; + + context_push(parser, PM_CONTEXT_DEFINED); + bool newline = accept1(parser, PM_TOKEN_NEWLINE); + + if (accept1(parser, PM_TOKEN_PARENTHESIS_LEFT)) { + lparen = parser->previous; + + if (newline && accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + expression = UP(pm_parentheses_node_create(parser, &lparen, NULL, &parser->previous, 0)); + lparen = not_provided(parser); + rparen = not_provided(parser); + } else { + expression = parse_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_DEFINED_EXPRESSION, (uint16_t) (depth + 1)); + + if (parser->recovering) { + rparen = not_provided(parser); + } else { + accept1(parser, PM_TOKEN_NEWLINE); + expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); + rparen = parser->previous; + } + } + } else { + lparen = not_provided(parser); + rparen = not_provided(parser); + expression = parse_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_DEFINED_EXPRESSION, (uint16_t) (depth + 1)); + } + + context_pop(parser); + return UP(pm_defined_node_create( + parser, + &lparen, + expression, + &rparen, + &keyword + )); + } + case 
PM_TOKEN_KEYWORD_END_UPCASE: { + if (binding_power != PM_BINDING_POWER_STATEMENT) { + pm_parser_err_current(parser, PM_ERR_STATEMENT_POSTEXE_END); + } + + parser_lex(parser); + pm_token_t keyword = parser->previous; + + if (context_def_p(parser)) { + pm_parser_warn_token(parser, &keyword, PM_WARN_END_IN_METHOD); + } + + expect1(parser, PM_TOKEN_BRACE_LEFT, PM_ERR_END_UPCASE_BRACE); + pm_token_t opening = parser->previous; + pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_POSTEXE, (uint16_t) (depth + 1)); + + expect1_opening(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_END_UPCASE_TERM, &opening); + return UP(pm_post_execution_node_create(parser, &keyword, &opening, statements, &parser->previous)); + } + case PM_TOKEN_KEYWORD_FALSE: + parser_lex(parser); + return UP(pm_false_node_create(parser, &parser->previous)); + case PM_TOKEN_KEYWORD_FOR: { + size_t opening_newline_index = token_newline_index(parser); + parser_lex(parser); + + pm_token_t for_keyword = parser->previous; + pm_node_t *index; + + context_push(parser, PM_CONTEXT_FOR_INDEX); + + // First, parse out the first index expression. + if (accept1(parser, PM_TOKEN_USTAR)) { + pm_token_t star_operator = parser->previous; + pm_node_t *name = NULL; + + if (token_begins_expression_p(parser->current.type)) { + name = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); + } + + index = UP(pm_splat_node_create(parser, &star_operator, name)); + } else if (token_begins_expression_p(parser->current.type)) { + index = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_COMMA, (uint16_t) (depth + 1)); + } else { + pm_parser_err_token(parser, &for_keyword, PM_ERR_FOR_INDEX); + index = UP(pm_missing_node_create(parser, for_keyword.start, for_keyword.end)); + } + + // Now, if there are multiple index expressions, parse them out. 
+ if (match1(parser, PM_TOKEN_COMMA)) { + index = parse_targets(parser, index, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } else { + index = parse_target(parser, index, false, false); + } + + context_pop(parser); + pm_do_loop_stack_push(parser, true); + + expect1(parser, PM_TOKEN_KEYWORD_IN, PM_ERR_FOR_IN); + pm_token_t in_keyword = parser->previous; + + pm_node_t *collection = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_FOR_COLLECTION, (uint16_t) (depth + 1)); + pm_do_loop_stack_pop(parser); + + pm_token_t do_keyword; + if (accept1(parser, PM_TOKEN_KEYWORD_DO_LOOP)) { + do_keyword = parser->previous; + } else { + do_keyword = not_provided(parser); + if (!match2(parser, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_FOR_DELIMITER, pm_token_type_human(parser->current.type)); + } + } + + pm_statements_node_t *statements = NULL; + if (!match1(parser, PM_TOKEN_KEYWORD_END)) { + statements = parse_statements(parser, PM_CONTEXT_FOR, (uint16_t) (depth + 1)); + } + + parser_warn_indentation_mismatch(parser, opening_newline_index, &for_keyword, false, false); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_FOR_TERM, &for_keyword); + + return UP(pm_for_node_create(parser, index, collection, statements, &for_keyword, &in_keyword, &do_keyword, &parser->previous)); + } + case PM_TOKEN_KEYWORD_IF: + if (parser_end_of_line_p(parser)) { + PM_PARSER_WARN_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_WARN_KEYWORD_EOL); + } + + size_t opening_newline_index = token_newline_index(parser); + bool if_after_else = parser->previous.type == PM_TOKEN_KEYWORD_ELSE; + parser_lex(parser); + + return parse_conditional(parser, PM_CONTEXT_IF, opening_newline_index, if_after_else, (uint16_t) (depth + 1)); + case PM_TOKEN_KEYWORD_UNDEF: { + if (binding_power != PM_BINDING_POWER_STATEMENT) { + pm_parser_err_current(parser, PM_ERR_STATEMENT_UNDEF); + } + + parser_lex(parser); + 
pm_undef_node_t *undef = pm_undef_node_create(parser, &parser->previous); + pm_node_t *name = parse_undef_argument(parser, (uint16_t) (depth + 1)); + + if (PM_NODE_TYPE_P(name, PM_MISSING_NODE)) { + pm_node_destroy(parser, name); + } else { + pm_undef_node_append(undef, name); + + while (match1(parser, PM_TOKEN_COMMA)) { + lex_state_set(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM); + parser_lex(parser); + name = parse_undef_argument(parser, (uint16_t) (depth + 1)); + + if (PM_NODE_TYPE_P(name, PM_MISSING_NODE)) { + pm_node_destroy(parser, name); + break; + } + + pm_undef_node_append(undef, name); + } + } + + return UP(undef); + } + case PM_TOKEN_KEYWORD_NOT: { + parser_lex(parser); + + pm_token_t message = parser->previous; + pm_arguments_t arguments = { 0 }; + pm_node_t *receiver = NULL; + + // If we do not accept a command call, then we also do not accept a + // not without parentheses. In this case we need to reject this + // syntax. + if (!accepts_command_call && !match1(parser, PM_TOKEN_PARENTHESIS_LEFT)) { + if (match1(parser, PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES)) { + pm_parser_err(parser, parser->previous.end, parser->previous.end + 1, PM_ERR_EXPECT_LPAREN_AFTER_NOT_LPAREN); + } else { + accept1(parser, PM_TOKEN_NEWLINE); + pm_parser_err_current(parser, PM_ERR_EXPECT_LPAREN_AFTER_NOT_OTHER); + } + + return UP(pm_missing_node_create(parser, parser->current.start, parser->current.end)); + } + + accept1(parser, PM_TOKEN_NEWLINE); + + if (accept1(parser, PM_TOKEN_PARENTHESIS_LEFT)) { + pm_token_t lparen = parser->previous; + + if (accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + receiver = UP(pm_parentheses_node_create(parser, &lparen, NULL, &parser->previous, 0)); + } else { + arguments.opening_loc = PM_LOCATION_TOKEN_VALUE(&lparen); + receiver = parse_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1)); + + if (!parser->recovering) { + accept1(parser, PM_TOKEN_NEWLINE); + expect1(parser, 
PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); + arguments.closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); + } + } + } else { + receiver = parse_expression(parser, PM_BINDING_POWER_NOT, true, false, PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1)); + } + + return UP(pm_call_node_not_create(parser, receiver, &message, &arguments)); + } + case PM_TOKEN_KEYWORD_UNLESS: { + size_t opening_newline_index = token_newline_index(parser); + parser_lex(parser); + + return parse_conditional(parser, PM_CONTEXT_UNLESS, opening_newline_index, false, (uint16_t) (depth + 1)); + } + case PM_TOKEN_KEYWORD_MODULE: { + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + size_t opening_newline_index = token_newline_index(parser); + parser_lex(parser); + pm_token_t module_keyword = parser->previous; + + pm_node_t *constant_path = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_MODULE_NAME, (uint16_t) (depth + 1)); + pm_token_t name; + + // If we can recover from a syntax error that occurred while parsing + // the name of the module, then we'll handle that here. + if (PM_NODE_TYPE_P(constant_path, PM_MISSING_NODE)) { + pop_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + pm_token_t missing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + return UP(pm_module_node_create(parser, NULL, &module_keyword, constant_path, &missing, NULL, &missing)); + } + + while (accept1(parser, PM_TOKEN_COLON_COLON)) { + pm_token_t double_colon = parser->previous; + + expect1(parser, PM_TOKEN_CONSTANT, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT); + constant_path = UP(pm_constant_path_node_create(parser, constant_path, &double_colon, &parser->previous)); + } + + // Here we retrieve the name of the module. 
If it wasn't a constant, + // then it's possible that `module foo` was passed, which is a + // syntax error. We handle that here as well. + name = parser->previous; + if (name.type != PM_TOKEN_CONSTANT) { + pm_parser_err_token(parser, &name, PM_ERR_MODULE_NAME); + } + + pm_parser_scope_push(parser, true); + accept2(parser, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE); + pm_node_t *statements = NULL; + + if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = UP(parse_statements(parser, PM_CONTEXT_MODULE, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match3(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE)) { + assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); + statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &module_keyword, module_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_MODULE, (uint16_t) (depth + 1))); + } else { + parser_warn_indentation_mismatch(parser, opening_newline_index, &module_keyword, false, false); + } + + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, false); + + pm_parser_scope_pop(parser); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_MODULE_TERM, &module_keyword); + + if (context_def_p(parser)) { + pm_parser_err_token(parser, &module_keyword, PM_ERR_MODULE_IN_METHOD); + } + + pop_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + return UP(pm_module_node_create(parser, &locals, &module_keyword, constant_path, &name, statements, &parser->previous)); + } + case PM_TOKEN_KEYWORD_NIL: + parser_lex(parser); + return UP(pm_nil_node_create(parser, &parser->previous)); + case PM_TOKEN_KEYWORD_REDO: { + parser_lex(parser); + + pm_node_t *node = UP(pm_redo_node_create(parser, &parser->previous)); + if 
(!parser->partial_script) parse_block_exit(parser, node); + + return node; + } + case PM_TOKEN_KEYWORD_RETRY: { + parser_lex(parser); + + pm_node_t *node = UP(pm_retry_node_create(parser, &parser->previous)); + parse_retry(parser, node); + + return node; + } + case PM_TOKEN_KEYWORD_SELF: + parser_lex(parser); + return UP(pm_self_node_create(parser, &parser->previous)); + case PM_TOKEN_KEYWORD_TRUE: + parser_lex(parser); + return UP(pm_true_node_create(parser, &parser->previous)); + case PM_TOKEN_KEYWORD_UNTIL: { + size_t opening_newline_index = token_newline_index(parser); + + context_push(parser, PM_CONTEXT_LOOP_PREDICATE); + pm_do_loop_stack_push(parser, true); + + parser_lex(parser); + pm_token_t keyword = parser->previous; + pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_CONDITIONAL_UNTIL_PREDICATE, (uint16_t) (depth + 1)); + + pm_do_loop_stack_pop(parser); + context_pop(parser); + + pm_token_t do_keyword; + if (accept1(parser, PM_TOKEN_KEYWORD_DO_LOOP)) { + do_keyword = parser->previous; + } else { + do_keyword = not_provided(parser); + expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_CONDITIONAL_UNTIL_PREDICATE); + } + + pm_statements_node_t *statements = NULL; + if (!match1(parser, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = parse_statements(parser, PM_CONTEXT_UNTIL, (uint16_t) (depth + 1)); + pm_accepts_block_stack_pop(parser); + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + } + + parser_warn_indentation_mismatch(parser, opening_newline_index, &keyword, false, false); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_UNTIL_TERM, &keyword); + + return UP(pm_until_node_create(parser, &keyword, &do_keyword, &parser->previous, predicate, statements, 0)); + } + case PM_TOKEN_KEYWORD_WHILE: { + size_t opening_newline_index = token_newline_index(parser); + + context_push(parser, PM_CONTEXT_LOOP_PREDICATE); + pm_do_loop_stack_push(parser, 
true); + + parser_lex(parser); + pm_token_t keyword = parser->previous; + pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_CONDITIONAL_WHILE_PREDICATE, (uint16_t) (depth + 1)); + + pm_do_loop_stack_pop(parser); + context_pop(parser); + + pm_token_t do_keyword; + if (accept1(parser, PM_TOKEN_KEYWORD_DO_LOOP)) { + do_keyword = parser->previous; + } else { + do_keyword = not_provided(parser); + expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_CONDITIONAL_WHILE_PREDICATE); + } + + pm_statements_node_t *statements = NULL; + if (!match1(parser, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = parse_statements(parser, PM_CONTEXT_WHILE, (uint16_t) (depth + 1)); + pm_accepts_block_stack_pop(parser); + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + } + + parser_warn_indentation_mismatch(parser, opening_newline_index, &keyword, false, false); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_WHILE_TERM, &keyword); + + return UP(pm_while_node_create(parser, &keyword, &do_keyword, &parser->previous, predicate, statements, 0)); + } + case PM_TOKEN_PERCENT_LOWER_I: { + parser_lex(parser); + pm_token_t opening = parser->previous; + pm_array_node_t *array = pm_array_node_create(parser, &opening); + pm_node_t *current = NULL; + + while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { + accept1(parser, PM_TOKEN_WORDS_SEP); + if (match1(parser, PM_TOKEN_STRING_END)) break; + + // Interpolation is not possible but nested heredocs can still lead to + // consecutive (disjoint) string tokens when the final newline is escaped. + while (match1(parser, PM_TOKEN_STRING_CONTENT)) { + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + + // Record the string node, moving to interpolation if needed. 
+ if (current == NULL) { + current = UP(pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing)); + parser_lex(parser); + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { + pm_node_t *string = UP(pm_string_node_create_current_string(parser, &opening, &parser->current, &closing)); + parser_lex(parser); + pm_interpolated_symbol_node_append((pm_interpolated_symbol_node_t *) current, string); + } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { + pm_symbol_node_t *cast = (pm_symbol_node_t *) current; + pm_token_t bounds = not_provided(parser); + + pm_token_t content = { .type = PM_TOKEN_STRING_CONTENT, .start = cast->value_loc.start, .end = cast->value_loc.end }; + pm_node_t *first_string = UP(pm_string_node_create_unescaped(parser, &bounds, &content, &bounds, &cast->unescaped)); + pm_node_t *second_string = UP(pm_string_node_create_current_string(parser, &opening, &parser->previous, &closing)); + parser_lex(parser); + + pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing); + pm_interpolated_symbol_node_append(interpolated, first_string); + pm_interpolated_symbol_node_append(interpolated, second_string); + + xfree(current); + current = UP(interpolated); + } else { + assert(false && "unreachable"); + } + } + + if (current) { + pm_array_node_elements_append(array, current); + current = NULL; + } else { + expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_LOWER_ELEMENT); + } + } + + pm_token_t closing = parser->current; + if (match1(parser, PM_TOKEN_EOF)) { + pm_parser_err_token(parser, &opening, PM_ERR_LIST_I_LOWER_TERM); + closing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + } else { + expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_I_LOWER_TERM); + } + pm_array_node_close_set(array, &closing); + + return UP(array); + } + case PM_TOKEN_PERCENT_UPPER_I: { + parser_lex(parser); + pm_token_t opening 
= parser->previous; + pm_array_node_t *array = pm_array_node_create(parser, &opening); + + // This is the current node that we are parsing that will be added to the + // list of elements. + pm_node_t *current = NULL; + + while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { + switch (parser->current.type) { + case PM_TOKEN_WORDS_SEP: { + if (current == NULL) { + // If we hit a separator before we have any content, then we don't + // need to do anything. + } else { + // If we hit a separator after we've hit content, then we need to + // append that content to the list and reset the current node. + pm_array_node_elements_append(array, current); + current = NULL; + } + + parser_lex(parser); + break; + } + case PM_TOKEN_STRING_CONTENT: { + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + + if (current == NULL) { + // If we hit content and the current node is NULL, then this is + // the first string content we've seen. In that case we're going + // to create a new string node and set that to the current. + current = UP(pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing)); + parser_lex(parser); + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { + // If we hit string content and the current node is an + // interpolated string, then we need to append the string content + // to the list of child nodes. + pm_node_t *string = UP(pm_string_node_create_current_string(parser, &opening, &parser->current, &closing)); + parser_lex(parser); + + pm_interpolated_symbol_node_append((pm_interpolated_symbol_node_t *) current, string); + } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { + // If we hit string content and the current node is a symbol node, + // then we need to convert the current node into an interpolated + // string and add the string content to the list of child nodes. 
+ pm_symbol_node_t *cast = (pm_symbol_node_t *) current; + pm_token_t bounds = not_provided(parser); + + pm_token_t content = { .type = PM_TOKEN_STRING_CONTENT, .start = cast->value_loc.start, .end = cast->value_loc.end }; + pm_node_t *first_string = UP(pm_string_node_create_unescaped(parser, &bounds, &content, &bounds, &cast->unescaped)); + pm_node_t *second_string = UP(pm_string_node_create_current_string(parser, &opening, &parser->previous, &closing)); + parser_lex(parser); + + pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing); + pm_interpolated_symbol_node_append(interpolated, first_string); + pm_interpolated_symbol_node_append(interpolated, second_string); + + xfree(current); + current = UP(interpolated); + } else { + assert(false && "unreachable"); + } + + break; + } + case PM_TOKEN_EMBVAR: { + bool start_location_set = false; + if (current == NULL) { + // If we hit an embedded variable and the current node is NULL, + // then this is the start of a new string. We'll set the current + // node to a new interpolated string. + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + current = UP(pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing)); + } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { + // If we hit an embedded variable and the current node is a string + // node, then we'll convert the current into an interpolated + // string and add the string node to the list of parts. 
+ pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing); + + current = UP(pm_symbol_node_to_string_node(parser, (pm_symbol_node_t *) current)); + pm_interpolated_symbol_node_append(interpolated, current); + interpolated->base.location.start = current->location.start; + start_location_set = true; + current = UP(interpolated); + } else { + // If we hit an embedded variable and the current node is an + // interpolated string, then we'll just add the embedded variable. + } + + pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); + pm_interpolated_symbol_node_append((pm_interpolated_symbol_node_t *) current, part); + if (!start_location_set) { + current->location.start = part->location.start; + } + break; + } + case PM_TOKEN_EMBEXPR_BEGIN: { + bool start_location_set = false; + if (current == NULL) { + // If we hit an embedded expression and the current node is NULL, + // then this is the start of a new string. We'll set the current + // node to a new interpolated string. + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + current = UP(pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing)); + } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { + // If we hit an embedded expression and the current node is a + // string node, then we'll convert the current into an + // interpolated string and add the string node to the list of + // parts. 
+ pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing); + + current = UP(pm_symbol_node_to_string_node(parser, (pm_symbol_node_t *) current)); + pm_interpolated_symbol_node_append(interpolated, current); + interpolated->base.location.start = current->location.start; + start_location_set = true; + current = UP(interpolated); + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { + // If we hit an embedded expression and the current node is an + // interpolated string, then we'll just continue on. + } else { + assert(false && "unreachable"); + } + + pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); + pm_interpolated_symbol_node_append((pm_interpolated_symbol_node_t *) current, part); + if (!start_location_set) { + current->location.start = part->location.start; + } + break; + } + default: + expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_UPPER_ELEMENT); + parser_lex(parser); + break; + } + } + + // If we have a current node, then we need to append it to the list. 
+ if (current) { + pm_array_node_elements_append(array, current); + } + + pm_token_t closing = parser->current; + if (match1(parser, PM_TOKEN_EOF)) { + pm_parser_err_token(parser, &opening, PM_ERR_LIST_I_UPPER_TERM); + closing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + } else { + expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_I_UPPER_TERM); + } + pm_array_node_close_set(array, &closing); + + return UP(array); + } + case PM_TOKEN_PERCENT_LOWER_W: { + parser_lex(parser); + pm_token_t opening = parser->previous; + pm_array_node_t *array = pm_array_node_create(parser, &opening); + pm_node_t *current = NULL; + + while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { + accept1(parser, PM_TOKEN_WORDS_SEP); + if (match1(parser, PM_TOKEN_STRING_END)) break; + + // Interpolation is not possible but nested heredocs can still lead to + // consecutive (disjoint) string tokens when the final newline is escaped. + while (match1(parser, PM_TOKEN_STRING_CONTENT)) { + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + + pm_node_t *string = UP(pm_string_node_create_current_string(parser, &opening, &parser->current, &closing)); + + // Record the string node, moving to interpolation if needed. 
+ if (current == NULL) { + current = string; + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { + pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, string); + } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { + pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing); + pm_interpolated_string_node_append(interpolated, current); + pm_interpolated_string_node_append(interpolated, string); + current = UP(interpolated); + } else { + assert(false && "unreachable"); + } + parser_lex(parser); + } + + if (current) { + pm_array_node_elements_append(array, current); + current = NULL; + } else { + expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_LOWER_ELEMENT); + } + } + + pm_token_t closing = parser->current; + if (match1(parser, PM_TOKEN_EOF)) { + pm_parser_err_token(parser, &opening, PM_ERR_LIST_W_LOWER_TERM); + closing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + } else { + expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_LOWER_TERM); + } + + pm_array_node_close_set(array, &closing); + return UP(array); + } + case PM_TOKEN_PERCENT_UPPER_W: { + parser_lex(parser); + pm_token_t opening = parser->previous; + pm_array_node_t *array = pm_array_node_create(parser, &opening); + + // This is the current node that we are parsing that will be added + // to the list of elements. + pm_node_t *current = NULL; + + while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { + switch (parser->current.type) { + case PM_TOKEN_WORDS_SEP: { + // Reset the explicit encoding if we hit a separator + // since each element can have its own encoding. + parser->explicit_encoding = NULL; + + if (current == NULL) { + // If we hit a separator before we have any content, + // then we don't need to do anything. 
+ } else { + // If we hit a separator after we've hit content, + // then we need to append that content to the list + // and reset the current node. + pm_array_node_elements_append(array, current); + current = NULL; + } + + parser_lex(parser); + break; + } + case PM_TOKEN_STRING_CONTENT: { + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + + pm_node_t *string = UP(pm_string_node_create_current_string(parser, &opening, &parser->current, &closing)); + pm_node_flag_set(string, parse_unescaped_encoding(parser)); + parser_lex(parser); + + if (current == NULL) { + // If we hit content and the current node is NULL, + // then this is the first string content we've seen. + // In that case we're going to create a new string + // node and set that to the current. + current = string; + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { + // If we hit string content and the current node is + // an interpolated string, then we need to append + // the string content to the list of child nodes. + pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, string); + } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { + // If we hit string content and the current node is + // a string node, then we need to convert the + // current node into an interpolated string and add + // the string content to the list of child nodes. + pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing); + pm_interpolated_string_node_append(interpolated, current); + pm_interpolated_string_node_append(interpolated, string); + current = UP(interpolated); + } else { + assert(false && "unreachable"); + } + + break; + } + case PM_TOKEN_EMBVAR: { + if (current == NULL) { + // If we hit an embedded variable and the current + // node is NULL, then this is the start of a new + // string. We'll set the current node to a new + // interpolated string. 
+ pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + current = UP(pm_interpolated_string_node_create(parser, &opening, NULL, &closing)); + } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { + // If we hit an embedded variable and the current + // node is a string node, then we'll convert the + // current into an interpolated string and add the + // string node to the list of parts. + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing); + pm_interpolated_string_node_append(interpolated, current); + current = UP(interpolated); + } else { + // If we hit an embedded variable and the current + // node is an interpolated string, then we'll just + // add the embedded variable. + } + + pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); + pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, part); + break; + } + case PM_TOKEN_EMBEXPR_BEGIN: { + if (current == NULL) { + // If we hit an embedded expression and the current + // node is NULL, then this is the start of a new + // string. We'll set the current node to a new + // interpolated string. + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + current = UP(pm_interpolated_string_node_create(parser, &opening, NULL, &closing)); + } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { + // If we hit an embedded expression and the current + // node is a string node, then we'll convert the + // current into an interpolated string and add the + // string node to the list of parts. 
+ pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing); + pm_interpolated_string_node_append(interpolated, current); + current = UP(interpolated); + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { + // If we hit an embedded expression and the current + // node is an interpolated string, then we'll just + // continue on. + } else { + assert(false && "unreachable"); + } + + pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); + pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, part); + break; + } + default: + expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_UPPER_ELEMENT); + parser_lex(parser); + break; + } + } + + // If we have a current node, then we need to append it to the list. + if (current) { + pm_array_node_elements_append(array, current); + } + + pm_token_t closing = parser->current; + if (match1(parser, PM_TOKEN_EOF)) { + pm_parser_err_token(parser, &opening, PM_ERR_LIST_W_UPPER_TERM); + closing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + } else { + expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_UPPER_TERM); + } + + pm_array_node_close_set(array, &closing); + return UP(array); + } + case PM_TOKEN_REGEXP_BEGIN: { + pm_token_t opening = parser->current; + parser_lex(parser); + + if (match1(parser, PM_TOKEN_REGEXP_END)) { + // If we get here, then we have an end immediately after a start. In + // that case we'll create an empty content token and return an + // uninterpolated regular expression. 
+ pm_token_t content = (pm_token_t) { + .type = PM_TOKEN_STRING_CONTENT, + .start = parser->previous.end, + .end = parser->previous.end + }; + + parser_lex(parser); + + pm_node_t *node = UP(pm_regular_expression_node_create(parser, &opening, &content, &parser->previous)); + pm_node_flag_set(node, PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING); + + return node; + } + + pm_interpolated_regular_expression_node_t *interpolated; + + if (match1(parser, PM_TOKEN_STRING_CONTENT)) { + // In this case we've hit string content so we know the regular + // expression at least has something in it. We'll need to check if the + // following token is the end (in which case we can return a plain + // regular expression) or if it's not then it has interpolation. + pm_string_t unescaped = parser->current_string; + pm_token_t content = parser->current; + bool ascii_only = parser->current_regular_expression_ascii_only; + parser_lex(parser); + + // If we hit an end, then we can create a regular expression + // node without interpolation, which can be represented more + // succinctly and more easily compiled. + if (accept1(parser, PM_TOKEN_REGEXP_END)) { + pm_regular_expression_node_t *node = (pm_regular_expression_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); + + // If we're not immediately followed by a =~, then we want + // to parse all of the errors at this point. If it is + // followed by a =~, then it will get parsed higher up while + // parsing the named captures as well. + if (!match1(parser, PM_TOKEN_EQUAL_TILDE)) { + parse_regular_expression_errors(parser, node); + } + + pm_node_flag_set(UP(node), parse_and_validate_regular_expression_encoding(parser, &unescaped, ascii_only, FL(node))); + return UP(node); + } + + // If we get here, then we have interpolation so we'll need to create + // a regular expression node with interpolation. 
+ interpolated = pm_interpolated_regular_expression_node_create(parser, &opening); + + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + pm_node_t *part = UP(pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped)); + + if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + // This is extremely strange, but the first string part of a + // regular expression will always be tagged as binary if we + // are in a US-ASCII file, no matter its contents. + pm_node_flag_set(part, PM_STRING_FLAGS_FORCED_BINARY_ENCODING); + } + + pm_interpolated_regular_expression_node_append(interpolated, part); + } else { + // If the first part of the body of the regular expression is not a + // string content, then we have interpolation and we need to create an + // interpolated regular expression node. + interpolated = pm_interpolated_regular_expression_node_create(parser, &opening); + } + + // Now that we're here and we have interpolation, we'll parse all of the + // parts into the list. + pm_node_t *part; + while (!match2(parser, PM_TOKEN_REGEXP_END, PM_TOKEN_EOF)) { + if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) != NULL) { + pm_interpolated_regular_expression_node_append(interpolated, part); + } + } + + pm_token_t closing = parser->current; + if (match1(parser, PM_TOKEN_EOF)) { + pm_parser_err_token(parser, &opening, PM_ERR_REGEXP_TERM); + closing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + } else { + expect1(parser, PM_TOKEN_REGEXP_END, PM_ERR_REGEXP_TERM); + } + + pm_interpolated_regular_expression_node_closing_set(parser, interpolated, &closing); + return UP(interpolated); + } + case PM_TOKEN_BACKTICK: + case PM_TOKEN_PERCENT_LOWER_X: { + parser_lex(parser); + pm_token_t opening = parser->previous; + + // When we get here, we don't know if this string is going to have + // interpolation or not, even though it is allowed. 
Still, we want to be + // able to return a string node without interpolation if we can since + // it'll be faster. + if (match1(parser, PM_TOKEN_STRING_END)) { + // If we get here, then we have an end immediately after a start. In + // that case we'll create an empty content token and return an + // uninterpolated string. + pm_token_t content = (pm_token_t) { + .type = PM_TOKEN_STRING_CONTENT, + .start = parser->previous.end, + .end = parser->previous.end + }; + + parser_lex(parser); + return UP(pm_xstring_node_create(parser, &opening, &content, &parser->previous)); + } + + pm_interpolated_x_string_node_t *node; + + if (match1(parser, PM_TOKEN_STRING_CONTENT)) { + // In this case we've hit string content so we know the string + // at least has something in it. We'll need to check if the + // following token is the end (in which case we can return a + // plain string) or if it's not then it has interpolation. + pm_string_t unescaped = parser->current_string; + pm_token_t content = parser->current; + parser_lex(parser); + + if (match1(parser, PM_TOKEN_STRING_END)) { + pm_node_t *node = UP(pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped)); + pm_node_flag_set(node, parse_unescaped_encoding(parser)); + parser_lex(parser); + return node; + } + + // If we get here, then we have interpolation so we'll need to + // create a string node with interpolation. + node = pm_interpolated_xstring_node_create(parser, &opening, &opening); + + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + + pm_node_t *part = UP(pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped)); + pm_node_flag_set(part, parse_unescaped_encoding(parser)); + + pm_interpolated_xstring_node_append(node, part); + } else { + // If the first part of the body of the string is not a string + // content, then we have interpolation and we need to create an + // interpolated string node. 
+ node = pm_interpolated_xstring_node_create(parser, &opening, &opening); + } + + pm_node_t *part; + while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { + if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) != NULL) { + pm_interpolated_xstring_node_append(node, part); + } + } + + pm_token_t closing = parser->current; + if (match1(parser, PM_TOKEN_EOF)) { + pm_parser_err_token(parser, &opening, PM_ERR_XSTRING_TERM); + closing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + } else { + expect1(parser, PM_TOKEN_STRING_END, PM_ERR_XSTRING_TERM); + } + pm_interpolated_xstring_node_closing_set(node, &closing); + + return UP(node); + } + case PM_TOKEN_USTAR: { + parser_lex(parser); + + // * operators at the beginning of expressions are only valid in the + // context of a multiple assignment. We enforce that here. We'll + // still lex past it though and create a missing node place. + if (binding_power != PM_BINDING_POWER_STATEMENT) { + pm_parser_err_prefix(parser, diag_id); + return UP(pm_missing_node_create(parser, parser->previous.start, parser->previous.end)); + } + + pm_token_t operator = parser->previous; + pm_node_t *name = NULL; + + if (token_begins_expression_p(parser->current.type)) { + name = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); + } + + pm_node_t *splat = UP(pm_splat_node_create(parser, &operator, name)); + + if (match1(parser, PM_TOKEN_COMMA)) { + return parse_targets_validate(parser, splat, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } else { + return parse_target_validate(parser, splat, true); + } + } + case PM_TOKEN_BANG: { + if (binding_power > PM_BINDING_POWER_UNARY) { + pm_parser_err_prefix(parser, PM_ERR_UNARY_DISALLOWED); + } + + parser_lex(parser); + + pm_token_t operator = parser->previous; + pm_node_t *receiver = parse_expression(parser, 
pm_binding_powers[parser->previous.type].right, binding_power < PM_BINDING_POWER_MATCH, false, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); + pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "!"); + + pm_conditional_predicate(parser, receiver, PM_CONDITIONAL_PREDICATE_TYPE_NOT); + return UP(node); + } + case PM_TOKEN_TILDE: { + if (binding_power > PM_BINDING_POWER_UNARY) { + pm_parser_err_prefix(parser, PM_ERR_UNARY_DISALLOWED); + } + parser_lex(parser); + + pm_token_t operator = parser->previous; + pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, false, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); + pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "~"); + + return UP(node); + } + case PM_TOKEN_UMINUS: { + if (binding_power > PM_BINDING_POWER_UNARY) { + pm_parser_err_prefix(parser, PM_ERR_UNARY_DISALLOWED); + } + parser_lex(parser); + + pm_token_t operator = parser->previous; + pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, false, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); + pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "-@"); + + return UP(node); + } + case PM_TOKEN_UMINUS_NUM: { + parser_lex(parser); + + pm_token_t operator = parser->previous; + pm_node_t *node = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, false, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); + + if (accept1(parser, PM_TOKEN_STAR_STAR)) { + pm_token_t exponent_operator = parser->previous; + pm_node_t *exponent = parse_expression(parser, pm_binding_powers[exponent_operator.type].right, false, false, PM_ERR_EXPECT_ARGUMENT, (uint16_t) (depth + 1)); + node = UP(pm_call_node_binary_create(parser, node, &exponent_operator, exponent, 0)); + node = UP(pm_call_node_unary_create(parser, &operator, node, "-@")); + } else { + switch (PM_NODE_TYPE(node)) { + case 
PM_INTEGER_NODE: + case PM_FLOAT_NODE: + case PM_RATIONAL_NODE: + case PM_IMAGINARY_NODE: + parse_negative_numeric(node); + break; + default: + node = UP(pm_call_node_unary_create(parser, &operator, node, "-@")); + break; + } + } + + return node; + } + case PM_TOKEN_MINUS_GREATER: { + int previous_lambda_enclosure_nesting = parser->lambda_enclosure_nesting; + parser->lambda_enclosure_nesting = parser->enclosure_nesting; + + size_t opening_newline_index = token_newline_index(parser); + pm_accepts_block_stack_push(parser, true); + parser_lex(parser); + + pm_token_t operator = parser->previous; + pm_parser_scope_push(parser, false); + + pm_block_parameters_node_t *block_parameters; + + switch (parser->current.type) { + case PM_TOKEN_PARENTHESIS_LEFT: { + pm_token_t opening = parser->current; + parser_lex(parser); + + if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + block_parameters = pm_block_parameters_node_create(parser, NULL, &opening); + } else { + block_parameters = parse_block_parameters(parser, false, &opening, true, true, (uint16_t) (depth + 1)); + } + + accept1(parser, PM_TOKEN_NEWLINE); + expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); + + pm_block_parameters_node_closing_set(block_parameters, &parser->previous); + break; + } + case PM_CASE_PARAMETER: { + pm_accepts_block_stack_push(parser, false); + pm_token_t opening = not_provided(parser); + block_parameters = parse_block_parameters(parser, false, &opening, true, false, (uint16_t) (depth + 1)); + pm_accepts_block_stack_pop(parser); + break; + } + default: { + block_parameters = NULL; + break; + } + } + + pm_token_t opening; + pm_node_t *body = NULL; + parser->lambda_enclosure_nesting = previous_lambda_enclosure_nesting; + + if (accept1(parser, PM_TOKEN_LAMBDA_BEGIN)) { + opening = parser->previous; + + if (!match1(parser, PM_TOKEN_BRACE_RIGHT)) { + body = UP(parse_statements(parser, PM_CONTEXT_LAMBDA_BRACES, (uint16_t) (depth + 1))); + } + + parser_warn_indentation_mismatch(parser, 
opening_newline_index, &operator, false, false); + expect1_opening(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_LAMBDA_TERM_BRACE, &opening); + } else { + expect1(parser, PM_TOKEN_KEYWORD_DO, PM_ERR_LAMBDA_OPEN); + opening = parser->previous; + + if (!match3(parser, PM_TOKEN_KEYWORD_END, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { + pm_accepts_block_stack_push(parser, true); + body = UP(parse_statements(parser, PM_CONTEXT_LAMBDA_DO_END, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { + assert(body == NULL || PM_NODE_TYPE_P(body, PM_STATEMENTS_NODE)); + body = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &operator, opening.start, (pm_statements_node_t *) body, PM_RESCUES_LAMBDA, (uint16_t) (depth + 1))); + } else { + parser_warn_indentation_mismatch(parser, opening_newline_index, &operator, false, false); + } + + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_LAMBDA_TERM_END, &operator); + } + + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, pm_parser_scope_toplevel_p(parser)); + pm_node_t *parameters = parse_blocklike_parameters(parser, UP(block_parameters), &operator, &parser->previous); + + pm_parser_scope_pop(parser); + pm_accepts_block_stack_pop(parser); + + return UP(pm_lambda_node_create(parser, &locals, &operator, &opening, &parser->previous, parameters, body)); + } + case PM_TOKEN_UPLUS: { + if (binding_power > PM_BINDING_POWER_UNARY) { + pm_parser_err_prefix(parser, PM_ERR_UNARY_DISALLOWED); + } + parser_lex(parser); + + pm_token_t operator = parser->previous; + pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, false, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); + pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "+@"); + + return UP(node); + } + case PM_TOKEN_STRING_BEGIN: + return parse_strings(parser, 
NULL, accepts_label, (uint16_t) (depth + 1)); + case PM_TOKEN_SYMBOL_BEGIN: { + pm_lex_mode_t lex_mode = *parser->lex_modes.current; + parser_lex(parser); + + return parse_symbol(parser, &lex_mode, PM_LEX_STATE_END, (uint16_t) (depth + 1)); + } + default: { + pm_context_t recoverable = context_recoverable(parser, &parser->current); + + if (recoverable != PM_CONTEXT_NONE) { + parser->recovering = true; + + // If the given error is not the generic one, then we'll add it + // here because it will provide more context in addition to the + // recoverable error that we will also add. + if (diag_id != PM_ERR_CANNOT_PARSE_EXPRESSION) { + pm_parser_err_prefix(parser, diag_id); + } + + // If we get here, then we are assuming this token is closing a + // parent context, so we'll indicate that to the user so that + // they know how we behaved. + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT, pm_token_type_human(parser->current.type), context_human(recoverable)); + } else if (diag_id == PM_ERR_CANNOT_PARSE_EXPRESSION) { + // We're going to make a special case here, because "cannot + // parse expression" is pretty generic, and we know here that we + // have an unexpected token. + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, pm_token_type_human(parser->current.type)); + } else { + pm_parser_err_prefix(parser, diag_id); + } + + return UP(pm_missing_node_create(parser, parser->previous.start, parser->previous.end)); + } + } +} + +/** + * Parse a value that is going to be written to some kind of variable or method + * call. We need to handle this separately because the rescue modifier is + * permitted on the end of the these expressions, which is a deviation from its + * normal binding power. + * + * Note that this will only be called after an operator write, as in &&=, ||=, + * or any of the binary operators that can be written to a variable. 
+ */ +static pm_node_t * +parse_assignment_value(pm_parser_t *parser, pm_binding_power_t previous_binding_power, pm_binding_power_t binding_power, bool accepts_command_call, pm_diagnostic_id_t diag_id, uint16_t depth) { + pm_node_t *value = parse_value_expression(parser, binding_power, previous_binding_power == PM_BINDING_POWER_ASSIGNMENT ? accepts_command_call : previous_binding_power < PM_BINDING_POWER_MATCH, false, diag_id, (uint16_t) (depth + 1)); + + // Contradicting binding powers, the right-hand-side value of the assignment + // allows the `rescue` modifier. + if (match1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) { + context_push(parser, PM_CONTEXT_RESCUE_MODIFIER); + + pm_token_t rescue = parser->current; + parser_lex(parser); + + pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, false, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); + context_pop(parser); + + return UP(pm_rescue_modifier_node_create(parser, value, &rescue, right)); + } + + return value; +} + +/** + * When a local variable write node is the value being written in a different + * write, the local variable is considered "used". 
+ */
static void
parse_assignment_value_local(pm_parser_t *parser, const pm_node_t *node) {
    // Begin and parentheses nodes wrap at most one child, so we step down
    // through them in a loop. Statements nodes fan out and are handled by
    // recursing into each statement.
    for (;;) {
        switch (PM_NODE_TYPE(node)) {
            case PM_BEGIN_NODE: {
                const pm_begin_node_t *begin = (const pm_begin_node_t *) node;
                if (begin->statements == NULL) return;

                node = (const pm_node_t *) begin->statements;
                continue;
            }
            case PM_PARENTHESES_NODE: {
                const pm_parentheses_node_t *parentheses = (const pm_parentheses_node_t *) node;
                if (parentheses->body == NULL) return;

                node = parentheses->body;
                continue;
            }
            case PM_LOCAL_VARIABLE_WRITE_NODE: {
                // This is the case we are looking for: record a read of the
                // local in the scope that the write refers to.
                const pm_local_variable_write_node_t *write = (const pm_local_variable_write_node_t *) node;
                pm_locals_read(&pm_parser_scope_find(parser, write->depth)->locals, write->name);
                return;
            }
            case PM_STATEMENTS_NODE: {
                const pm_statements_node_t *statements = (const pm_statements_node_t *) node;
                const pm_node_t *statement;

                PM_NODE_LIST_FOREACH(&statements->body, position, statement) {
                    parse_assignment_value_local(parser, statement);
                }
                return;
            }
            default:
                // Any other kind of node is left alone.
                return;
        }
    }
}

/**
 * Parse the value (or values, through an implicit array) that is going to be
 * written to some kind of variable or method call. We need to handle this
 * separately because the rescue modifier is permitted at the end of these
 * expressions, which is a deviation from its normal binding power.
 *
 * Additionally, if the value is a local variable write node (e.g., a = a = 1),
 * the "a" is marked as being used so the parser should not warn on it.
 *
 * Note that this will only be called after an = operator, as that is the only
 * operator that allows multiple values after it.
+ */ +static pm_node_t * +parse_assignment_values(pm_parser_t *parser, pm_binding_power_t previous_binding_power, pm_binding_power_t binding_power, bool accepts_command_call, pm_diagnostic_id_t diag_id, uint16_t depth) { + bool permitted = true; + if (previous_binding_power != PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_USTAR)) permitted = false; + + pm_node_t *value = parse_starred_expression(parser, binding_power, previous_binding_power == PM_BINDING_POWER_ASSIGNMENT ? accepts_command_call : previous_binding_power < PM_BINDING_POWER_MODIFIER, diag_id, (uint16_t) (depth + 1)); + if (!permitted) pm_parser_err_node(parser, value, PM_ERR_UNEXPECTED_MULTI_WRITE); + + parse_assignment_value_local(parser, value); + bool single_value = true; + + if (previous_binding_power == PM_BINDING_POWER_STATEMENT && (PM_NODE_TYPE_P(value, PM_SPLAT_NODE) || match1(parser, PM_TOKEN_COMMA))) { + single_value = false; + + pm_token_t opening = not_provided(parser); + pm_array_node_t *array = pm_array_node_create(parser, &opening); + + pm_array_node_elements_append(array, value); + value = UP(array); + + while (accept1(parser, PM_TOKEN_COMMA)) { + pm_node_t *element = parse_starred_expression(parser, binding_power, false, PM_ERR_ARRAY_ELEMENT, (uint16_t) (depth + 1)); + + pm_array_node_elements_append(array, element); + if (PM_NODE_TYPE_P(element, PM_MISSING_NODE)) break; + + parse_assignment_value_local(parser, element); + } + } + + // Contradicting binding powers, the right-hand-side value of the assignment + // allows the `rescue` modifier. + if ((single_value || (binding_power == (PM_BINDING_POWER_MULTI_ASSIGNMENT + 1))) && match1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) { + context_push(parser, PM_CONTEXT_RESCUE_MODIFIER); + + pm_token_t rescue = parser->current; + parser_lex(parser); + + bool accepts_command_call_inner = false; + + // RHS can accept command call iff the value is a call with arguments + // but without parenthesis. 
+ if (PM_NODE_TYPE_P(value, PM_CALL_NODE)) { + pm_call_node_t *call_node = (pm_call_node_t *) value; + if ((call_node->arguments != NULL) && (call_node->opening_loc.start == NULL)) { + accepts_command_call_inner = true; + } + } + + pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, accepts_command_call_inner, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); + context_pop(parser); + + return UP(pm_rescue_modifier_node_create(parser, value, &rescue, right)); + } + + return value; +} + +/** + * Ensure a call node that is about to become a call operator node does not + * have arguments or a block attached. If it does, then we'll need to add an + * error message and destroy the arguments/block. Ideally we would keep the node + * around so that consumers would still have access to it, but we don't have a + * great structure for that at the moment. + */ +static void +parse_call_operator_write(pm_parser_t *parser, pm_call_node_t *call_node, const pm_token_t *operator) { + if (call_node->arguments != NULL) { + pm_parser_err_token(parser, operator, PM_ERR_OPERATOR_WRITE_ARGUMENTS); + pm_node_unreference(parser, UP(call_node->arguments)); + pm_node_destroy(parser, UP(call_node->arguments)); + call_node->arguments = NULL; + } + + if (call_node->block != NULL) { + pm_parser_err_token(parser, operator, PM_ERR_OPERATOR_WRITE_BLOCK); + pm_node_unreference(parser, UP(call_node->block)); + pm_node_destroy(parser, UP(call_node->block)); + call_node->block = NULL; + } +} + +/** + * This struct is used to pass information between the regular expression parser + * and the named capture callback. + */ +typedef struct { + /** The parser that is parsing the regular expression. */ + pm_parser_t *parser; + + /** The call node wrapping the regular expression node. */ + pm_call_node_t *call; + + /** The match write node that is being created. */ + pm_match_write_node_t *match; + + /** The list of names that have been parsed. 
*/ + pm_constant_id_list_t names; + + /** + * Whether the content of the regular expression is shared. This impacts + * whether or not we used owned constants or shared constants in the + * constant pool for the names of the captures. + */ + bool shared; +} parse_regular_expression_named_capture_data_t; + +static inline const uint8_t * +pm_named_capture_escape_hex(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) { + cursor++; + + if (cursor < end && pm_char_is_hexadecimal_digit(*cursor)) { + uint8_t value = escape_hexadecimal_digit(*cursor); + cursor++; + + if (cursor < end && pm_char_is_hexadecimal_digit(*cursor)) { + value = (uint8_t) ((value << 4) | escape_hexadecimal_digit(*cursor)); + cursor++; + } + + pm_buffer_append_byte(unescaped, value); + } else { + pm_buffer_append_string(unescaped, "\\x", 2); + } + + return cursor; +} + +static inline const uint8_t * +pm_named_capture_escape_octal(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) { + uint8_t value = (uint8_t) (*cursor - '0'); + cursor++; + + if (cursor < end && pm_char_is_octal_digit(*cursor)) { + value = ((uint8_t) (value << 3)) | ((uint8_t) (*cursor - '0')); + cursor++; + + if (cursor < end && pm_char_is_octal_digit(*cursor)) { + value = ((uint8_t) (value << 3)) | ((uint8_t) (*cursor - '0')); + cursor++; + } + } + + pm_buffer_append_byte(unescaped, value); + return cursor; +} + +static inline const uint8_t * +pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end, const pm_location_t *error_location) { + const uint8_t *start = cursor - 1; + cursor++; + + if (cursor >= end) { + pm_buffer_append_string(unescaped, "\\u", 2); + return cursor; + } + + if (*cursor != '{') { + size_t length = pm_strspn_hexadecimal_digit(cursor, MIN(end - cursor, 4)); + uint32_t value = escape_unicode(parser, cursor, length, error_location); + + if (!pm_buffer_append_unicode_codepoint(unescaped, value)) { + 
pm_buffer_append_string(unescaped, (const char *) start, (size_t) ((cursor + length) - start)); + } + + return cursor + length; + } + + cursor++; + for (;;) { + while (cursor < end && *cursor == ' ') cursor++; + + if (cursor >= end) break; + if (*cursor == '}') { + cursor++; + break; + } + + size_t length = pm_strspn_hexadecimal_digit(cursor, end - cursor); + if (length == 0) { + break; + } + uint32_t value = escape_unicode(parser, cursor, length, error_location); + + (void) pm_buffer_append_unicode_codepoint(unescaped, value); + cursor += length; + } + + return cursor; +} + +static void +pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *source, const size_t length, const uint8_t *cursor, const pm_location_t *error_location) { + const uint8_t *end = source + length; + pm_buffer_append_string(unescaped, (const char *) source, (size_t) (cursor - source)); + + for (;;) { + if (++cursor >= end) { + pm_buffer_append_byte(unescaped, '\\'); + return; + } + + switch (*cursor) { + case 'x': + cursor = pm_named_capture_escape_hex(unescaped, cursor, end); + break; + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': + cursor = pm_named_capture_escape_octal(unescaped, cursor, end); + break; + case 'u': + cursor = pm_named_capture_escape_unicode(parser, unescaped, cursor, end, error_location); + break; + default: + pm_buffer_append_byte(unescaped, '\\'); + break; + } + + const uint8_t *next_cursor = pm_memchr(cursor, '\\', (size_t) (end - cursor), parser->encoding_changed, parser->encoding); + if (next_cursor == NULL) break; + + pm_buffer_append_string(unescaped, (const char *) cursor, (size_t) (next_cursor - cursor)); + cursor = next_cursor; + } + + pm_buffer_append_string(unescaped, (const char *) cursor, (size_t) (end - cursor)); +} + +/** + * This callback is called when the regular expression parser encounters a named + * capture group. 
+ */ +static void +parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { + parse_regular_expression_named_capture_data_t *callback_data = (parse_regular_expression_named_capture_data_t *) data; + + pm_parser_t *parser = callback_data->parser; + pm_call_node_t *call = callback_data->call; + pm_constant_id_list_t *names = &callback_data->names; + + const uint8_t *source = pm_string_source(capture); + size_t length = pm_string_length(capture); + pm_buffer_t unescaped = { 0 }; + + // First, we need to handle escapes within the name of the capture group. + // This is because regular expressions have three different representations + // in prism. The first is the plain source code. The second is the + // representation that will be sent to the regular expression engine, which + // is the value of the "unescaped" field. This is poorly named, because it + // actually still contains escapes, just a subset of them that the regular + // expression engine knows how to handle. The third representation is fully + // unescaped, which is what we need. + const uint8_t *cursor = pm_memchr(source, '\\', length, parser->encoding_changed, parser->encoding); + if (PRISM_UNLIKELY(cursor != NULL)) { + pm_named_capture_escape(parser, &unescaped, source, length, cursor, callback_data->shared ? NULL : &call->receiver->location); + source = (const uint8_t *) pm_buffer_value(&unescaped); + length = pm_buffer_length(&unescaped); + } + + pm_location_t location; + pm_constant_id_t name; + + // If the name of the capture group isn't a valid identifier, we do + // not add it to the local table. + if (!pm_slice_is_valid_local(parser, source, source + length)) { + pm_buffer_free(&unescaped); + return; + } + + if (callback_data->shared) { + // If the unescaped string is a slice of the source, then we can + // copy the names directly. The pointers will line up. 
+ location = (pm_location_t) { .start = source, .end = source + length }; + name = pm_parser_constant_id_location(parser, location.start, location.end); + } else { + // Otherwise, the name is a slice of the malloc-ed owned string, + // in which case we need to copy it out into a new string. + location = (pm_location_t) { .start = call->receiver->location.start, .end = call->receiver->location.end }; + + void *memory = xmalloc(length); + if (memory == NULL) abort(); + + memcpy(memory, source, length); + name = pm_parser_constant_id_owned(parser, (uint8_t *) memory, length); + } + + // Add this name to the list of constants if it is valid, not duplicated, + // and not a keyword. + if (name != 0 && !pm_constant_id_list_includes(names, name)) { + pm_constant_id_list_append(names, name); + + int depth; + if ((depth = pm_parser_local_depth_constant_id(parser, name)) == -1) { + // If the local is not already a local but it is a keyword, then we + // do not want to add a capture for this. + if (pm_local_is_keyword((const char *) source, length)) { + pm_buffer_free(&unescaped); + return; + } + + // If the identifier is not already a local, then we will add it to + // the local table. + pm_parser_local_add(parser, name, location.start, location.end, 0); + } + + // Here we lazily create the MatchWriteNode since we know we're + // about to add a target. + if (callback_data->match == NULL) { + callback_data->match = pm_match_write_node_create(parser, call); + } + + // Next, create the local variable target and add it to the list of + // targets for the match. + pm_node_t *target = UP(pm_local_variable_target_node_create(parser, &location, name, depth == -1 ? 0 : (uint32_t) depth)); + pm_node_list_append(&callback_data->match->targets, target); + } + + pm_buffer_free(&unescaped); +} + +/** + * Potentially change a =~ with a regular expression with named captures into a + * match write node. 
+ */ +static pm_node_t * +parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call, bool extended_mode) { + parse_regular_expression_named_capture_data_t callback_data = { + .parser = parser, + .call = call, + .names = { 0 }, + .shared = content->type == PM_STRING_SHARED + }; + + parse_regular_expression_error_data_t error_data = { + .parser = parser, + .start = call->receiver->location.start, + .end = call->receiver->location.end, + .shared = content->type == PM_STRING_SHARED + }; + + pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), extended_mode, parse_regular_expression_named_capture, &callback_data, parse_regular_expression_error, &error_data); + pm_constant_id_list_free(&callback_data.names); + + if (callback_data.match != NULL) { + return UP(callback_data.match); + } else { + return UP(call); + } +} + +static inline pm_node_t * +parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t previous_binding_power, pm_binding_power_t binding_power, bool accepts_command_call, uint16_t depth) { + pm_token_t token = parser->current; + + switch (token.type) { + case PM_TOKEN_EQUAL: { + switch (PM_NODE_TYPE(node)) { + case PM_CALL_NODE: { + // If we have no arguments to the call node and we need this + // to be a target then this is either a method call or a + // local variable write. This _must_ happen before the value + // is parsed because it could be referenced in the value. + pm_call_node_t *call_node = (pm_call_node_t *) node; + if (PM_NODE_FLAG_P(call_node, PM_CALL_NODE_FLAGS_VARIABLE_CALL)) { + pm_parser_local_add_location(parser, call_node->message_loc.start, call_node->message_loc.end, 0); + } + } + PRISM_FALLTHROUGH + case PM_CASE_WRITABLE: { + // When we have `it = value`, we need to add `it` as a local + // variable before parsing the value, in case the value + // references the variable. 
+ if (PM_NODE_TYPE_P(node, PM_IT_LOCAL_VARIABLE_READ_NODE)) { + pm_parser_local_add_location(parser, node->location.start, node->location.end, 0); + } + + parser_lex(parser); + pm_node_t *value = parse_assignment_values(parser, previous_binding_power, PM_NODE_TYPE_P(node, PM_MULTI_TARGET_NODE) ? PM_BINDING_POWER_MULTI_ASSIGNMENT + 1 : binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL, (uint16_t) (depth + 1)); + + if (PM_NODE_TYPE_P(node, PM_MULTI_TARGET_NODE) && previous_binding_power != PM_BINDING_POWER_STATEMENT) { + pm_parser_err_node(parser, node, PM_ERR_UNEXPECTED_MULTI_WRITE); + } + + return parse_write(parser, node, &token, value); + } + case PM_SPLAT_NODE: { + pm_multi_target_node_t *multi_target = pm_multi_target_node_create(parser); + pm_multi_target_node_targets_append(parser, multi_target, node); + + parser_lex(parser); + pm_node_t *value = parse_assignment_values(parser, previous_binding_power, PM_BINDING_POWER_MULTI_ASSIGNMENT + 1, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL, (uint16_t) (depth + 1)); + return parse_write(parser, UP(multi_target), &token, value); + } + case PM_SOURCE_ENCODING_NODE: + case PM_FALSE_NODE: + case PM_SOURCE_FILE_NODE: + case PM_SOURCE_LINE_NODE: + case PM_NIL_NODE: + case PM_SELF_NODE: + case PM_TRUE_NODE: { + // In these special cases, we have specific error messages + // and we will replace them with local variable writes. + parser_lex(parser); + pm_node_t *value = parse_assignment_values(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL, (uint16_t) (depth + 1)); + return parse_unwriteable_write(parser, node, &token, value); + } + default: + // In this case we have an = sign, but we don't know what + // it's for. We need to treat it as an error. We'll mark it + // as an error and skip past it. 
+ parser_lex(parser); + pm_parser_err_token(parser, &token, PM_ERR_EXPRESSION_NOT_WRITABLE); + return node; + } + } + case PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL: { + switch (PM_NODE_TYPE(node)) { + case PM_BACK_REFERENCE_READ_NODE: + case PM_NUMBERED_REFERENCE_READ_NODE: + PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, PM_ERR_WRITE_TARGET_READONLY); + PRISM_FALLTHROUGH + case PM_GLOBAL_VARIABLE_READ_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_global_variable_and_write_node_create(parser, node, &token, value)); + + pm_node_destroy(parser, node); + return result; + } + case PM_CLASS_VARIABLE_READ_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_class_variable_and_write_node_create(parser, (pm_class_variable_read_node_t *) node, &token, value)); + + pm_node_destroy(parser, node); + return result; + } + case PM_CONSTANT_PATH_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *write = UP(pm_constant_path_and_write_node_create(parser, (pm_constant_path_node_t *) node, &token, value)); + + return parse_shareable_constant_write(parser, write); + } + case PM_CONSTANT_READ_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *write = UP(pm_constant_and_write_node_create(parser, (pm_constant_read_node_t *) node, &token, value)); + + pm_node_destroy(parser, node); + 
return parse_shareable_constant_write(parser, write); + } + case PM_INSTANCE_VARIABLE_READ_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_instance_variable_and_write_node_create(parser, (pm_instance_variable_read_node_t *) node, &token, value)); + + pm_node_destroy(parser, node); + return result; + } + case PM_IT_LOCAL_VARIABLE_READ_NODE: { + pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2); + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_and_write_node_create(parser, node, &token, value, name, 0)); + + pm_node_unreference(parser, node); + pm_node_destroy(parser, node); + return result; + } + case PM_LOCAL_VARIABLE_READ_NODE: { + if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) { + PM_PARSER_ERR_FORMAT(parser, node->location.start, node->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, node->location.start); + pm_node_unreference(parser, node); + } + + pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node; + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_and_write_node_create(parser, node, &token, value, cast->name, cast->depth)); + + pm_node_destroy(parser, node); + return result; + } + case PM_CALL_NODE: { + pm_call_node_t *cast = (pm_call_node_t *) node; + + // If we have a vcall (a method with no arguments and no + // receiver that could have been a local variable) then we + // will transform it into a local 
variable write. + if (PM_NODE_FLAG_P(cast, PM_CALL_NODE_FLAGS_VARIABLE_CALL)) { + pm_location_t *message_loc = &cast->message_loc; + pm_refute_numbered_parameter(parser, message_loc->start, message_loc->end); + + pm_constant_id_t constant_id = pm_parser_local_add_location(parser, message_loc->start, message_loc->end, 1); + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_and_write_node_create(parser, UP(cast), &token, value, constant_id, 0)); + + pm_node_destroy(parser, UP(cast)); + return result; + } + + // Move past the token here so that we have already added + // the local variable by this point. + parser_lex(parser); + + // If there is no call operator and the message is "[]" then + // this is an aref expression, and we can transform it into + // an aset expression. + if (PM_NODE_FLAG_P(cast, PM_CALL_NODE_FLAGS_INDEX)) { + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + return UP(pm_index_and_write_node_create(parser, cast, &token, value)); + } + + // If this node cannot be writable, then we have an error. 
+ if (pm_call_node_writable_p(parser, cast)) { + parse_write_name(parser, &cast->name); + } else { + pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_UNEXPECTED); + } + + parse_call_operator_write(parser, cast, &token); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + return UP(pm_call_and_write_node_create(parser, cast, &token, value)); + } + case PM_MULTI_WRITE_NODE: { + parser_lex(parser); + pm_parser_err_token(parser, &token, PM_ERR_AMPAMPEQ_MULTI_ASSIGN); + return node; + } + default: + parser_lex(parser); + + // In this case we have an &&= sign, but we don't know what it's for. + // We need to treat it as an error. For now, we'll mark it as an error + // and just skip right past it. + pm_parser_err_token(parser, &token, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ); + return node; + } + } + case PM_TOKEN_PIPE_PIPE_EQUAL: { + switch (PM_NODE_TYPE(node)) { + case PM_BACK_REFERENCE_READ_NODE: + case PM_NUMBERED_REFERENCE_READ_NODE: + PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, PM_ERR_WRITE_TARGET_READONLY); + PRISM_FALLTHROUGH + case PM_GLOBAL_VARIABLE_READ_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_global_variable_or_write_node_create(parser, node, &token, value)); + + pm_node_destroy(parser, node); + return result; + } + case PM_CLASS_VARIABLE_READ_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_class_variable_or_write_node_create(parser, (pm_class_variable_read_node_t *) node, &token, value)); + + pm_node_destroy(parser, node); + return 
result; + } + case PM_CONSTANT_PATH_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *write = UP(pm_constant_path_or_write_node_create(parser, (pm_constant_path_node_t *) node, &token, value)); + + return parse_shareable_constant_write(parser, write); + } + case PM_CONSTANT_READ_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *write = UP(pm_constant_or_write_node_create(parser, (pm_constant_read_node_t *) node, &token, value)); + + pm_node_destroy(parser, node); + return parse_shareable_constant_write(parser, write); + } + case PM_INSTANCE_VARIABLE_READ_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_instance_variable_or_write_node_create(parser, (pm_instance_variable_read_node_t *) node, &token, value)); + + pm_node_destroy(parser, node); + return result; + } + case PM_IT_LOCAL_VARIABLE_READ_NODE: { + pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2); + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_or_write_node_create(parser, node, &token, value, name, 0)); + + pm_node_unreference(parser, node); + pm_node_destroy(parser, node); + return result; + } + case PM_LOCAL_VARIABLE_READ_NODE: { + if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) { + PM_PARSER_ERR_FORMAT(parser, 
node->location.start, node->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, node->location.start); + pm_node_unreference(parser, node); + } + + pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node; + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_or_write_node_create(parser, node, &token, value, cast->name, cast->depth)); + + pm_node_destroy(parser, node); + return result; + } + case PM_CALL_NODE: { + pm_call_node_t *cast = (pm_call_node_t *) node; + + // If we have a vcall (a method with no arguments and no + // receiver that could have been a local variable) then we + // will transform it into a local variable write. + if (PM_NODE_FLAG_P(cast, PM_CALL_NODE_FLAGS_VARIABLE_CALL)) { + pm_location_t *message_loc = &cast->message_loc; + pm_refute_numbered_parameter(parser, message_loc->start, message_loc->end); + + pm_constant_id_t constant_id = pm_parser_local_add_location(parser, message_loc->start, message_loc->end, 1); + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_or_write_node_create(parser, UP(cast), &token, value, constant_id, 0)); + + pm_node_destroy(parser, UP(cast)); + return result; + } + + // Move past the token here so that we have already added + // the local variable by this point. + parser_lex(parser); + + // If there is no call operator and the message is "[]" then + // this is an aref expression, and we can transform it into + // an aset expression. 
+ if (PM_NODE_FLAG_P(cast, PM_CALL_NODE_FLAGS_INDEX)) { + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + return UP(pm_index_or_write_node_create(parser, cast, &token, value)); + } + + // If this node cannot be writable, then we have an error. + if (pm_call_node_writable_p(parser, cast)) { + parse_write_name(parser, &cast->name); + } else { + pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_UNEXPECTED); + } + + parse_call_operator_write(parser, cast, &token); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + return UP(pm_call_or_write_node_create(parser, cast, &token, value)); + } + case PM_MULTI_WRITE_NODE: { + parser_lex(parser); + pm_parser_err_token(parser, &token, PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN); + return node; + } + default: + parser_lex(parser); + + // In this case we have an ||= sign, but we don't know what it's for. + // We need to treat it as an error. For now, we'll mark it as an error + // and just skip right past it. 
+ pm_parser_err_token(parser, &token, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ); + return node; + } + } + case PM_TOKEN_AMPERSAND_EQUAL: + case PM_TOKEN_CARET_EQUAL: + case PM_TOKEN_GREATER_GREATER_EQUAL: + case PM_TOKEN_LESS_LESS_EQUAL: + case PM_TOKEN_MINUS_EQUAL: + case PM_TOKEN_PERCENT_EQUAL: + case PM_TOKEN_PIPE_EQUAL: + case PM_TOKEN_PLUS_EQUAL: + case PM_TOKEN_SLASH_EQUAL: + case PM_TOKEN_STAR_EQUAL: + case PM_TOKEN_STAR_STAR_EQUAL: { + switch (PM_NODE_TYPE(node)) { + case PM_BACK_REFERENCE_READ_NODE: + case PM_NUMBERED_REFERENCE_READ_NODE: + PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, PM_ERR_WRITE_TARGET_READONLY); + PRISM_FALLTHROUGH + case PM_GLOBAL_VARIABLE_READ_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_global_variable_operator_write_node_create(parser, node, &token, value)); + + pm_node_destroy(parser, node); + return result; + } + case PM_CLASS_VARIABLE_READ_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_class_variable_operator_write_node_create(parser, (pm_class_variable_read_node_t *) node, &token, value)); + + pm_node_destroy(parser, node); + return result; + } + case PM_CONSTANT_PATH_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *write = UP(pm_constant_path_operator_write_node_create(parser, (pm_constant_path_node_t *) node, &token, value)); + + return parse_shareable_constant_write(parser, write); + } + case PM_CONSTANT_READ_NODE: { + parser_lex(parser); + + pm_node_t *value = 
parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *write = UP(pm_constant_operator_write_node_create(parser, (pm_constant_read_node_t *) node, &token, value)); + + pm_node_destroy(parser, node); + return parse_shareable_constant_write(parser, write); + } + case PM_INSTANCE_VARIABLE_READ_NODE: { + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_instance_variable_operator_write_node_create(parser, (pm_instance_variable_read_node_t *) node, &token, value)); + + pm_node_destroy(parser, node); + return result; + } + case PM_IT_LOCAL_VARIABLE_READ_NODE: { + pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2); + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_operator_write_node_create(parser, node, &token, value, name, 0)); + + pm_node_unreference(parser, node); + pm_node_destroy(parser, node); + return result; + } + case PM_LOCAL_VARIABLE_READ_NODE: { + if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) { + PM_PARSER_ERR_FORMAT(parser, node->location.start, node->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, node->location.start); + pm_node_unreference(parser, node); + } + + pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node; + parser_lex(parser); + + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_operator_write_node_create(parser, node, 
&token, value, cast->name, cast->depth)); + + pm_node_destroy(parser, node); + return result; + } + case PM_CALL_NODE: { + parser_lex(parser); + pm_call_node_t *cast = (pm_call_node_t *) node; + + // If we have a vcall (a method with no arguments and no + // receiver that could have been a local variable) then we + // will transform it into a local variable write. + if (PM_NODE_FLAG_P(cast, PM_CALL_NODE_FLAGS_VARIABLE_CALL)) { + pm_location_t *message_loc = &cast->message_loc; + pm_refute_numbered_parameter(parser, message_loc->start, message_loc->end); + + pm_constant_id_t constant_id = pm_parser_local_add_location(parser, message_loc->start, message_loc->end, 1); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_operator_write_node_create(parser, UP(cast), &token, value, constant_id, 0)); + + pm_node_destroy(parser, UP(cast)); + return result; + } + + // If there is no call operator and the message is "[]" then + // this is an aref expression, and we can transform it into + // an aset expression. + if (PM_NODE_FLAG_P(cast, PM_CALL_NODE_FLAGS_INDEX)) { + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + return UP(pm_index_operator_write_node_create(parser, cast, &token, value)); + } + + // If this node cannot be writable, then we have an error. 
+ if (pm_call_node_writable_p(parser, cast)) { + parse_write_name(parser, &cast->name); + } else { + pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_UNEXPECTED); + } + + parse_call_operator_write(parser, cast, &token); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + return UP(pm_call_operator_write_node_create(parser, cast, &token, value)); + } + case PM_MULTI_WRITE_NODE: { + parser_lex(parser); + pm_parser_err_token(parser, &token, PM_ERR_OPERATOR_MULTI_ASSIGN); + return node; + } + default: + parser_lex(parser); + + // In this case we have an operator but we don't know what it's for. + // We need to treat it as an error. For now, we'll mark it as an error + // and just skip right past it. + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, pm_token_type_human(parser->current.type)); + return node; + } + } + case PM_TOKEN_AMPERSAND_AMPERSAND: + case PM_TOKEN_KEYWORD_AND: { + parser_lex(parser); + + pm_node_t *right = parse_expression(parser, binding_power, parser->previous.type == PM_TOKEN_KEYWORD_AND, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + return UP(pm_and_node_create(parser, node, &token, right)); + } + case PM_TOKEN_KEYWORD_OR: + case PM_TOKEN_PIPE_PIPE: { + parser_lex(parser); + + pm_node_t *right = parse_expression(parser, binding_power, parser->previous.type == PM_TOKEN_KEYWORD_OR, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + return UP(pm_or_node_create(parser, node, &token, right)); + } + case PM_TOKEN_EQUAL_TILDE: { + // Note that we _must_ parse the value before adding the local + // variables in order to properly mirror the behavior of Ruby. For + // example, + // + // /(?<foo>bar)/ =~ foo + // + // In this case, `foo` should be a method call and not a local yet. 
+ parser_lex(parser); + pm_node_t *argument = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + + // By default, we're going to create a call node and then return it. + pm_call_node_t *call = pm_call_node_binary_create(parser, node, &token, argument, 0); + pm_node_t *result = UP(call); + + // If the receiver of this =~ is a regular expression node, then we + // need to introduce local variables for it based on its named + // capture groups. + if (PM_NODE_TYPE_P(node, PM_INTERPOLATED_REGULAR_EXPRESSION_NODE)) { + // It's possible to have an interpolated regular expression node + // that only contains strings. This is because it can be split + // up by a heredoc. In this case we need to concat the unescaped + // strings together and then parse them as a regular expression. + pm_node_list_t *parts = &((pm_interpolated_regular_expression_node_t *) node)->parts; + + bool interpolated = false; + size_t total_length = 0; + + pm_node_t *part; + PM_NODE_LIST_FOREACH(parts, index, part) { + if (PM_NODE_TYPE_P(part, PM_STRING_NODE)) { + total_length += pm_string_length(&((pm_string_node_t *) part)->unescaped); + } else { + interpolated = true; + break; + } + } + + if (!interpolated && total_length > 0) { + void *memory = xmalloc(total_length); + if (!memory) abort(); + + uint8_t *cursor = memory; + PM_NODE_LIST_FOREACH(parts, index, part) { + pm_string_t *unescaped = &((pm_string_node_t *) part)->unescaped; + size_t length = pm_string_length(unescaped); + + memcpy(cursor, pm_string_source(unescaped), length); + cursor += length; + } + + pm_string_t owned; + pm_string_owned_init(&owned, (uint8_t *) memory, total_length); + + result = parse_regular_expression_named_captures(parser, &owned, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED)); + pm_string_free(&owned); + } + } else if (PM_NODE_TYPE_P(node, PM_REGULAR_EXPRESSION_NODE)) { + // If we have a regular expression node, then we can just 
parse + // the named captures directly off the unescaped string. + const pm_string_t *content = &((pm_regular_expression_node_t *) node)->unescaped; + result = parse_regular_expression_named_captures(parser, content, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED)); + } + + return result; + } + case PM_TOKEN_UAMPERSAND: + case PM_TOKEN_USTAR: + case PM_TOKEN_USTAR_STAR: + // The only times this will occur are when we are in an error state, + // but we'll put them in here so that errors can propagate. + case PM_TOKEN_BANG_EQUAL: + case PM_TOKEN_BANG_TILDE: + case PM_TOKEN_EQUAL_EQUAL: + case PM_TOKEN_EQUAL_EQUAL_EQUAL: + case PM_TOKEN_LESS_EQUAL_GREATER: + case PM_TOKEN_CARET: + case PM_TOKEN_PIPE: + case PM_TOKEN_AMPERSAND: + case PM_TOKEN_GREATER_GREATER: + case PM_TOKEN_LESS_LESS: + case PM_TOKEN_MINUS: + case PM_TOKEN_PLUS: + case PM_TOKEN_PERCENT: + case PM_TOKEN_SLASH: + case PM_TOKEN_STAR: + case PM_TOKEN_STAR_STAR: { + parser_lex(parser); + pm_token_t operator = parser->previous; + switch (PM_NODE_TYPE(node)) { + case PM_RESCUE_MODIFIER_NODE: { + pm_rescue_modifier_node_t *cast = (pm_rescue_modifier_node_t *) node; + if (PM_NODE_TYPE_P(cast->rescue_expression, PM_MATCH_PREDICATE_NODE) || PM_NODE_TYPE_P(cast->rescue_expression, PM_MATCH_REQUIRED_NODE)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(operator.type)); + } + break; + } + case PM_AND_NODE: { + pm_and_node_t *cast = (pm_and_node_t *) node; + if (PM_NODE_TYPE_P(cast->right, PM_MATCH_PREDICATE_NODE) || PM_NODE_TYPE_P(cast->right, PM_MATCH_REQUIRED_NODE)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(operator.type)); + } + break; + } + case PM_OR_NODE: { + pm_or_node_t *cast = (pm_or_node_t *) node; + if (PM_NODE_TYPE_P(cast->right, PM_MATCH_PREDICATE_NODE) || PM_NODE_TYPE_P(cast->right, PM_MATCH_REQUIRED_NODE)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, operator, 
PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(operator.type)); + } + break; + } + default: + break; + } + + pm_node_t *argument = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + return UP(pm_call_node_binary_create(parser, node, &token, argument, 0)); + } + case PM_TOKEN_GREATER: + case PM_TOKEN_GREATER_EQUAL: + case PM_TOKEN_LESS: + case PM_TOKEN_LESS_EQUAL: { + if (PM_NODE_TYPE_P(node, PM_CALL_NODE) && PM_NODE_FLAG_P(node, PM_CALL_NODE_FLAGS_COMPARISON)) { + PM_PARSER_WARN_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_WARN_COMPARISON_AFTER_COMPARISON); + } + + parser_lex(parser); + pm_node_t *argument = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + return UP(pm_call_node_binary_create(parser, node, &token, argument, PM_CALL_NODE_FLAGS_COMPARISON)); + } + case PM_TOKEN_AMPERSAND_DOT: + case PM_TOKEN_DOT: { + parser_lex(parser); + pm_token_t operator = parser->previous; + pm_arguments_t arguments = { 0 }; + + // This if statement handles the foo.() syntax. 
+ if (match1(parser, PM_TOKEN_PARENTHESIS_LEFT)) { + parse_arguments_list(parser, &arguments, true, false, (uint16_t) (depth + 1)); + return UP(pm_call_node_shorthand_create(parser, node, &operator, &arguments)); + } + + switch (PM_NODE_TYPE(node)) { + case PM_RESCUE_MODIFIER_NODE: { + pm_rescue_modifier_node_t *cast = (pm_rescue_modifier_node_t *) node; + if (PM_NODE_TYPE_P(cast->rescue_expression, PM_MATCH_PREDICATE_NODE) || PM_NODE_TYPE_P(cast->rescue_expression, PM_MATCH_REQUIRED_NODE)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(operator.type)); + } + break; + } + case PM_AND_NODE: { + pm_and_node_t *cast = (pm_and_node_t *) node; + if (PM_NODE_TYPE_P(cast->right, PM_MATCH_PREDICATE_NODE) || PM_NODE_TYPE_P(cast->right, PM_MATCH_REQUIRED_NODE)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(operator.type)); + } + break; + } + case PM_OR_NODE: { + pm_or_node_t *cast = (pm_or_node_t *) node; + if (PM_NODE_TYPE_P(cast->right, PM_MATCH_PREDICATE_NODE) || PM_NODE_TYPE_P(cast->right, PM_MATCH_REQUIRED_NODE)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(operator.type)); + } + break; + } + default: + break; + } + + pm_token_t message; + + switch (parser->current.type) { + case PM_CASE_OPERATOR: + case PM_CASE_KEYWORD: + case PM_TOKEN_CONSTANT: + case PM_TOKEN_IDENTIFIER: + case PM_TOKEN_METHOD_NAME: { + parser_lex(parser); + message = parser->previous; + break; + } + default: { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_MESSAGE, pm_token_type_human(parser->current.type)); + message = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + } + } + + parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1)); + pm_call_node_t *call = pm_call_node_call_create(parser, node, &operator, &message, 
&arguments); + + if ( + (previous_binding_power == PM_BINDING_POWER_STATEMENT) && + arguments.arguments == NULL && + arguments.opening_loc.start == NULL && + match1(parser, PM_TOKEN_COMMA) + ) { + return parse_targets_validate(parser, UP(call), PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } else { + return UP(call); + } + } + case PM_TOKEN_DOT_DOT: + case PM_TOKEN_DOT_DOT_DOT: { + parser_lex(parser); + + pm_node_t *right = NULL; + if (token_begins_expression_p(parser->current.type)) { + right = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + } + + return UP(pm_range_node_create(parser, node, &token, right)); + } + case PM_TOKEN_KEYWORD_IF_MODIFIER: { + pm_token_t keyword = parser->current; + parser_lex(parser); + + pm_node_t *predicate = parse_value_expression(parser, binding_power, true, false, PM_ERR_CONDITIONAL_IF_PREDICATE, (uint16_t) (depth + 1)); + return UP(pm_if_node_modifier_create(parser, node, &keyword, predicate)); + } + case PM_TOKEN_KEYWORD_UNLESS_MODIFIER: { + pm_token_t keyword = parser->current; + parser_lex(parser); + + pm_node_t *predicate = parse_value_expression(parser, binding_power, true, false, PM_ERR_CONDITIONAL_UNLESS_PREDICATE, (uint16_t) (depth + 1)); + return UP(pm_unless_node_modifier_create(parser, node, &keyword, predicate)); + } + case PM_TOKEN_KEYWORD_UNTIL_MODIFIER: { + parser_lex(parser); + pm_statements_node_t *statements = pm_statements_node_create(parser); + pm_statements_node_body_append(parser, statements, node, true); + + pm_node_t *predicate = parse_value_expression(parser, binding_power, true, false, PM_ERR_CONDITIONAL_UNTIL_PREDICATE, (uint16_t) (depth + 1)); + return UP(pm_until_node_modifier_create(parser, &token, predicate, statements, PM_NODE_TYPE_P(node, PM_BEGIN_NODE) ? 
PM_LOOP_FLAGS_BEGIN_MODIFIER : 0)); + } + case PM_TOKEN_KEYWORD_WHILE_MODIFIER: { + parser_lex(parser); + pm_statements_node_t *statements = pm_statements_node_create(parser); + pm_statements_node_body_append(parser, statements, node, true); + + pm_node_t *predicate = parse_value_expression(parser, binding_power, true, false, PM_ERR_CONDITIONAL_WHILE_PREDICATE, (uint16_t) (depth + 1)); + return UP(pm_while_node_modifier_create(parser, &token, predicate, statements, PM_NODE_TYPE_P(node, PM_BEGIN_NODE) ? PM_LOOP_FLAGS_BEGIN_MODIFIER : 0)); + } + case PM_TOKEN_QUESTION_MARK: { + context_push(parser, PM_CONTEXT_TERNARY); + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + pm_token_t qmark = parser->current; + parser_lex(parser); + + pm_node_t *true_expression = parse_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_TERNARY_EXPRESSION_TRUE, (uint16_t) (depth + 1)); + + if (parser->recovering) { + // If parsing the true expression of this ternary resulted in a syntax + // error that we can recover from, then we're going to put missing nodes + // and tokens into the remaining places. We want to be sure to do this + // before the `expect` function call to make sure it doesn't + // accidentally move past a ':' token that occurs after the syntax + // error. 
+ pm_token_t colon = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + pm_node_t *false_expression = UP(pm_missing_node_create(parser, colon.start, colon.end)); + + context_pop(parser); + pop_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + return UP(pm_if_node_ternary_create(parser, node, &qmark, true_expression, &colon, false_expression)); + } + + accept1(parser, PM_TOKEN_NEWLINE); + expect1(parser, PM_TOKEN_COLON, PM_ERR_TERNARY_COLON); + + pm_token_t colon = parser->previous; + pm_node_t *false_expression = parse_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_TERNARY_EXPRESSION_FALSE, (uint16_t) (depth + 1)); + + context_pop(parser); + pop_block_exits(parser, previous_block_exits); + pm_node_list_free(¤t_block_exits); + + return UP(pm_if_node_ternary_create(parser, node, &qmark, true_expression, &colon, false_expression)); + } + case PM_TOKEN_COLON_COLON: { + parser_lex(parser); + pm_token_t delimiter = parser->previous; + + switch (parser->current.type) { + case PM_TOKEN_CONSTANT: { + parser_lex(parser); + pm_node_t *path; + + if ( + (parser->current.type == PM_TOKEN_PARENTHESIS_LEFT) || + (accepts_command_call && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR))) + ) { + // If we have a constant immediately following a '::' operator, then + // this can either be a constant path or a method call, depending on + // what follows the constant. + // + // If we have parentheses, then this is a method call. That would + // look like Foo::Bar(). + pm_token_t message = parser->previous; + pm_arguments_t arguments = { 0 }; + + parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1)); + path = UP(pm_call_node_call_create(parser, node, &delimiter, &message, &arguments)); + } else { + // Otherwise, this is a constant path. That would look like Foo::Bar. 
+ path = UP(pm_constant_path_node_create(parser, node, &delimiter, &parser->previous)); + } + + // If this is followed by a comma then it is a multiple assignment. + if (previous_binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { + return parse_targets_validate(parser, path, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } + + return path; + } + case PM_CASE_OPERATOR: + case PM_CASE_KEYWORD: + case PM_TOKEN_IDENTIFIER: + case PM_TOKEN_METHOD_NAME: { + parser_lex(parser); + pm_token_t message = parser->previous; + + // If we have an identifier following a '::' operator, then it is for + // sure a method call. + pm_arguments_t arguments = { 0 }; + parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1)); + pm_call_node_t *call = pm_call_node_call_create(parser, node, &delimiter, &message, &arguments); + + // If this is followed by a comma then it is a multiple assignment. + if (previous_binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { + return parse_targets_validate(parser, UP(call), PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } + + return UP(call); + } + case PM_TOKEN_PARENTHESIS_LEFT: { + // If we have a parenthesis following a '::' operator, then it is the + // method call shorthand. That would look like Foo::(bar). 
+ pm_arguments_t arguments = { 0 }; + parse_arguments_list(parser, &arguments, true, false, (uint16_t) (depth + 1)); + + return UP(pm_call_node_shorthand_create(parser, node, &delimiter, &arguments)); + } + default: { + expect1(parser, PM_TOKEN_CONSTANT, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT); + return UP(pm_constant_path_node_create(parser, node, &delimiter, &parser->previous)); + } + } + } + case PM_TOKEN_KEYWORD_RESCUE_MODIFIER: { + context_push(parser, PM_CONTEXT_RESCUE_MODIFIER); + parser_lex(parser); + accept1(parser, PM_TOKEN_NEWLINE); + + pm_node_t *value = parse_expression(parser, binding_power, true, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); + context_pop(parser); + + return UP(pm_rescue_modifier_node_create(parser, node, &token, value)); + } + case PM_TOKEN_BRACKET_LEFT: { + parser_lex(parser); + + pm_arguments_t arguments = { 0 }; + arguments.opening_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); + + if (!accept1(parser, PM_TOKEN_BRACKET_RIGHT)) { + pm_accepts_block_stack_push(parser, true); + parse_arguments(parser, &arguments, false, PM_TOKEN_BRACKET_RIGHT, (uint16_t) (depth + 1)); + pm_accepts_block_stack_pop(parser); + expect1(parser, PM_TOKEN_BRACKET_RIGHT, PM_ERR_EXPECT_RBRACKET); + } + + arguments.closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); + + // If we have a comma after the closing bracket then this is a multiple + // assignment and we should parse the targets. + if (previous_binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { + pm_call_node_t *aref = pm_call_node_aref_create(parser, node, &arguments); + return parse_targets_validate(parser, UP(aref), PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + } + + // If we're at the end of the arguments, we can now check if there is a + // block node that starts with a {. If there is, then we can parse it and + // add it to the arguments. 
+ pm_block_node_t *block = NULL; + if (accept1(parser, PM_TOKEN_BRACE_LEFT)) { + block = parse_block(parser, (uint16_t) (depth + 1)); + pm_arguments_validate_block(parser, &arguments, block); + } else if (pm_accepts_block_stack_p(parser) && accept1(parser, PM_TOKEN_KEYWORD_DO)) { + block = parse_block(parser, (uint16_t) (depth + 1)); + } + + if (block != NULL) { + if (arguments.block != NULL) { + pm_parser_err_node(parser, UP(block), PM_ERR_ARGUMENT_AFTER_BLOCK); + if (arguments.arguments == NULL) { + arguments.arguments = pm_arguments_node_create(parser); + } + pm_arguments_node_arguments_append(arguments.arguments, arguments.block); + } + + arguments.block = UP(block); + } + + return UP(pm_call_node_aref_create(parser, node, &arguments)); + } + case PM_TOKEN_KEYWORD_IN: { + bool previous_pattern_matching_newlines = parser->pattern_matching_newlines; + parser->pattern_matching_newlines = true; + + pm_token_t operator = parser->current; + parser->command_start = false; + lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); + parser_lex(parser); + + pm_constant_id_list_t captures = { 0 }; + pm_node_t *pattern = parse_pattern(parser, &captures, PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_IN, (uint16_t) (depth + 1)); + + parser->pattern_matching_newlines = previous_pattern_matching_newlines; + pm_constant_id_list_free(&captures); + + return UP(pm_match_predicate_node_create(parser, node, pattern, &operator)); + } + case PM_TOKEN_EQUAL_GREATER: { + bool previous_pattern_matching_newlines = parser->pattern_matching_newlines; + parser->pattern_matching_newlines = true; + + pm_token_t operator = parser->current; + parser->command_start = false; + lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); + parser_lex(parser); + + pm_constant_id_list_t captures = { 0 }; + pm_node_t *pattern = parse_pattern(parser, &captures, PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_HROCKET, (uint16_t) 
(depth + 1)); + + parser->pattern_matching_newlines = previous_pattern_matching_newlines; + pm_constant_id_list_free(&captures); + + return UP(pm_match_required_node_create(parser, node, pattern, &operator)); + } + default: + assert(false && "unreachable"); + return NULL; + } +} + +#undef PM_PARSE_PATTERN_SINGLE +#undef PM_PARSE_PATTERN_TOP +#undef PM_PARSE_PATTERN_MULTI + +/** + * Determine if a given call node looks like a "command", which means it has + * arguments but does not have parentheses. + */ +static inline bool +pm_call_node_command_p(const pm_call_node_t *node) { + return ( + (node->opening_loc.start == NULL) && + (node->block == NULL || PM_NODE_TYPE_P(node->block, PM_BLOCK_ARGUMENT_NODE)) && + (node->arguments != NULL || node->block != NULL) + ); +} + +/** + * Parse an expression at the given point of the parser using the given binding + * power to parse subsequent chains. If this function finds a syntax error, it + * will append the error message to the parser's error list. + * + * Consumers of this function should always check parser->recovering to + * determine if they need to perform additional cleanup. + */ +static pm_node_t * +parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, bool accepts_label, pm_diagnostic_id_t diag_id, uint16_t depth) { + if (PRISM_UNLIKELY(depth >= PRISM_DEPTH_MAXIMUM)) { + pm_parser_err_current(parser, PM_ERR_NESTING_TOO_DEEP); + return UP(pm_missing_node_create(parser, parser->current.start, parser->current.end)); + } + + pm_node_t *node = parse_expression_prefix(parser, binding_power, accepts_command_call, accepts_label, diag_id, depth); + + switch (PM_NODE_TYPE(node)) { + case PM_MISSING_NODE: + // If we found a syntax error, then the type of node returned by + // parse_expression_prefix is going to be a missing node. 
+ return node; + case PM_PRE_EXECUTION_NODE: + case PM_POST_EXECUTION_NODE: + case PM_ALIAS_GLOBAL_VARIABLE_NODE: + case PM_ALIAS_METHOD_NODE: + case PM_MULTI_WRITE_NODE: + case PM_UNDEF_NODE: + // These expressions are statements, and cannot be followed by + // operators (except modifiers). + if (pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER) { + return node; + } + break; + case PM_CALL_NODE: + // If we have a call node, then we need to check if it looks like a + // method call without parentheses that contains arguments. If it + // does, then it has different rules for parsing infix operators, + // namely that it only accepts composition (and/or) and modifiers + // (if/unless/etc.). + if ((pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_COMPOSITION) && pm_call_node_command_p((pm_call_node_t *) node)) { + return node; + } + break; + case PM_SYMBOL_NODE: + // If we have a symbol node that is being parsed as a label, then we + // need to immediately return, because there should never be an + // infix operator following this node. + if (pm_symbol_node_label_p(node)) { + return node; + } + break; + default: + break; + } + + // Otherwise we'll look and see if the next token can be parsed as an infix + // operator. If it can, then we'll parse it using parse_expression_infix. + pm_binding_powers_t current_binding_powers; + pm_token_type_t current_token_type; + + while ( + current_token_type = parser->current.type, + current_binding_powers = pm_binding_powers[current_token_type], + binding_power <= current_binding_powers.left && + current_binding_powers.binary + ) { + node = parse_expression_infix(parser, node, binding_power, current_binding_powers.right, accepts_command_call, (uint16_t) (depth + 1)); + + if (context_terminator(parser->current_context->context, &parser->current)) { + // If this token terminates the current context, then we need to + // stop parsing the expression, as it has become a statement. 
+ return node; + } + + switch (PM_NODE_TYPE(node)) { + case PM_MULTI_WRITE_NODE: + // Multi-write nodes are statements, and cannot be followed by + // operators except modifiers. + if (pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER) { + return node; + } + break; + case PM_CLASS_VARIABLE_WRITE_NODE: + case PM_CONSTANT_PATH_WRITE_NODE: + case PM_CONSTANT_WRITE_NODE: + case PM_GLOBAL_VARIABLE_WRITE_NODE: + case PM_INSTANCE_VARIABLE_WRITE_NODE: + case PM_LOCAL_VARIABLE_WRITE_NODE: + // These expressions are statements, by virtue of the right-hand + // side of their write being an implicit array. + if (PM_NODE_FLAG_P(node, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY) && pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER) { + return node; + } + break; + case PM_CALL_NODE: + // These expressions are also statements, by virtue of the + // right-hand side of the expression (i.e., the last argument to + // the call node) being an implicit array. + if (PM_NODE_FLAG_P(node, PM_CALL_NODE_FLAGS_IMPLICIT_ARRAY) && pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER) { + return node; + } + break; + default: + break; + } + + // If the operator is nonassoc and we should not be able to parse the + // upcoming infix operator, break. + if (current_binding_powers.nonassoc) { + // If this is a non-assoc operator and we are about to parse the + // exact same operator, then we need to add an error. + if (match1(parser, current_token_type)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_NON_ASSOCIATIVE_OPERATOR, pm_token_type_human(parser->current.type), pm_token_type_human(current_token_type)); + break; + } + + // If this is an endless range, then we need to reject a couple of + // additional operators because it violates the normal operator + // precedence rules. Those patterns are: + // + // 1.. & 2 + // 1.. 
* 2 + // + if (PM_NODE_TYPE_P(node, PM_RANGE_NODE) && ((pm_range_node_t *) node)->right == NULL) { + if (match4(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_DOT, PM_TOKEN_AMPERSAND_DOT)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_NON_ASSOCIATIVE_OPERATOR, pm_token_type_human(parser->current.type), pm_token_type_human(current_token_type)); + break; + } + + if (PM_BINDING_POWER_TERM <= pm_binding_powers[parser->current.type].left) { + break; + } + } else if (current_binding_powers.left <= pm_binding_powers[parser->current.type].left) { + break; + } + } + + if (accepts_command_call) { + // A command-style method call is only accepted on method chains. + // Thus, we check whether the parsed node can continue method chains. + // The method chain can continue if the parsed node is one of the following five kinds: + // (1) index access: foo[1] + // (2) attribute access: foo.bar + // (3) method call with parenthesis: foo.bar(1) + // (4) method call with a block: foo.bar do end + // (5) constant path: foo::Bar + switch (node->type) { + case PM_CALL_NODE: { + pm_call_node_t *cast = (pm_call_node_t *)node; + if ( + // (1) foo[1] + !( + cast->call_operator_loc.start == NULL && + cast->message_loc.start != NULL && + cast->message_loc.start[0] == '[' && + cast->message_loc.end[-1] == ']' + ) && + // (2) foo.bar + !( + cast->call_operator_loc.start != NULL && + cast->arguments == NULL && + cast->block == NULL && + cast->opening_loc.start == NULL + ) && + // (3) foo.bar(1) + !( + cast->call_operator_loc.start != NULL && + cast->opening_loc.start != NULL + ) && + // (4) foo.bar do end + !( + cast->block != NULL && PM_NODE_TYPE_P(cast->block, PM_BLOCK_NODE) + ) + ) { + accepts_command_call = false; + } + break; + } + // (5) foo::Bar + case PM_CONSTANT_PATH_NODE: + break; + default: + accepts_command_call = false; + break; + } + } + } + + return node; +} + +/** + * ruby -p, ruby -n, ruby -a, and ruby -l options will mutate the AST. 
We + * perform that mutation here. + */ +static pm_statements_node_t * +wrap_statements(pm_parser_t *parser, pm_statements_node_t *statements) { + if (PM_PARSER_COMMAND_LINE_OPTION_P(parser)) { + if (statements == NULL) { + statements = pm_statements_node_create(parser); + } + + pm_arguments_node_t *arguments = pm_arguments_node_create(parser); + pm_arguments_node_arguments_append( + arguments, + UP(pm_global_variable_read_node_synthesized_create(parser, pm_parser_constant_id_constant(parser, "$_", 2))) + ); + + pm_statements_node_body_append(parser, statements, UP(pm_call_node_fcall_synthesized_create( + parser, + arguments, + pm_parser_constant_id_constant(parser, "print", 5) + )), true); + } + + if (PM_PARSER_COMMAND_LINE_OPTION_N(parser)) { + if (PM_PARSER_COMMAND_LINE_OPTION_A(parser)) { + if (statements == NULL) { + statements = pm_statements_node_create(parser); + } + + pm_arguments_node_t *arguments = pm_arguments_node_create(parser); + pm_arguments_node_arguments_append( + arguments, + UP(pm_global_variable_read_node_synthesized_create(parser, pm_parser_constant_id_constant(parser, "$;", 2))) + ); + + pm_global_variable_read_node_t *receiver = pm_global_variable_read_node_synthesized_create(parser, pm_parser_constant_id_constant(parser, "$_", 2)); + pm_call_node_t *call = pm_call_node_call_synthesized_create(parser, UP(receiver), "split", arguments); + + pm_global_variable_write_node_t *write = pm_global_variable_write_node_synthesized_create( + parser, + pm_parser_constant_id_constant(parser, "$F", 2), + UP(call) + ); + + pm_statements_node_body_prepend(statements, UP(write)); + } + + pm_arguments_node_t *arguments = pm_arguments_node_create(parser); + pm_arguments_node_arguments_append( + arguments, + UP(pm_global_variable_read_node_synthesized_create(parser, pm_parser_constant_id_constant(parser, "$/", 2))) + ); + + if (PM_PARSER_COMMAND_LINE_OPTION_L(parser)) { + pm_keyword_hash_node_t *keywords = pm_keyword_hash_node_create(parser); + 
pm_keyword_hash_node_elements_append(keywords, UP(pm_assoc_node_create( + parser, + UP(pm_symbol_node_synthesized_create(parser, "chomp")), + &(pm_token_t) { .type = PM_TOKEN_NOT_PROVIDED, .start = parser->start, .end = parser->start }, + UP(pm_true_node_synthesized_create(parser)) + ))); + + pm_arguments_node_arguments_append(arguments, UP(keywords)); + pm_node_flag_set(UP(arguments), PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS); + } + + pm_statements_node_t *wrapped_statements = pm_statements_node_create(parser); + pm_statements_node_body_append(parser, wrapped_statements, UP(pm_while_node_synthesized_create( + parser, + UP(pm_call_node_fcall_synthesized_create(parser, arguments, pm_parser_constant_id_constant(parser, "gets", 4))), + statements + )), true); + + statements = wrapped_statements; + } + + return statements; +} + +/** + * Parse the top-level program node. + */ +static pm_node_t * +parse_program(pm_parser_t *parser) { + // If the current scope is NULL, then we want to push a new top level scope. + // The current scope could exist in the event that we are parsing an eval + // and the user has passed into scopes that already exist. + if (parser->current_scope == NULL) { + pm_parser_scope_push(parser, true); + } + + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + parser_lex(parser); + pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_MAIN, 0); + + if (statements != NULL && !parser->parsing_eval) { + // If we have statements, then the top-level statement should be + // explicitly checked as well. We have to do this here because + // everywhere else we check all but the last statement. 
+ assert(statements->body.size > 0); + pm_void_statement_check(parser, statements->body.nodes[statements->body.size - 1]); + } + + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, true); + pm_parser_scope_pop(parser); + + // At the top level, see if we need to wrap the statements in a program + // node with a while loop based on the options. + if (parser->command_line & (PM_OPTIONS_COMMAND_LINE_P | PM_OPTIONS_COMMAND_LINE_N)) { + statements = wrap_statements(parser, statements); + } else { + flush_block_exits(parser, previous_block_exits); + } + + pm_node_list_free(¤t_block_exits); + + // If this is an empty file, then we're still going to parse all of the + // statements in order to gather up all of the comments and such. Here we'll + // correct the location information. + if (statements == NULL) { + statements = pm_statements_node_create(parser); + pm_statements_node_location_set(statements, parser->start, parser->start); + } + + return UP(pm_program_node_create(parser, &locals, statements)); +} + +/******************************************************************************/ +/* External functions */ +/******************************************************************************/ + +/** + * A vendored version of strnstr that is used to find a substring within a + * string with a given length. This function is used to search for the Ruby + * engine name within a shebang when the -x option is passed to Ruby. + * + * The only modification that we made here is that we don't do NULL byte checks + * because we know the little parameter will not have a NULL byte and we allow + * the big parameter to have them. 
+ */ +static const char * +pm_strnstr(const char *big, const char *little, size_t big_length) { + size_t little_length = strlen(little); + + for (const char *max = big + big_length - little_length; big <= max; big++) { + if (*big == *little && memcmp(big, little, little_length) == 0) return big; + } + + return NULL; +} + +#ifdef _WIN32 +#define pm_parser_warn_shebang_carriage_return(parser, start, length) ((void) 0) +#else +/** + * Potentially warn the user if the shebang that has been found to include + * "ruby" has a carriage return at the end, as that can cause problems on some + * platforms. + */ +static void +pm_parser_warn_shebang_carriage_return(pm_parser_t *parser, const uint8_t *start, size_t length) { + if (length > 2 && start[length - 2] == '\r' && start[length - 1] == '\n') { + pm_parser_warn(parser, start, start + length, PM_WARN_SHEBANG_CARRIAGE_RETURN); + } +} +#endif + +/** + * Process the shebang when initializing the parser. This function assumes that + * the shebang_callback option has already been checked for nullability. + */ +static void +pm_parser_init_shebang(pm_parser_t *parser, const pm_options_t *options, const char *engine, size_t length) { + const char *switches = pm_strnstr(engine, " -", length); + if (switches == NULL) return; + + pm_options_t next_options = *options; + options->shebang_callback( + &next_options, + (const uint8_t *) (switches + 1), + length - ((size_t) (switches - engine)) - 1, + options->shebang_callback_data + ); + + size_t encoding_length; + if ((encoding_length = pm_string_length(&next_options.encoding)) > 0) { + const uint8_t *encoding_source = pm_string_source(&next_options.encoding); + parser_lex_magic_comment_encoding_value(parser, encoding_source, encoding_source + encoding_length); + } + + parser->command_line = next_options.command_line; + parser->frozen_string_literal = next_options.frozen_string_literal; +} + +/** + * Initialize a parser with the given start and end pointers. 
+ */ +PRISM_EXPORTED_FUNCTION void +pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm_options_t *options) { + assert(source != NULL); + + *parser = (pm_parser_t) { + .node_id = 0, + .lex_state = PM_LEX_STATE_BEG, + .enclosure_nesting = 0, + .lambda_enclosure_nesting = -1, + .brace_nesting = 0, + .do_loop_stack = 0, + .accepts_block_stack = 0, + .lex_modes = { + .index = 0, + .stack = {{ .mode = PM_LEX_DEFAULT }}, + .current = &parser->lex_modes.stack[0], + }, + .start = source, + .end = source + size, + .previous = { .type = PM_TOKEN_EOF, .start = source, .end = source }, + .current = { .type = PM_TOKEN_EOF, .start = source, .end = source }, + .next_start = NULL, + .heredoc_end = NULL, + .data_loc = { .start = NULL, .end = NULL }, + .comment_list = { 0 }, + .magic_comment_list = { 0 }, + .warning_list = { 0 }, + .error_list = { 0 }, + .current_scope = NULL, + .current_context = NULL, + .encoding = PM_ENCODING_UTF_8_ENTRY, + .encoding_changed_callback = NULL, + .encoding_comment_start = source, + .lex_callback = NULL, + .filepath = { 0 }, + .constant_pool = { 0 }, + .newline_list = { 0 }, + .integer_base = 0, + .current_string = PM_STRING_EMPTY, + .start_line = 1, + .explicit_encoding = NULL, + .command_line = 0, + .parsing_eval = false, + .partial_script = false, + .command_start = true, + .recovering = false, + .encoding_locked = false, + .encoding_changed = false, + .pattern_matching_newlines = false, + .in_keyword_arg = false, + .current_block_exits = NULL, + .semantic_token_seen = false, + .frozen_string_literal = PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET, + .current_regular_expression_ascii_only = false, + .warn_mismatched_indentation = true + }; + + // Initialize the constant pool. We're going to completely guess as to the + // number of constants that we'll need based on the size of the input. The + // ratio we chose here is actually less arbitrary than you might think. 
+ // + // We took ~50K Ruby files and measured the size of the file versus the + // number of constants that were found in those files. Then we found the + // average and standard deviation of the ratios of constants/bytesize. Then + // we added 1.34 standard deviations to the average to get a ratio that + // would fit 75% of the files (for a two-tailed distribution). This works + // because there was about a 0.77 correlation and the distribution was + // roughly normal. + // + // This ratio will need to change if we add more constants to the constant + // pool for another node type. + uint32_t constant_size = ((uint32_t) size) / 95; + pm_constant_pool_init(&parser->constant_pool, constant_size < 4 ? 4 : constant_size); + + // Initialize the newline list. Similar to the constant pool, we're going to + // guess at the number of newlines that we'll need based on the size of the + // input. + size_t newline_size = size / 22; + pm_newline_list_init(&parser->newline_list, source, newline_size < 4 ? 4 : newline_size); + + // If options were provided to this parse, establish them here. 
+ if (options != NULL) { + // filepath option + parser->filepath = options->filepath; + + // line option + parser->start_line = options->line; + + // encoding option + size_t encoding_length = pm_string_length(&options->encoding); + if (encoding_length > 0) { + const uint8_t *encoding_source = pm_string_source(&options->encoding); + parser_lex_magic_comment_encoding_value(parser, encoding_source, encoding_source + encoding_length); + } + + // encoding_locked option + parser->encoding_locked = options->encoding_locked; + + // frozen_string_literal option + parser->frozen_string_literal = options->frozen_string_literal; + + // command_line option + parser->command_line = options->command_line; + + // version option + parser->version = options->version; + + // partial_script + parser->partial_script = options->partial_script; + + // scopes option + parser->parsing_eval = options->scopes_count > 0; + if (parser->parsing_eval) parser->warn_mismatched_indentation = false; + + for (size_t scope_index = 0; scope_index < options->scopes_count; scope_index++) { + const pm_options_scope_t *scope = pm_options_scope_get(options, scope_index); + pm_parser_scope_push(parser, scope_index == 0); + + // Scopes given from the outside are not allowed to have numbered + // parameters. 
+ parser->current_scope->parameters = ((pm_scope_parameters_t) scope->forwarding) | PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED; + + for (size_t local_index = 0; local_index < scope->locals_count; local_index++) { + const pm_string_t *local = pm_options_scope_local_get(scope, local_index); + + const uint8_t *source = pm_string_source(local); + size_t length = pm_string_length(local); + + void *allocated = xmalloc(length); + if (allocated == NULL) continue; + + memcpy(allocated, source, length); + pm_parser_local_add_owned(parser, (uint8_t *) allocated, length); + } + } + } + + // Now that we have established the user-provided options, check if + // a version was given and parse as the latest version otherwise. + if (parser->version == PM_OPTIONS_VERSION_UNSET) { + parser->version = PM_OPTIONS_VERSION_LATEST; + } + + pm_accepts_block_stack_push(parser, true); + + // Skip past the UTF-8 BOM if it exists. + if (size >= 3 && source[0] == 0xef && source[1] == 0xbb && source[2] == 0xbf) { + parser->current.end += 3; + parser->encoding_comment_start += 3; + + if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) { + parser->encoding = PM_ENCODING_UTF_8_ENTRY; + if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); + } + } + + // If the -x command line flag is set, or the first shebang of the file does + // not include "ruby", then we'll search for a shebang that does include + // "ruby" and start parsing from there. + bool search_shebang = PM_PARSER_COMMAND_LINE_OPTION_X(parser); + + // If the first two bytes of the source are a shebang, then we will do a bit + // of extra processing. + // + // First, we'll indicate that the encoding comment is at the end of the + // shebang. This means that when a shebang is present the encoding comment + // can begin on the second line. + // + // Second, we will check if the shebang includes "ruby". If it does, then we + // we will start parsing from there. 
We will also potentially warning the + // user if there is a carriage return at the end of the shebang. We will + // also potentially call the shebang callback if this is the main script to + // allow the caller to parse the shebang and find any command-line options. + // If the shebang does not include "ruby" and this is the main script being + // parsed, then we will start searching the file for a shebang that does + // contain "ruby" as if -x were passed on the command line. + const uint8_t *newline = next_newline(parser->current.end, parser->end - parser->current.end); + size_t length = (size_t) ((newline != NULL ? newline : parser->end) - parser->current.end); + + if (length > 2 && parser->current.end[0] == '#' && parser->current.end[1] == '!') { + const char *engine; + + if ((engine = pm_strnstr((const char *) parser->start, "ruby", length)) != NULL) { + if (newline != NULL) { + parser->encoding_comment_start = newline + 1; + + if (options == NULL || options->main_script) { + pm_parser_warn_shebang_carriage_return(parser, parser->start, length + 1); + } + } + + if (options != NULL && options->main_script && options->shebang_callback != NULL) { + pm_parser_init_shebang(parser, options, engine, length - ((size_t) (engine - (const char *) parser->start))); + } + + search_shebang = false; + } else if (options != NULL && options->main_script && !parser->parsing_eval) { + search_shebang = true; + } + } + + // Here we're going to find the first shebang that includes "ruby" and start + // parsing from there. + if (search_shebang) { + // If a shebang that includes "ruby" is not found, then we're going to a + // a load error to the list of errors on the parser. + bool found_shebang = false; + + // This is going to point to the start of each line as we check it. + // We'll maintain a moving window looking at each line at they come. + const uint8_t *cursor = parser->start; + + // The newline pointer points to the end of the current line that we're + // considering. 
If it is NULL, then we're at the end of the file. + const uint8_t *newline = next_newline(cursor, parser->end - cursor); + + while (newline != NULL) { + pm_newline_list_append(&parser->newline_list, newline); + + cursor = newline + 1; + newline = next_newline(cursor, parser->end - cursor); + + size_t length = (size_t) ((newline != NULL ? newline : parser->end) - cursor); + if (length > 2 && cursor[0] == '#' && cursor[1] == '!') { + const char *engine; + if ((engine = pm_strnstr((const char *) cursor, "ruby", length)) != NULL) { + found_shebang = true; + + if (newline != NULL) { + pm_parser_warn_shebang_carriage_return(parser, cursor, length + 1); + parser->encoding_comment_start = newline + 1; + } + + if (options != NULL && options->shebang_callback != NULL) { + pm_parser_init_shebang(parser, options, engine, length - ((size_t) (engine - (const char *) cursor))); + } + + break; + } + } + } + + if (found_shebang) { + parser->previous = (pm_token_t) { .type = PM_TOKEN_EOF, .start = cursor, .end = cursor }; + parser->current = (pm_token_t) { .type = PM_TOKEN_EOF, .start = cursor, .end = cursor }; + } else { + pm_parser_err(parser, parser->start, parser->start, PM_ERR_SCRIPT_NOT_FOUND); + pm_newline_list_clear(&parser->newline_list); + } + } + + // The encoding comment can start after any amount of inline whitespace, so + // here we'll advance it to the first non-inline-whitespace character so + // that it is ready for future comparisons. + parser->encoding_comment_start += pm_strspn_inline_whitespace(parser->encoding_comment_start, parser->end - parser->encoding_comment_start); +} + +/** + * Register a callback that will be called whenever prism changes the encoding + * it is using to parse based on the magic comment. 
+ */ +PRISM_EXPORTED_FUNCTION void +pm_parser_register_encoding_changed_callback(pm_parser_t *parser, pm_encoding_changed_callback_t callback) { + parser->encoding_changed_callback = callback; +} + +/** + * Free all of the memory associated with the comment list. + */ +static inline void +pm_comment_list_free(pm_list_t *list) { + pm_list_node_t *node, *next; + + for (node = list->head; node != NULL; node = next) { + next = node->next; + + pm_comment_t *comment = (pm_comment_t *) node; + xfree(comment); + } +} + +/** + * Free all of the memory associated with the magic comment list. + */ +static inline void +pm_magic_comment_list_free(pm_list_t *list) { + pm_list_node_t *node, *next; + + for (node = list->head; node != NULL; node = next) { + next = node->next; + + pm_magic_comment_t *magic_comment = (pm_magic_comment_t *) node; + xfree(magic_comment); + } +} + +/** + * Free any memory associated with the given parser. + */ +PRISM_EXPORTED_FUNCTION void +pm_parser_free(pm_parser_t *parser) { + pm_string_free(&parser->filepath); + pm_diagnostic_list_free(&parser->error_list); + pm_diagnostic_list_free(&parser->warning_list); + pm_comment_list_free(&parser->comment_list); + pm_magic_comment_list_free(&parser->magic_comment_list); + pm_constant_pool_free(&parser->constant_pool); + pm_newline_list_free(&parser->newline_list); + + while (parser->current_scope != NULL) { + // Normally, popping the scope doesn't free the locals since it is + // assumed that ownership has transferred to the AST. However if we have + // scopes while we're freeing the parser, it's likely they came from + // eval scopes and we need to free them explicitly here. + pm_parser_scope_pop(parser); + } + + while (parser->lex_modes.index >= PM_LEX_STACK_SIZE) { + lex_mode_pop(parser); + } +} + +/** + * Parse the Ruby source associated with the given parser and return the tree. 
+ */ +PRISM_EXPORTED_FUNCTION pm_node_t * +pm_parse(pm_parser_t *parser) { + return parse_program(parser); +} + +/** + * Read into the stream until the gets callback returns false. If the last read + * line from the stream matches an __END__ marker, then halt and return false, + * otherwise return true. + */ +static bool +pm_parse_stream_read(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, pm_parse_stream_feof_t *stream_feof) { +#define LINE_SIZE 4096 + char line[LINE_SIZE]; + + while (memset(line, '\n', LINE_SIZE), stream_fgets(line, LINE_SIZE, stream) != NULL) { + size_t length = LINE_SIZE; + while (length > 0 && line[length - 1] == '\n') length--; + + if (length == LINE_SIZE) { + // If we read a line that is the maximum size and it doesn't end + // with a newline, then we'll just append it to the buffer and + // continue reading. + length--; + pm_buffer_append_string(buffer, line, length); + continue; + } + + // Append the line to the buffer. + length--; + pm_buffer_append_string(buffer, line, length); + + // Check if the line matches the __END__ marker. If it does, then stop + // reading and return false. In most circumstances, this means we should + // stop reading from the stream so that the DATA constant can pick it + // up. + switch (length) { + case 7: + if (strncmp(line, "__END__", 7) == 0) return false; + break; + case 8: + if (strncmp(line, "__END__\n", 8) == 0) return false; + break; + case 9: + if (strncmp(line, "__END__\r\n", 9) == 0) return false; + break; + } + + // All data should be read via gets. If the string returned by gets + // _doesn't_ end with a newline, then we assume we hit EOF condition. + if (stream_feof(stream)) { + break; + } + } + + return true; +#undef LINE_SIZE +} + +/** + * Determine if there was an unterminated heredoc at the end of the input, which + * would mean the stream isn't finished and we should keep reading. 
+ * + * For the other lex modes we can check if the lex mode has been closed, but for + * heredocs when we hit EOF we close the lex mode and then go back to parse the + * rest of the line after the heredoc declaration so that we get more of the + * syntax tree. + */ +static bool +pm_parse_stream_unterminated_heredoc_p(pm_parser_t *parser) { + pm_diagnostic_t *diagnostic = (pm_diagnostic_t *) parser->error_list.head; + + for (; diagnostic != NULL; diagnostic = (pm_diagnostic_t *) diagnostic->node.next) { + if (diagnostic->diag_id == PM_ERR_HEREDOC_TERM) { + return true; + } + } + + return false; +} + +/** + * Parse a stream of Ruby source and return the tree. + * + * Prism is designed around having the entire source in memory at once, but you + * can stream stdin in to Ruby so we need to support a streaming API. + */ +PRISM_EXPORTED_FUNCTION pm_node_t * +pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, pm_parse_stream_feof_t *stream_feof, const pm_options_t *options) { + pm_buffer_init(buffer); + + bool eof = pm_parse_stream_read(buffer, stream, stream_fgets, stream_feof); + + pm_parser_init(parser, (const uint8_t *) pm_buffer_value(buffer), pm_buffer_length(buffer), options); + pm_node_t *node = pm_parse(parser); + + while (!eof && parser->error_list.size > 0 && (parser->lex_modes.index > 0 || pm_parse_stream_unterminated_heredoc_p(parser))) { + pm_node_destroy(parser, node); + eof = pm_parse_stream_read(buffer, stream, stream_fgets, stream_feof); + + pm_parser_free(parser); + pm_parser_init(parser, (const uint8_t *) pm_buffer_value(buffer), pm_buffer_length(buffer), options); + node = pm_parse(parser); + } + + return node; +} + +/** + * Parse the source and return true if it parses without errors or warnings. 
+ */ +PRISM_EXPORTED_FUNCTION bool +pm_parse_success_p(const uint8_t *source, size_t size, const char *data) { + pm_options_t options = { 0 }; + pm_options_read(&options, data); + + pm_parser_t parser; + pm_parser_init(&parser, source, size, &options); + + pm_node_t *node = pm_parse(&parser); + pm_node_destroy(&parser, node); + + bool result = parser.error_list.size == 0; + pm_parser_free(&parser); + pm_options_free(&options); + + return result; +} + +#undef PM_CASE_KEYWORD +#undef PM_CASE_OPERATOR +#undef PM_CASE_WRITABLE +#undef PM_STRING_EMPTY + +// We optionally support serializing to a binary string. For systems that don't +// want or need this functionality, it can be turned off with the +// PRISM_EXCLUDE_SERIALIZATION define. +#ifndef PRISM_EXCLUDE_SERIALIZATION + +static inline void +pm_serialize_header(pm_buffer_t *buffer) { + pm_buffer_append_string(buffer, "PRISM", 5); + pm_buffer_append_byte(buffer, PRISM_VERSION_MAJOR); + pm_buffer_append_byte(buffer, PRISM_VERSION_MINOR); + pm_buffer_append_byte(buffer, PRISM_VERSION_PATCH); + pm_buffer_append_byte(buffer, PRISM_SERIALIZE_ONLY_SEMANTICS_FIELDS ? 1 : 0); +} + +/** + * Serialize the AST represented by the given node to the given buffer. + */ +PRISM_EXPORTED_FUNCTION void +pm_serialize(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { + pm_serialize_header(buffer); + pm_serialize_content(parser, node, buffer); + pm_buffer_append_byte(buffer, '\0'); +} + +/** + * Parse and serialize the AST represented by the given source to the given + * buffer. 
+ */ +PRISM_EXPORTED_FUNCTION void +pm_serialize_parse(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data) { + pm_options_t options = { 0 }; + pm_options_read(&options, data); + + pm_parser_t parser; + pm_parser_init(&parser, source, size, &options); + + pm_node_t *node = pm_parse(&parser); + + pm_serialize_header(buffer); + pm_serialize_content(&parser, node, buffer); + pm_buffer_append_byte(buffer, '\0'); + + pm_node_destroy(&parser, node); + pm_parser_free(&parser); + pm_options_free(&options); +} + +/** + * Parse and serialize the AST represented by the source that is read out of the + * given stream into to the given buffer. + */ +PRISM_EXPORTED_FUNCTION void +pm_serialize_parse_stream(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, pm_parse_stream_feof_t *stream_feof, const char *data) { + pm_parser_t parser; + pm_options_t options = { 0 }; + pm_options_read(&options, data); + + pm_buffer_t parser_buffer; + pm_node_t *node = pm_parse_stream(&parser, &parser_buffer, stream, stream_fgets, stream_feof, &options); + pm_serialize_header(buffer); + pm_serialize_content(&parser, node, buffer); + pm_buffer_append_byte(buffer, '\0'); + + pm_node_destroy(&parser, node); + pm_buffer_free(&parser_buffer); + pm_parser_free(&parser); + pm_options_free(&options); +} + +/** + * Parse and serialize the comments in the given source to the given buffer. 
+ */ +PRISM_EXPORTED_FUNCTION void +pm_serialize_parse_comments(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data) { + pm_options_t options = { 0 }; + pm_options_read(&options, data); + + pm_parser_t parser; + pm_parser_init(&parser, source, size, &options); + + pm_node_t *node = pm_parse(&parser); + pm_serialize_header(buffer); + pm_serialize_encoding(parser.encoding, buffer); + pm_buffer_append_varsint(buffer, parser.start_line); + pm_serialize_comment_list(&parser, &parser.comment_list, buffer); + + pm_node_destroy(&parser, node); + pm_parser_free(&parser); + pm_options_free(&options); +} + +#endif + +/******************************************************************************/ +/* Slice queries for the Ruby API */ +/******************************************************************************/ + +/** The category of slice returned from pm_slice_type. */ +typedef enum { + /** Returned when the given encoding name is invalid. */ + PM_SLICE_TYPE_ERROR = -1, + + /** Returned when no other types apply to the slice. */ + PM_SLICE_TYPE_NONE, + + /** Returned when the slice is a valid local variable name. */ + PM_SLICE_TYPE_LOCAL, + + /** Returned when the slice is a valid constant name. */ + PM_SLICE_TYPE_CONSTANT, + + /** Returned when the slice is a valid method name. */ + PM_SLICE_TYPE_METHOD_NAME +} pm_slice_type_t; + +/** + * Check that the slice is a valid local variable name or constant. 
+ */ +pm_slice_type_t +pm_slice_type(const uint8_t *source, size_t length, const char *encoding_name) { + // first, get the right encoding object + const pm_encoding_t *encoding = pm_encoding_find((const uint8_t *) encoding_name, (const uint8_t *) (encoding_name + strlen(encoding_name))); + if (encoding == NULL) return PM_SLICE_TYPE_ERROR; + + // check that there is at least one character + if (length == 0) return PM_SLICE_TYPE_NONE; + + size_t width; + if ((width = encoding->alpha_char(source, (ptrdiff_t) length)) != 0) { + // valid because alphabetical + } else if (*source == '_') { + // valid because underscore + width = 1; + } else if ((*source >= 0x80) && ((width = encoding->char_width(source, (ptrdiff_t) length)) > 0)) { + // valid because multibyte + } else { + // invalid because no match + return PM_SLICE_TYPE_NONE; + } + + // determine the type of the slice based on the first character + const uint8_t *end = source + length; + pm_slice_type_t result = encoding->isupper_char(source, end - source) ? PM_SLICE_TYPE_CONSTANT : PM_SLICE_TYPE_LOCAL; + + // next, iterate through all of the bytes of the string to ensure that they + // are all valid identifier characters + source += width; + + while (source < end) { + if ((width = encoding->alnum_char(source, end - source)) != 0) { + // valid because alphanumeric + source += width; + } else if (*source == '_') { + // valid because underscore + source++; + } else if ((*source >= 0x80) && ((width = encoding->char_width(source, end - source)) > 0)) { + // valid because multibyte + source += width; + } else { + // invalid because no match + break; + } + } + + // accept a ! or ? at the end of the slice as a method name + if (*source == '!' || *source == '?' || *source == '=') { + source++; + result = PM_SLICE_TYPE_METHOD_NAME; + } + + // valid if we are at the end of the slice + return source == end ? result : PM_SLICE_TYPE_NONE; +} + +/** + * Check that the slice is a valid local variable name. 
+ */ +PRISM_EXPORTED_FUNCTION pm_string_query_t +pm_string_query_local(const uint8_t *source, size_t length, const char *encoding_name) { + switch (pm_slice_type(source, length, encoding_name)) { + case PM_SLICE_TYPE_ERROR: + return PM_STRING_QUERY_ERROR; + case PM_SLICE_TYPE_NONE: + case PM_SLICE_TYPE_CONSTANT: + case PM_SLICE_TYPE_METHOD_NAME: + return PM_STRING_QUERY_FALSE; + case PM_SLICE_TYPE_LOCAL: + return PM_STRING_QUERY_TRUE; + } + + assert(false && "unreachable"); + return PM_STRING_QUERY_FALSE; +} + +/** + * Check that the slice is a valid constant name. + */ +PRISM_EXPORTED_FUNCTION pm_string_query_t +pm_string_query_constant(const uint8_t *source, size_t length, const char *encoding_name) { + switch (pm_slice_type(source, length, encoding_name)) { + case PM_SLICE_TYPE_ERROR: + return PM_STRING_QUERY_ERROR; + case PM_SLICE_TYPE_NONE: + case PM_SLICE_TYPE_LOCAL: + case PM_SLICE_TYPE_METHOD_NAME: + return PM_STRING_QUERY_FALSE; + case PM_SLICE_TYPE_CONSTANT: + return PM_STRING_QUERY_TRUE; + } + + assert(false && "unreachable"); + return PM_STRING_QUERY_FALSE; +} + +/** + * Check that the slice is a valid method name. + */ +PRISM_EXPORTED_FUNCTION pm_string_query_t +pm_string_query_method_name(const uint8_t *source, size_t length, const char *encoding_name) { +#define B(p) ((p) ? 
PM_STRING_QUERY_TRUE : PM_STRING_QUERY_FALSE) +#define C1(c) (*source == c) +#define C2(s) (memcmp(source, s, 2) == 0) +#define C3(s) (memcmp(source, s, 3) == 0) + + switch (pm_slice_type(source, length, encoding_name)) { + case PM_SLICE_TYPE_ERROR: + return PM_STRING_QUERY_ERROR; + case PM_SLICE_TYPE_NONE: + break; + case PM_SLICE_TYPE_LOCAL: + // numbered parameters are not valid method names + return B((length != 2) || (source[0] != '_') || (source[1] == '0') || !pm_char_is_decimal_digit(source[1])); + case PM_SLICE_TYPE_CONSTANT: + // all constants are valid method names + case PM_SLICE_TYPE_METHOD_NAME: + // all method names are valid method names + return PM_STRING_QUERY_TRUE; + } + + switch (length) { + case 1: + return B(C1('&') || C1('`') || C1('!') || C1('^') || C1('>') || C1('<') || C1('-') || C1('%') || C1('|') || C1('+') || C1('/') || C1('*') || C1('~')); + case 2: + return B(C2("!=") || C2("!~") || C2("[]") || C2("==") || C2("=~") || C2(">=") || C2(">>") || C2("<=") || C2("<<") || C2("**")); + case 3: + return B(C3("===") || C3("<=>") || C3("[]=")); + default: + return PM_STRING_QUERY_FALSE; + } + +#undef B +#undef C1 +#undef C2 +#undef C3 +} diff --git a/prism/prism.h b/prism/prism.h new file mode 100644 index 0000000000..c468db18be --- /dev/null +++ b/prism/prism.h @@ -0,0 +1,408 @@ +/** + * @file prism.h + * + * The main header file for the prism parser. 
+ */ +#ifndef PRISM_H +#define PRISM_H + +#include "prism/defines.h" +#include "prism/util/pm_buffer.h" +#include "prism/util/pm_char.h" +#include "prism/util/pm_integer.h" +#include "prism/util/pm_memchr.h" +#include "prism/util/pm_strncasecmp.h" +#include "prism/util/pm_strpbrk.h" +#include "prism/ast.h" +#include "prism/diagnostic.h" +#include "prism/node.h" +#include "prism/options.h" +#include "prism/pack.h" +#include "prism/parser.h" +#include "prism/prettyprint.h" +#include "prism/regexp.h" +#include "prism/static_literals.h" +#include "prism/version.h" + +#include <assert.h> +#include <errno.h> +#include <locale.h> +#include <math.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifndef _WIN32 +#include <strings.h> +#endif + +/** + * The prism version and the serialization format. + * + * @returns The prism version as a constant string. + */ +PRISM_EXPORTED_FUNCTION const char * pm_version(void); + +/** + * Initialize a parser with the given start and end pointers. + * + * The resulting parser must eventually be freed with `pm_parser_free()`. + * + * @param parser The parser to initialize. + * @param source The source to parse. + * @param size The size of the source. + * @param options The optional options to use when parsing. These options must + * live for the whole lifetime of this parser. + * + * \public \memberof pm_parser + */ +PRISM_EXPORTED_FUNCTION void pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm_options_t *options); + +/** + * Register a callback that will be called whenever prism changes the encoding + * it is using to parse based on the magic comment. + * + * @param parser The parser to register the callback with. + * @param callback The callback to register. 
+ * + * \public \memberof pm_parser + */ +PRISM_EXPORTED_FUNCTION void pm_parser_register_encoding_changed_callback(pm_parser_t *parser, pm_encoding_changed_callback_t callback); + +/** + * Free any memory associated with the given parser. + * + * This does not free the `pm_options_t` object that was used to initialize the + * parser. + * + * @param parser The parser to free. + * + * \public \memberof pm_parser + */ +PRISM_EXPORTED_FUNCTION void pm_parser_free(pm_parser_t *parser); + +/** + * Initiate the parser with the given parser. + * + * @param parser The parser to use. + * @return The AST representing the source. + * + * \public \memberof pm_parser + */ +PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse(pm_parser_t *parser); + +/** + * This function is used in pm_parse_stream() to retrieve a line of input from a + * stream. It closely mirrors that of fgets so that fgets can be used as the + * default implementation. + */ +typedef char * (pm_parse_stream_fgets_t)(char *string, int size, void *stream); + +/** + * This function is used in pm_parse_stream to check whether a stream is EOF. + * It closely mirrors that of feof so that feof can be used as the + * default implementation. + */ +typedef int (pm_parse_stream_feof_t)(void *stream); + +/** + * Parse a stream of Ruby source and return the tree. + * + * @param parser The parser to use. + * @param buffer The buffer to use. + * @param stream The stream to parse. + * @param stream_fgets The function to use to read from the stream. + * @param stream_feof The function to use to determine if the stream has hit eof. + * @param options The optional options to use when parsing. + * @return The AST representing the source. 
+ * + * \public \memberof pm_parser + */ +PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, pm_parse_stream_feof_t *stream_feof, const pm_options_t *options); + +// We optionally support serializing to a binary string. For systems that don't +// want or need this functionality, it can be turned off with the +// PRISM_EXCLUDE_SERIALIZATION define. +#ifndef PRISM_EXCLUDE_SERIALIZATION + +/** + * Parse and serialize the AST represented by the source that is read out of the + * given stream into to the given buffer. + * + * @param buffer The buffer to serialize to. + * @param stream The stream to parse. + * @param stream_fgets The function to use to read from the stream. + * @param stream_feof The function to use to tell if the stream has hit eof. + * @param data The optional data to pass to the parser. + */ +PRISM_EXPORTED_FUNCTION void pm_serialize_parse_stream(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, pm_parse_stream_feof_t *stream_feof, const char *data); + +/** + * Serialize the given list of comments to the given buffer. + * + * @param parser The parser to serialize. + * @param list The list of comments to serialize. + * @param buffer The buffer to serialize to. + */ +void pm_serialize_comment_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t *buffer); + +/** + * Serialize the name of the encoding to the buffer. + * + * @param encoding The encoding to serialize. + * @param buffer The buffer to serialize to. + */ +void pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer); + +/** + * Serialize the encoding, metadata, nodes, and constant pool. + * + * @param parser The parser to serialize. + * @param node The node to serialize. + * @param buffer The buffer to serialize to. 
+ */ +void pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer); + +/** + * Serialize the AST represented by the given node to the given buffer. + * + * @param parser The parser to serialize. + * @param node The node to serialize. + * @param buffer The buffer to serialize to. + */ +PRISM_EXPORTED_FUNCTION void pm_serialize(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer); + +/** + * Parse the given source to the AST and dump the AST to the given buffer. + * + * @param buffer The buffer to serialize to. + * @param source The source to parse. + * @param size The size of the source. + * @param data The optional data to pass to the parser. + */ +PRISM_EXPORTED_FUNCTION void pm_serialize_parse(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data); + +/** + * Parse and serialize the comments in the given source to the given buffer. + * + * @param buffer The buffer to serialize to. + * @param source The source to parse. + * @param size The size of the source. + * @param data The optional data to pass to the parser. + */ +PRISM_EXPORTED_FUNCTION void pm_serialize_parse_comments(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data); + +/** + * Lex the given source and serialize to the given buffer. + * + * @param source The source to lex. + * @param size The size of the source. + * @param buffer The buffer to serialize to. + * @param data The optional data to pass to the lexer. + */ +PRISM_EXPORTED_FUNCTION void pm_serialize_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data); + +/** + * Parse and serialize both the AST and the tokens represented by the given + * source to the given buffer. + * + * @param buffer The buffer to serialize to. + * @param source The source to parse. + * @param size The size of the source. + * @param data The optional data to pass to the parser. 
+ */ +PRISM_EXPORTED_FUNCTION void pm_serialize_parse_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data); + +#endif + +/** + * Parse the source and return true if it parses without errors or warnings. + * + * @param source The source to parse. + * @param size The size of the source. + * @param data The optional data to pass to the parser. + * @return True if the source parses without errors or warnings. + */ +PRISM_EXPORTED_FUNCTION bool pm_parse_success_p(const uint8_t *source, size_t size, const char *data); + +/** + * Returns a string representation of the given token type. + * + * @param token_type The token type to convert to a string. + * @return A string representation of the given token type. + */ +PRISM_EXPORTED_FUNCTION const char * pm_token_type_name(pm_token_type_t token_type); + +/** + * Returns the human name of the given token type. + * + * @param token_type The token type to convert to a human name. + * @return The human name of the given token type. + */ +const char * pm_token_type_human(pm_token_type_t token_type); + +// We optionally support dumping to JSON. For systems that don't want or need +// this functionality, it can be turned off with the PRISM_EXCLUDE_JSON define. +#ifndef PRISM_EXCLUDE_JSON + +/** + * Dump JSON to the given buffer. + * + * @param buffer The buffer to serialize to. + * @param parser The parser that parsed the node. + * @param node The node to serialize. + */ +PRISM_EXPORTED_FUNCTION void pm_dump_json(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_node_t *node); + +#endif + +/** + * Represents the results of a slice query. + */ +typedef enum { + /** Returned if the encoding given to a slice query was invalid. */ + PM_STRING_QUERY_ERROR = -1, + + /** Returned if the result of the slice query is false. */ + PM_STRING_QUERY_FALSE, + + /** Returned if the result of the slice query is true. 
*/ + PM_STRING_QUERY_TRUE +} pm_string_query_t; + +/** + * Check that the slice is a valid local variable name. + * + * @param source The source to check. + * @param length The length of the source. + * @param encoding_name The name of the encoding of the source. + * @return PM_STRING_QUERY_TRUE if the query is true, PM_STRING_QUERY_FALSE if + * the query is false, and PM_STRING_QUERY_ERROR if the encoding was invalid. + */ +PRISM_EXPORTED_FUNCTION pm_string_query_t pm_string_query_local(const uint8_t *source, size_t length, const char *encoding_name); + +/** + * Check that the slice is a valid constant name. + * + * @param source The source to check. + * @param length The length of the source. + * @param encoding_name The name of the encoding of the source. + * @return PM_STRING_QUERY_TRUE if the query is true, PM_STRING_QUERY_FALSE if + * the query is false, and PM_STRING_QUERY_ERROR if the encoding was invalid. + */ +PRISM_EXPORTED_FUNCTION pm_string_query_t pm_string_query_constant(const uint8_t *source, size_t length, const char *encoding_name); + +/** + * Check that the slice is a valid method name. + * + * @param source The source to check. + * @param length The length of the source. + * @param encoding_name The name of the encoding of the source. + * @return PM_STRING_QUERY_TRUE if the query is true, PM_STRING_QUERY_FALSE if + * the query is false, and PM_STRING_QUERY_ERROR if the encoding was invalid. + */ +PRISM_EXPORTED_FUNCTION pm_string_query_t pm_string_query_method_name(const uint8_t *source, size_t length, const char *encoding_name); + +/** + * @mainpage + * + * Prism is a parser for the Ruby programming language. It is designed to be + * portable, error tolerant, and maintainable. It is written in C99 and has no + * dependencies. 
It is currently being integrated into + * [CRuby](https://github.com/ruby/ruby), + * [JRuby](https://github.com/jruby/jruby), + * [TruffleRuby](https://github.com/truffleruby/truffleruby), + * [Sorbet](https://github.com/sorbet/sorbet), and + * [Syntax Tree](https://github.com/ruby-syntax-tree/syntax_tree). + * + * @section getting-started Getting started + * + * If you're vendoring this project and compiling it statically then as long as + * you have a C99 compiler you will be fine. If you're linking against it as + * shared library, then you should compile with `-fvisibility=hidden` and + * `-DPRISM_EXPORT_SYMBOLS` to tell prism to make only its public interface + * visible. + * + * @section parsing Parsing + * + * In order to parse Ruby code, the structures and functions that you're going + * to want to use and be aware of are: + * + * * `pm_parser_t` - the main parser structure + * * `pm_parser_init()` - initialize a parser + * * `pm_parse()` - parse and return the root node + * * `pm_node_destroy()` - deallocate the root node returned by `pm_parse()` + * * `pm_parser_free()` - free the internal memory of the parser + * + * Putting all of this together would look something like: + * + * ```c + * void parse(const uint8_t *source, size_t length) { + * pm_parser_t parser; + * pm_parser_init(&parser, source, length, NULL); + * + * pm_node_t *root = pm_parse(&parser); + * printf("PARSED!\n"); + * + * pm_node_destroy(&parser, root); + * pm_parser_free(&parser); + * } + * ``` + * + * All of the nodes "inherit" from `pm_node_t` by embedding those structures + * as their first member. This means you can downcast and upcast any node in the + * tree to a `pm_node_t`. + * + * @section serializing Serializing + * + * Prism provides the ability to serialize the AST and its related metadata into + * a binary format. This format is designed to be portable to different + * languages and runtimes so that you only need to make one FFI call in order to + * parse Ruby code. 
The structures and functions that you're going to want to + * use and be aware of are: + * + * * `pm_buffer_t` - a small buffer object that will hold the serialized AST + * * `pm_buffer_free()` - free the memory associated with the buffer + * * `pm_serialize()` - serialize the AST into a buffer + * * `pm_serialize_parse()` - parse and serialize the AST into a buffer + * + * Putting all of this together would look something like: + * + * ```c + * void serialize(const uint8_t *source, size_t length) { + * pm_buffer_t buffer = { 0 }; + * + * pm_serialize_parse(&buffer, source, length, NULL); + * printf("SERIALIZED!\n"); + * + * pm_buffer_free(&buffer); + * } + * ``` + * + * @section inspecting Inspecting + * + * Prism provides the ability to inspect the AST by pretty-printing nodes. You + * can do this with the `pm_prettyprint()` function, which you would use like: + * + * ```c + * void prettyprint(const uint8_t *source, size_t length) { + * pm_parser_t parser; + * pm_parser_init(&parser, source, length, NULL); + * + * pm_node_t *root = pm_parse(&parser); + * pm_buffer_t buffer = { 0 }; + * + * pm_prettyprint(&buffer, &parser, root); + * printf("%.*s\n", (int) buffer.length, buffer.value); + * + * pm_buffer_free(&buffer); + * pm_node_destroy(&parser, root); + * pm_parser_free(&parser); + * } + * ``` + */ + +#endif diff --git a/prism/regexp.c b/prism/regexp.c new file mode 100644 index 0000000000..dcc7476244 --- /dev/null +++ b/prism/regexp.c @@ -0,0 +1,790 @@ +#include "prism/regexp.h" + +#define PM_REGEXP_PARSE_DEPTH_MAX 4096 + +/** + * This is the parser that is going to handle parsing regular expressions. + */ +typedef struct { + /** The parser that is currently being used. */ + pm_parser_t *parser; + + /** A pointer to the start of the source that we are parsing. */ + const uint8_t *start; + + /** A pointer to the current position in the source. */ + const uint8_t *cursor; + + /** A pointer to the end of the source that we are parsing. 
*/ + const uint8_t *end; + + /** + * Whether or not the regular expression currently being parsed is in + * extended mode, wherein whitespace is ignored and comments are allowed. + */ + bool extended_mode; + + /** Whether the encoding has changed from the default. */ + bool encoding_changed; + + /** The encoding of the source. */ + const pm_encoding_t *encoding; + + /** The callback to call when a named capture group is found. */ + pm_regexp_name_callback_t name_callback; + + /** The data to pass to the name callback. */ + void *name_data; + + /** The callback to call when a parse error is found. */ + pm_regexp_error_callback_t error_callback; + + /** The data to pass to the error callback. */ + void *error_data; +} pm_regexp_parser_t; + +/** + * Append an error to the parser. + */ +static inline void +pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) { + parser->error_callback(start, end, message, parser->error_data); +} + +/** + * This appends a new string to the list of named captures. This function + * assumes the caller has already checked the validity of the name callback. + */ +static void +pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) { + pm_string_t string; + pm_string_shared_init(&string, start, end); + parser->name_callback(&string, parser->name_data); + pm_string_free(&string); +} + +/** + * Returns true if the next character is the end of the source. + */ +static inline bool +pm_regexp_char_is_eof(pm_regexp_parser_t *parser) { + return parser->cursor >= parser->end; +} + +/** + * Optionally accept a char and consume it if it exists. + */ +static inline bool +pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) { + if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) { + parser->cursor++; + return true; + } + return false; +} + +/** + * Expect a character to be present and consume it. 
+ */ +static inline bool +pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) { + if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) { + parser->cursor++; + return true; + } + return false; +} + +/** + * This advances the current token to the next instance of the given character. + */ +static bool +pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) { + if (pm_regexp_char_is_eof(parser)) { + return false; + } + + const uint8_t *end = (const uint8_t *) pm_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding); + if (end == NULL) { + return false; + } + + parser->cursor = end + 1; + return true; +} + +/** + * Range quantifiers are a special class of quantifiers that look like + * + * * {digit} + * * {digit,} + * * {digit,digit} + * * {,digit} + * + * Unfortunately, if there are any spaces in between, then this just becomes a + * regular character match expression and we have to backtrack. So when this + * function first starts running, we'll create a "save" point and then attempt + * to parse the quantifier. If it fails, we'll restore the save point and + * return. + * + * To properly track everything, we're going to build a little state machine. + * It looks something like the following: + * + * +-------+ +---------+ ------------+ + * ---- lbrace ---> | start | ---- digit ---> | minimum | | + * +-------+ +---------+ <--- digit -+ + * | | | + * +-------+ | | rbrace + * | comma | <----- comma +---- comma -------+ | + * +-------+ V V + * | +---------+ +---------+ + * +-- digit --> | maximum | -- rbrace --> || final || + * +---------+ +---------+ + * | ^ + * +- digit -+ + * + * Note that by the time we've hit this function, the lbrace has already been + * consumed so we're in the start state.
+ */ +static bool +pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) { + const uint8_t *savepoint = parser->cursor; + + enum { + PM_REGEXP_RANGE_QUANTIFIER_STATE_START, + PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM, + PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM, + PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA + } state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START; + + while (1) { + if (parser->cursor >= parser->end) { + parser->cursor = savepoint; + return true; + } + + switch (state) { + case PM_REGEXP_RANGE_QUANTIFIER_STATE_START: + switch (*parser->cursor) { + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + parser->cursor++; + state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM; + break; + case ',': + parser->cursor++; + state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA; + break; + default: + parser->cursor = savepoint; + return true; + } + break; + case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM: + switch (*parser->cursor) { + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + parser->cursor++; + break; + case ',': + parser->cursor++; + state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM; + break; + case '}': + parser->cursor++; + return true; + default: + parser->cursor = savepoint; + return true; + } + break; + case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA: + switch (*parser->cursor) { + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + parser->cursor++; + state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM; + break; + default: + parser->cursor = savepoint; + return true; + } + break; + case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM: + switch (*parser->cursor) { + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + parser->cursor++; + break; + case '}': + parser->cursor++; + return true; + default: + parser->cursor = savepoint; + return true; + } + break; + } + } + + return 
true; +} + +/** + * quantifier : star-quantifier + * | plus-quantifier + * | optional-quantifier + * | range-quantifier + * | <empty> + * ; + */ +static bool +pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) { + while (!pm_regexp_char_is_eof(parser)) { + switch (*parser->cursor) { + case '*': + case '+': + case '?': + parser->cursor++; + break; + case '{': + parser->cursor++; + if (!pm_regexp_parse_range_quantifier(parser)) return false; + break; + default: + // In this case there is no quantifier. + return true; + } + } + + return true; +} + +/** + * match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']' + * ; + */ +static bool +pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) { + if (!pm_regexp_char_expect(parser, ':')) { + return false; + } + + pm_regexp_char_accept(parser, '^'); + + return ( + pm_regexp_char_find(parser, ':') && + pm_regexp_char_expect(parser, ']') && + pm_regexp_char_expect(parser, ']') + ); +} + +// Forward declaration because character sets can be nested. +static bool +pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth); + +/** + * match-char-set : '[' '^'? (match-range | match-char)* ']' + * ; + */ +static bool +pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) { + pm_regexp_char_accept(parser, '^'); + + while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') { + switch (*parser->cursor++) { + case '[': + pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1)); + break; + case '\\': + if (!pm_regexp_char_is_eof(parser)) { + parser->cursor++; + } + break; + default: + // do nothing, we've already advanced the cursor + break; + } + } + + return pm_regexp_char_expect(parser, ']'); +} + +/** + * A left bracket can either mean a POSIX class or a character set. 
+ */ +static bool +pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth) { + if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) { + pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over"); + return false; + } + + if ((parser->cursor < parser->end) && parser->cursor[0] == ']') { + parser->cursor++; + pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "empty char-class"); + return true; + } + + const uint8_t *reset = parser->cursor; + + if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') { + parser->cursor++; + if (pm_regexp_parse_posix_class(parser)) return true; + + parser->cursor = reset; + } + + return pm_regexp_parse_character_set(parser, depth); +} + +// Forward declaration here since parsing groups needs to go back up the grammar +// to parse expressions within them. +static bool +pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth); + +/** + * These are the states of the options that are configurable on the regular + * expression (or from within a group). + */ +typedef enum { + PM_REGEXP_OPTION_STATE_INVALID, + PM_REGEXP_OPTION_STATE_TOGGLEABLE, + PM_REGEXP_OPTION_STATE_ADDABLE, + PM_REGEXP_OPTION_STATE_ADDED, + PM_REGEXP_OPTION_STATE_REMOVED +} pm_regexp_option_state_t; + +// These are the options that are configurable on the regular expression (or +// from within a group). + +#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a' +#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x' +#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1) + +/** + * This is the set of options that are configurable on the regular expression. + */ +typedef struct { + /** The current state of each option. */ + uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS]; +} pm_regexp_options_t; + +/** + * Initialize a new set of options to their default values. 
+ */ +static void +pm_regexp_options_init(pm_regexp_options_t *options) { + memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS); + options->values['i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE; + options->values['m' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE; + options->values['x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE; + options->values['d' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE; + options->values['a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE; + options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE; +} + +/** + * Attempt to add the given option to the set of options. Returns true if the + * option is now (or already was) in the added state. Returns false if the key + * is not a configurable option or if the option has already been removed. + */ +static bool +pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) { + if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) { + key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM); + + switch (options->values[key]) { + case PM_REGEXP_OPTION_STATE_INVALID: + case PM_REGEXP_OPTION_STATE_REMOVED: + return false; + case PM_REGEXP_OPTION_STATE_TOGGLEABLE: + case PM_REGEXP_OPTION_STATE_ADDABLE: + options->values[key] = PM_REGEXP_OPTION_STATE_ADDED; + return true; + case PM_REGEXP_OPTION_STATE_ADDED: + return true; + } + } + + return false; +} + +/** + * Attempt to remove the given option from the set of options. Returns true if + * the option is now (or already was) in the removed state. Returns false if + * the key is not a configurable option or if it is an add-only option that has + * not been added.
+ */ +static bool +pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) { + if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) { + key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM); + + switch (options->values[key]) { + case PM_REGEXP_OPTION_STATE_INVALID: + case PM_REGEXP_OPTION_STATE_ADDABLE: + return false; + case PM_REGEXP_OPTION_STATE_TOGGLEABLE: + case PM_REGEXP_OPTION_STATE_ADDED: + case PM_REGEXP_OPTION_STATE_REMOVED: + options->values[key] = PM_REGEXP_OPTION_STATE_REMOVED; + return true; + } + } + + return false; +} + +/** + * Returns the current state of the given option as one of the + * pm_regexp_option_state_t values. If the key falls outside of the range of + * option slots, PM_REGEXP_OPTION_STATE_INVALID is returned. + */ +static uint8_t +pm_regexp_options_state(pm_regexp_options_t *options, uint8_t key) { + if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) { + key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM); + return options->values[key]; + } + + return PM_REGEXP_OPTION_STATE_INVALID; +} + +/** + * Groups can have quite a few different patterns for syntax. They basically + * just wrap a set of expressions, but they can potentially have options after a + * question mark. If there _isn't_ a question mark, then it's just a set of + * expressions. If there _is_, then here are the options: + * + * * (?#...)
- inline comments + * * (?:subexp) - non-capturing group + * * (?=subexp) - positive lookahead + * * (?!subexp) - negative lookahead + * * (?>subexp) - atomic group + * * (?~subexp) - absence operator + * * (?<=subexp) - positive lookbehind + * * (?<!subexp) - negative lookbehind + * * (?<name>subexp) - named capturing group + * * (?'name'subexp) - named capturing group + * * (?(cond)yes-subexp) - conditional expression + * * (?(cond)yes-subexp|no-subexp) - conditional expression + * * (?imxdau-imx) - turn on and off configuration + * * (?imxdau-imx:subexp) - turn on and off configuration for an expression + */ +static bool +pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) { + const uint8_t *group_start = parser->cursor; + + pm_regexp_options_t options; + pm_regexp_options_init(&options); + + // First, parse any options for the group. + if (pm_regexp_char_accept(parser, '?')) { + if (pm_regexp_char_is_eof(parser)) { + pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group"); + return false; + } + + switch (*parser->cursor) { + case '#': { // inline comments + parser->cursor++; + if (pm_regexp_char_is_eof(parser)) { + pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group"); + return false; + } + + if (parser->encoding_changed && parser->encoding->multibyte) { + bool escaped = false; + + // Here we're going to take a slow path and iterate through + // each multibyte character to find the close paren. We do + // this because \ can be a trailing byte in some encodings. 
+ while (parser->cursor < parser->end) { + if (!escaped && *parser->cursor == ')') { + parser->cursor++; + return true; + } + + size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor)); + if (width == 0) return false; + + escaped = (width == 1) && (*parser->cursor == '\\'); + parser->cursor += width; + } + + return false; + } else { + // Here we can take the fast path and use memchr to find the + // next ) because we are safe checking backward for \ since + // it cannot be a trailing character. + bool found = pm_regexp_char_find(parser, ')'); + + while (found && (parser->start <= parser->cursor - 2) && (*(parser->cursor - 2) == '\\')) { + found = pm_regexp_char_find(parser, ')'); + } + + return found; + } + } + case ':': // non-capturing group + case '=': // positive lookahead + case '!': // negative lookahead + case '>': // atomic group + case '~': // absence operator + parser->cursor++; + break; + case '<': + parser->cursor++; + if (pm_regexp_char_is_eof(parser)) { + pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis"); + return false; + } + + switch (*parser->cursor) { + case '=': // positive lookbehind + case '!': // negative lookbehind + parser->cursor++; + break; + default: { // named capture group + const uint8_t *start = parser->cursor; + if (!pm_regexp_char_find(parser, '>')) { + return false; + } + + if (parser->cursor - start == 1) { + pm_regexp_parse_error(parser, start, parser->cursor, "group name is empty"); + } + + if (parser->name_callback != NULL) { + pm_regexp_parser_named_capture(parser, start, parser->cursor - 1); + } + + break; + } + } + break; + case '\'': { // named capture group + const uint8_t *start = ++parser->cursor; + if (!pm_regexp_char_find(parser, '\'')) { + return false; + } + + if (parser->name_callback != NULL) { + pm_regexp_parser_named_capture(parser, start, parser->cursor - 1); + } + + break; + } + case '(': // conditional expression 
+ if (!pm_regexp_char_find(parser, ')')) { + return false; + } + break; + case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options + while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') { + if (!pm_regexp_options_add(&options, *parser->cursor)) { + return false; + } + parser->cursor++; + } + + if (pm_regexp_char_is_eof(parser)) { + return false; + } + + // If we are at the end of the group of options and there is no + // subexpression, then we are going to be setting the options + // for the parent group. In this case we are safe to return now. + if (*parser->cursor == ')') { + if (pm_regexp_options_state(&options, 'x') == PM_REGEXP_OPTION_STATE_ADDED) { + parser->extended_mode = true; + } + + parser->cursor++; + return true; + } + + // If we hit a -, then we're done parsing options. + if (*parser->cursor != '-') break; + + PRISM_FALLTHROUGH + case '-': + parser->cursor++; + while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') { + if (!pm_regexp_options_remove(&options, *parser->cursor)) { + return false; + } + parser->cursor++; + } + + if (pm_regexp_char_is_eof(parser)) { + return false; + } + + // If we are at the end of the group of options and there is no + // subexpression, then we are going to be setting the options + // for the parent group. In this case we are safe to return now. 
+ if (*parser->cursor == ')') { + switch (pm_regexp_options_state(&options, 'x')) { + case PM_REGEXP_OPTION_STATE_ADDED: + parser->extended_mode = true; + break; + case PM_REGEXP_OPTION_STATE_REMOVED: + parser->extended_mode = false; + break; + } + + parser->cursor++; + return true; + } + + break; + default: + parser->cursor++; + pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "undefined group option"); + break; + } + } + + bool extended_mode = parser->extended_mode; + switch (pm_regexp_options_state(&options, 'x')) { + case PM_REGEXP_OPTION_STATE_ADDED: + parser->extended_mode = true; + break; + case PM_REGEXP_OPTION_STATE_REMOVED: + parser->extended_mode = false; + break; + } + + // Now, parse the expressions within this group. + while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') { + if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) { + parser->extended_mode = extended_mode; + return false; + } + pm_regexp_char_accept(parser, '|'); + } + + // Finally, make sure we have a closing parenthesis. 
+ parser->extended_mode = extended_mode; + if (pm_regexp_char_expect(parser, ')')) return true; + + pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis"); + return false; +} + +/** + * item : anchor + * | match-posix-class + * | match-char-set + * | match-char-class + * | match-char-prop + * | match-char + * | match-any + * | group + * | quantified + * ; + */ +static bool +pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) { + switch (*parser->cursor) { + case '^': + case '$': + parser->cursor++; + return pm_regexp_parse_quantifier(parser); + case '\\': + parser->cursor++; + if (!pm_regexp_char_is_eof(parser)) { + parser->cursor++; + } + return pm_regexp_parse_quantifier(parser); + case '(': + parser->cursor++; + return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser); + case '[': + parser->cursor++; + return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser); + case '*': + case '?': + case '+': + parser->cursor++; + pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "target of repeat operator is not specified"); + return true; + case ')': + parser->cursor++; + pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis"); + return true; + case '#': + if (parser->extended_mode) { + if (!pm_regexp_char_find(parser, '\n')) parser->cursor = parser->end; + return true; + } + PRISM_FALLTHROUGH + default: { + size_t width; + if (!parser->encoding_changed) { + width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor)); + } else { + width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor)); + } + + if (width == 0) return false; // TODO: add appropriate error + parser->cursor += width; + + return pm_regexp_parse_quantifier(parser); + } + } +} + +/** + * expression : item+ + * ; + */ +static bool 
+pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth) { + if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) { + pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over"); + return false; + } + + if (!pm_regexp_parse_item(parser, depth)) { + return false; + } + + while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') { + if (!pm_regexp_parse_item(parser, depth)) { + return false; + } + } + + return true; +} + +/** + * pattern : EOF + * | expression EOF + * | expression '|' pattern + * ; + */ +static bool +pm_regexp_parse_pattern(pm_regexp_parser_t *parser) { + do { + if (pm_regexp_char_is_eof(parser)) return true; + if (!pm_regexp_parse_expression(parser, 0)) return false; + } while (pm_regexp_char_accept(parser, '|')); + + return pm_regexp_char_is_eof(parser); +} + +/** + * Parse a regular expression and extract the names of all of the named capture + * groups. + */ +PRISM_EXPORTED_FUNCTION void +pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) { + pm_regexp_parse_pattern(&(pm_regexp_parser_t) { + .parser = parser, + .start = source, + .cursor = source, + .end = source + size, + .extended_mode = extended_mode, + .encoding_changed = parser->encoding_changed, + .encoding = parser->encoding, + .name_callback = name_callback, + .name_data = name_data, + .error_callback = error_callback, + .error_data = error_data + }); +} diff --git a/prism/regexp.h b/prism/regexp.h new file mode 100644 index 0000000000..5366b5a5a0 --- /dev/null +++ b/prism/regexp.h @@ -0,0 +1,43 @@ +/** + * @file regexp.h + * + * A regular expression parser. 
+ */ +#ifndef PRISM_REGEXP_H +#define PRISM_REGEXP_H + +#include "prism/defines.h" +#include "prism/parser.h" +#include "prism/encoding.h" +#include "prism/util/pm_memchr.h" +#include "prism/util/pm_string.h" + +#include <stdbool.h> +#include <stddef.h> +#include <string.h> + +/** + * This callback is called by pm_regexp_parse() when a named capture group is found. + */ +typedef void (*pm_regexp_name_callback_t)(const pm_string_t *name, void *data); + +/** + * This callback is called by pm_regexp_parse() when a parse error is found. + */ +typedef void (*pm_regexp_error_callback_t)(const uint8_t *start, const uint8_t *end, const char *message, void *data); + +/** + * Parse a regular expression. + * + * @param parser The parser that is currently being used. + * @param source The source code to parse. + * @param size The size of the source code. + * @param extended_mode Whether to parse the regular expression in extended mode. + * @param name_callback The optional callback to call when a named capture group is found. + * @param name_data The optional data to pass to the name callback. + * @param error_callback The callback to call when a parse error is found. + * @param error_data The data to pass to the error callback. 
+ */ +PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data); + +#endif diff --git a/prism/srcs.mk b/prism/srcs.mk new file mode 100644 index 0000000000..022662a00b --- /dev/null +++ b/prism/srcs.mk @@ -0,0 +1,150 @@ +PRISM_TEMPLATES_DIR = $(PRISM_SRCDIR)/templates +PRISM_TEMPLATE = $(PRISM_TEMPLATES_DIR)/template.rb +PRISM_CONFIG = $(PRISM_SRCDIR)/config.yml + +srcs uncommon.mk: prism/.srcs.mk.time + +prism/.srcs.mk.time: $(order_only) $(PRISM_BUILD_DIR)/.time +prism/$(HAVE_BASERUBY:no=.srcs.mk.time): + touch $@ +prism/$(HAVE_BASERUBY:yes=.srcs.mk.time): \ + $(PRISM_SRCDIR)/templates/template.rb \ + $(PRISM_SRCDIR)/srcs.mk.in + $(BASERUBY) $(tooldir)/generic_erb.rb -c -t$@ -o $(PRISM_SRCDIR)/srcs.mk $(PRISM_SRCDIR)/srcs.mk.in + +distclean-prism-srcs:: + $(RM) prism/.srcs.mk.time + $(RMDIRS) prism || $(NULLCMD) + +distclean-srcs-local:: distclean-prism-srcs + +realclean-prism-srcs:: distclean-prism-srcs + $(RM) $(PRISM_SRCDIR)/srcs.mk + +realclean-srcs-local:: realclean-prism-srcs + +main srcs: $(srcdir)/prism/api_node.c +$(srcdir)/prism/api_node.c: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/ext/prism/api_node.c.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) ext/prism/api_node.c $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/api_node.c + +main incs: $(srcdir)/prism/ast.h +$(srcdir)/prism/ast.h: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/include/prism/ast.h.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) include/prism/ast.h $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/ast.h + +main incs: $(srcdir)/prism/diagnostic.h +$(srcdir)/prism/diagnostic.h: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/include/prism/diagnostic.h.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) include/prism/diagnostic.h $@ + +realclean-prism-srcs:: + $(RM) 
$(srcdir)/prism/diagnostic.h + +main srcs: $(srcdir)/lib/prism/compiler.rb +$(srcdir)/lib/prism/compiler.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/compiler.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/compiler.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/compiler.rb + +main srcs: $(srcdir)/lib/prism/dispatcher.rb +$(srcdir)/lib/prism/dispatcher.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/dispatcher.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/dispatcher.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/dispatcher.rb + +main srcs: $(srcdir)/lib/prism/dot_visitor.rb +$(srcdir)/lib/prism/dot_visitor.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/dot_visitor.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/dot_visitor.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/dot_visitor.rb + +main srcs: $(srcdir)/lib/prism/dsl.rb +$(srcdir)/lib/prism/dsl.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/dsl.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/dsl.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/dsl.rb + +main srcs: $(srcdir)/lib/prism/inspect_visitor.rb +$(srcdir)/lib/prism/inspect_visitor.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/inspect_visitor.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/inspect_visitor.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/inspect_visitor.rb + +main srcs: $(srcdir)/lib/prism/mutation_compiler.rb +$(srcdir)/lib/prism/mutation_compiler.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/mutation_compiler.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/mutation_compiler.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/mutation_compiler.rb + +main srcs: $(srcdir)/lib/prism/node.rb +$(srcdir)/lib/prism/node.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) 
$(PRISM_TEMPLATES_DIR)/lib/prism/node.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/node.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/node.rb + +main srcs: $(srcdir)/lib/prism/reflection.rb +$(srcdir)/lib/prism/reflection.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/reflection.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/reflection.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/reflection.rb + +main srcs: $(srcdir)/lib/prism/serialize.rb +$(srcdir)/lib/prism/serialize.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/serialize.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/serialize.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/serialize.rb + +main srcs: $(srcdir)/lib/prism/visitor.rb +$(srcdir)/lib/prism/visitor.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/visitor.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/visitor.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/visitor.rb + +main srcs: $(srcdir)/prism/diagnostic.c +$(srcdir)/prism/diagnostic.c: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/src/diagnostic.c.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) src/diagnostic.c $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/diagnostic.c + +main srcs: $(srcdir)/prism/node.c +$(srcdir)/prism/node.c: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/src/node.c.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) src/node.c $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/node.c + +main srcs: $(srcdir)/prism/prettyprint.c +$(srcdir)/prism/prettyprint.c: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/src/prettyprint.c.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) src/prettyprint.c $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/prettyprint.c + +main srcs: $(srcdir)/prism/serialize.c +$(srcdir)/prism/serialize.c: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/src/serialize.c.erb + $(Q) 
$(BASERUBY) $(PRISM_TEMPLATE) src/serialize.c $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/serialize.c + +main srcs: $(srcdir)/prism/token_type.c +$(srcdir)/prism/token_type.c: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/src/token_type.c.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) src/token_type.c $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/token_type.c diff --git a/prism/srcs.mk.in b/prism/srcs.mk.in new file mode 100644 index 0000000000..cc263fd1b4 --- /dev/null +++ b/prism/srcs.mk.in @@ -0,0 +1,48 @@ +<% # -*- ruby -*- +require_relative 'templates/template' + +script = File.basename(__FILE__) +srcs = output ? File.basename(output) : script.chomp('.in') +mk = 'uncommon.mk' + +# %> +PRISM_TEMPLATES_DIR = $(PRISM_SRCDIR)/templates +PRISM_TEMPLATE = $(PRISM_TEMPLATES_DIR)/template.rb +PRISM_CONFIG = $(PRISM_SRCDIR)/config.yml + +srcs <%=%><%=mk%>: prism/.srcs.mk.time + +prism/.srcs.mk.time: $(order_only) $(PRISM_BUILD_DIR)/.time +prism/$(HAVE_BASERUBY:no=.srcs.mk.time): + touch $@ +prism/$(HAVE_BASERUBY:yes=.srcs.mk.time): \ + $(PRISM_SRCDIR)/templates/template.rb \ + $(PRISM_SRCDIR)/<%=%><%=script%> + $(BASERUBY) $(tooldir)/generic_erb.rb -c -t$@ -o $(PRISM_SRCDIR)/<%=%><%=srcs%> $(PRISM_SRCDIR)/<%=%><%=script%> + +distclean-prism-srcs:: + $(RM) prism/.srcs.mk.time + $(RMDIRS) prism || $(NULLCMD) + +distclean-srcs-local:: distclean-prism-srcs + +realclean-prism-srcs:: distclean-prism-srcs + $(RM) $(PRISM_SRCDIR)/<%=%><%=srcs%> + +realclean-srcs-local:: realclean-prism-srcs +<% Prism::Template::TEMPLATES.map do |t| + /\.(?:[ch]|rb)\z/ =~ t or next + s = '$(srcdir)/' + t.sub(%r[\A(?:(src)|ext|include)/]) {$1 && 'prism/'} + s.sub!(%r[\A\$(srcdir)/prism/], '$(PRISM_SRCDIR)/') + target = s.end_with?('.h') ? 
'incs' : 'srcs' +# %> + +main <%=%><%=target%>: <%=%><%=s%> +<%=%><%=s%>: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/<%=%><%=t%>.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) <%=%><%=t%> $@ + +realclean-prism-srcs:: + $(RM) <%=%><%=s%> +<% +end +# %> diff --git a/prism/static_literals.c b/prism/static_literals.c new file mode 100644 index 0000000000..9fa37b999a --- /dev/null +++ b/prism/static_literals.c @@ -0,0 +1,617 @@ +#include "prism/static_literals.h" + +/** + * A small struct used for passing around a subset of the information that is + * stored on the parser. We use this to avoid having static literals explicitly + * depend on the parser struct. + */ +typedef struct { + /** The list of newline offsets to use to calculate line numbers. */ + const pm_newline_list_t *newline_list; + + /** The line number that the parser starts on. */ + int32_t start_line; + + /** The name of the encoding that the parser is using. */ + const char *encoding_name; +} pm_static_literals_metadata_t; + +static inline uint32_t +murmur_scramble(uint32_t value) { + value *= 0xcc9e2d51; + value = (value << 15) | (value >> 17); + value *= 0x1b873593; + return value; +} + +/** + * Murmur hash (https://en.wikipedia.org/wiki/MurmurHash) is a non-cryptographic + * general-purpose hash function. It is fast, which is what we care about in + * this case. 
+ */ +static uint32_t +murmur_hash(const uint8_t *key, size_t length) { + uint32_t hash = 0x9747b28c; + uint32_t segment; + + for (size_t index = length >> 2; index; index--) { + memcpy(&segment, key, sizeof(uint32_t)); + key += sizeof(uint32_t); + hash ^= murmur_scramble(segment); + hash = (hash << 13) | (hash >> 19); + hash = hash * 5 + 0xe6546b64; + } + + segment = 0; + for (size_t index = length & 3; index; index--) { + segment <<= 8; + segment |= key[index - 1]; + } + + hash ^= murmur_scramble(segment); + hash ^= (uint32_t) length; + hash ^= hash >> 16; + hash *= 0x85ebca6b; + hash ^= hash >> 13; + hash *= 0xc2b2ae35; + hash ^= hash >> 16; + return hash; +} + +/** + * Hash the value of an integer and return it. + */ +static uint32_t +integer_hash(const pm_integer_t *integer) { + uint32_t hash; + if (integer->values) { + hash = murmur_hash((const uint8_t *) integer->values, sizeof(uint32_t) * integer->length); + } else { + hash = murmur_hash((const uint8_t *) &integer->value, sizeof(uint32_t)); + } + + if (integer->negative) { + hash ^= murmur_scramble((uint32_t) 1); + } + + return hash; +} + +/** + * Return the hash of the given node. It is important that nodes that have + * equivalent static literal values have the same hash. This is because we use + * these hashes to look for duplicates. + */ +static uint32_t +node_hash(const pm_static_literals_metadata_t *metadata, const pm_node_t *node) { + switch (PM_NODE_TYPE(node)) { + case PM_INTEGER_NODE: { + // Integers hash their value. + const pm_integer_node_t *cast = (const pm_integer_node_t *) node; + return integer_hash(&cast->value); + } + case PM_SOURCE_LINE_NODE: { + // Source lines hash their line number. + const pm_line_column_t line_column = pm_newline_list_line_column(metadata->newline_list, node->location.start, metadata->start_line); + const int32_t *value = &line_column.line; + return murmur_hash((const uint8_t *) value, sizeof(int32_t)); + } + case PM_FLOAT_NODE: { + // Floats hash their value. 
+ const double *value = &((const pm_float_node_t *) node)->value; + return murmur_hash((const uint8_t *) value, sizeof(double)); + } + case PM_RATIONAL_NODE: { + // Rationals hash their numerator and denominator. + const pm_rational_node_t *cast = (const pm_rational_node_t *) node; + return integer_hash(&cast->numerator) ^ integer_hash(&cast->denominator) ^ murmur_scramble((uint32_t) cast->base.type); + } + case PM_IMAGINARY_NODE: { + // Imaginaries hash their numeric value. Because their numeric value + // is stored as a subnode, we hash that node and then mix in the + // fact that this is an imaginary node. + const pm_node_t *numeric = ((const pm_imaginary_node_t *) node)->numeric; + return node_hash(metadata, numeric) ^ murmur_scramble((uint32_t) node->type); + } + case PM_STRING_NODE: { + // Strings hash their value and mix in their flags so that different + // encodings are not considered equal. + const pm_string_t *value = &((const pm_string_node_t *) node)->unescaped; + + pm_node_flags_t flags = node->flags; + flags &= (PM_STRING_FLAGS_FORCED_BINARY_ENCODING | PM_STRING_FLAGS_FORCED_UTF8_ENCODING); + + return murmur_hash(pm_string_source(value), pm_string_length(value) * sizeof(uint8_t)) ^ murmur_scramble((uint32_t) flags); + } + case PM_SOURCE_FILE_NODE: { + // Source files hash their value and mix in their flags so that + // different encodings are not considered equal. + const pm_string_t *value = &((const pm_source_file_node_t *) node)->filepath; + return murmur_hash(pm_string_source(value), pm_string_length(value) * sizeof(uint8_t)); + } + case PM_REGULAR_EXPRESSION_NODE: { + // Regular expressions hash their value and mix in their flags so + // that different encodings are not considered equal. 
+ const pm_string_t *value = &((const pm_regular_expression_node_t *) node)->unescaped; + return murmur_hash(pm_string_source(value), pm_string_length(value) * sizeof(uint8_t)) ^ murmur_scramble((uint32_t) node->flags); + } + case PM_SYMBOL_NODE: { + // Symbols hash their value and mix in their flags so that different + // encodings are not considered equal. + const pm_string_t *value = &((const pm_symbol_node_t *) node)->unescaped; + return murmur_hash(pm_string_source(value), pm_string_length(value) * sizeof(uint8_t)) ^ murmur_scramble((uint32_t) node->flags); + } + default: + assert(false && "unreachable"); + return 0; + } +} + +/** + * Insert a node into the node hash. It accepts the hash that should hold the + * new node, the parser that generated the node, the node to insert, and a + * comparison function. The comparison function is used for collision detection, + * and must be able to compare all node types that will be stored in this hash. + */ +static pm_node_t * +pm_node_hash_insert(pm_node_hash_t *hash, const pm_static_literals_metadata_t *metadata, pm_node_t *node, bool replace, int (*compare)(const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right)) { + // If we are out of space, we need to resize the hash. This will cause all + // of the nodes to be rehashed and reinserted into the new hash. + if (hash->size * 2 >= hash->capacity) { + // First, allocate space for the new node list. + uint32_t new_capacity = hash->capacity == 0 ? 4 : hash->capacity * 2; + pm_node_t **new_nodes = xcalloc(new_capacity, sizeof(pm_node_t *)); + if (new_nodes == NULL) return NULL; + + // It turns out to be more efficient to mask the hash value than to use + // the modulo operator. Because our capacities are always powers of two, + // we can use a bitwise AND to get the same result as the modulo + // operator. + uint32_t mask = new_capacity - 1; + + // Now, rehash all of the nodes into the new list. 
+ for (uint32_t index = 0; index < hash->capacity; index++) { + pm_node_t *node = hash->nodes[index]; + + if (node != NULL) { + uint32_t index = node_hash(metadata, node) & mask; + new_nodes[index] = node; + } + } + + // Finally, free the old node list and update the hash. + xfree(hash->nodes); + hash->nodes = new_nodes; + hash->capacity = new_capacity; + } + + // Now, insert the node into the hash. + uint32_t mask = hash->capacity - 1; + uint32_t index = node_hash(metadata, node) & mask; + + // We use linear probing to resolve collisions. This means that if the + // current index is occupied, we will move to the next index and try again. + // We are guaranteed that this will eventually find an empty slot because we + // resize the hash when it gets too full. + while (hash->nodes[index] != NULL) { + if (compare(metadata, hash->nodes[index], node) == 0) break; + index = (index + 1) & mask; + } + + // If the current index is occupied, we need to return the node that was + // already in the hash. Otherwise, we can just increment the size and insert + // the new node. + pm_node_t *result = hash->nodes[index]; + + if (result == NULL) { + hash->size++; + hash->nodes[index] = node; + } else if (replace) { + hash->nodes[index] = node; + } + + return result; +} + +/** + * Free the internal memory associated with the given node hash. + */ +static void +pm_node_hash_free(pm_node_hash_t *hash) { + if (hash->capacity > 0) xfree(hash->nodes); +} + +/** + * Compare two values that can be compared with a simple numeric comparison. + */ +#define PM_NUMERIC_COMPARISON(left, right) ((left < right) ? -1 : (left > right) ? 1 : 0) + +/** + * Return the integer value of the given node as an int64_t. + */ +static int64_t +pm_int64_value(const pm_static_literals_metadata_t *metadata, const pm_node_t *node) { + switch (PM_NODE_TYPE(node)) { + case PM_INTEGER_NODE: { + const pm_integer_t *integer = &((const pm_integer_node_t *) node)->value; + if (integer->values) return integer->negative ? 
INT64_MIN : INT64_MAX; + + int64_t value = (int64_t) integer->value; + return integer->negative ? -value : value; + } + case PM_SOURCE_LINE_NODE: + return (int64_t) pm_newline_list_line_column(metadata->newline_list, node->location.start, metadata->start_line).line; + default: + assert(false && "unreachable"); + return 0; + } +} + +/** + * A comparison function for comparing two IntegerNode or SourceLineNode + * instances. + */ +static int +pm_compare_integer_nodes(const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right) { + if (PM_NODE_TYPE_P(left, PM_SOURCE_LINE_NODE) || PM_NODE_TYPE_P(right, PM_SOURCE_LINE_NODE)) { + int64_t left_value = pm_int64_value(metadata, left); + int64_t right_value = pm_int64_value(metadata, right); + return PM_NUMERIC_COMPARISON(left_value, right_value); + } + + const pm_integer_t *left_integer = &((const pm_integer_node_t *) left)->value; + const pm_integer_t *right_integer = &((const pm_integer_node_t *) right)->value; + return pm_integer_compare(left_integer, right_integer); +} + +/** + * A comparison function for comparing two FloatNode instances. + */ +static int +pm_compare_float_nodes(PRISM_ATTRIBUTE_UNUSED const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right) { + const double left_value = ((const pm_float_node_t *) left)->value; + const double right_value = ((const pm_float_node_t *) right)->value; + return PM_NUMERIC_COMPARISON(left_value, right_value); +} + +/** + * A comparison function for comparing two nodes that have attached numbers. 
+ */ +static int +pm_compare_number_nodes(const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right) { + if (PM_NODE_TYPE(left) != PM_NODE_TYPE(right)) { + return PM_NUMERIC_COMPARISON(PM_NODE_TYPE(left), PM_NODE_TYPE(right)); + } + + switch (PM_NODE_TYPE(left)) { + case PM_IMAGINARY_NODE: + return pm_compare_number_nodes(metadata, ((const pm_imaginary_node_t *) left)->numeric, ((const pm_imaginary_node_t *) right)->numeric); + case PM_RATIONAL_NODE: { + const pm_rational_node_t *left_rational = (const pm_rational_node_t *) left; + const pm_rational_node_t *right_rational = (const pm_rational_node_t *) right; + + int result = pm_integer_compare(&left_rational->denominator, &right_rational->denominator); + if (result != 0) return result; + + return pm_integer_compare(&left_rational->numerator, &right_rational->numerator); + } + case PM_INTEGER_NODE: + return pm_compare_integer_nodes(metadata, left, right); + case PM_FLOAT_NODE: + return pm_compare_float_nodes(metadata, left, right); + default: + assert(false && "unreachable"); + return 0; + } +} + +/** + * Return a pointer to the string value of the given node. + */ +static const pm_string_t * +pm_string_value(const pm_node_t *node) { + switch (PM_NODE_TYPE(node)) { + case PM_STRING_NODE: + return &((const pm_string_node_t *) node)->unescaped; + case PM_SOURCE_FILE_NODE: + return &((const pm_source_file_node_t *) node)->filepath; + case PM_SYMBOL_NODE: + return &((const pm_symbol_node_t *) node)->unescaped; + default: + assert(false && "unreachable"); + return NULL; + } +} + +/** + * A comparison function for comparing two nodes that have attached strings. 
+ */ +static int +pm_compare_string_nodes(PRISM_ATTRIBUTE_UNUSED const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right) { + const pm_string_t *left_string = pm_string_value(left); + const pm_string_t *right_string = pm_string_value(right); + return pm_string_compare(left_string, right_string); +} + +/** + * A comparison function for comparing two RegularExpressionNode instances. + */ +static int +pm_compare_regular_expression_nodes(PRISM_ATTRIBUTE_UNUSED const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right) { + const pm_regular_expression_node_t *left_regexp = (const pm_regular_expression_node_t *) left; + const pm_regular_expression_node_t *right_regexp = (const pm_regular_expression_node_t *) right; + + int result = pm_string_compare(&left_regexp->unescaped, &right_regexp->unescaped); + if (result != 0) return result; + + return PM_NUMERIC_COMPARISON(left_regexp->base.flags, right_regexp->base.flags); +} + +#undef PM_NUMERIC_COMPARISON + +/** + * Add a node to the set of static literals. 
+ */ +pm_node_t * +pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line, pm_static_literals_t *literals, pm_node_t *node, bool replace) { + switch (PM_NODE_TYPE(node)) { + case PM_INTEGER_NODE: + case PM_SOURCE_LINE_NODE: + return pm_node_hash_insert( + &literals->integer_nodes, + &(pm_static_literals_metadata_t) { + .newline_list = newline_list, + .start_line = start_line, + .encoding_name = NULL + }, + node, + replace, + pm_compare_integer_nodes + ); + case PM_FLOAT_NODE: + return pm_node_hash_insert( + &literals->float_nodes, + &(pm_static_literals_metadata_t) { + .newline_list = newline_list, + .start_line = start_line, + .encoding_name = NULL + }, + node, + replace, + pm_compare_float_nodes + ); + case PM_RATIONAL_NODE: + case PM_IMAGINARY_NODE: + return pm_node_hash_insert( + &literals->number_nodes, + &(pm_static_literals_metadata_t) { + .newline_list = newline_list, + .start_line = start_line, + .encoding_name = NULL + }, + node, + replace, + pm_compare_number_nodes + ); + case PM_STRING_NODE: + case PM_SOURCE_FILE_NODE: + return pm_node_hash_insert( + &literals->string_nodes, + &(pm_static_literals_metadata_t) { + .newline_list = newline_list, + .start_line = start_line, + .encoding_name = NULL + }, + node, + replace, + pm_compare_string_nodes + ); + case PM_REGULAR_EXPRESSION_NODE: + return pm_node_hash_insert( + &literals->regexp_nodes, + &(pm_static_literals_metadata_t) { + .newline_list = newline_list, + .start_line = start_line, + .encoding_name = NULL + }, + node, + replace, + pm_compare_regular_expression_nodes + ); + case PM_SYMBOL_NODE: + return pm_node_hash_insert( + &literals->symbol_nodes, + &(pm_static_literals_metadata_t) { + .newline_list = newline_list, + .start_line = start_line, + .encoding_name = NULL + }, + node, + replace, + pm_compare_string_nodes + ); + case PM_TRUE_NODE: { + pm_node_t *duplicated = literals->true_node; + if ((duplicated == NULL) || replace) literals->true_node = node; + return duplicated; 
+ } + case PM_FALSE_NODE: { + pm_node_t *duplicated = literals->false_node; + if ((duplicated == NULL) || replace) literals->false_node = node; + return duplicated; + } + case PM_NIL_NODE: { + pm_node_t *duplicated = literals->nil_node; + if ((duplicated == NULL) || replace) literals->nil_node = node; + return duplicated; + } + case PM_SOURCE_ENCODING_NODE: { + pm_node_t *duplicated = literals->source_encoding_node; + if ((duplicated == NULL) || replace) literals->source_encoding_node = node; + return duplicated; + } + default: + return NULL; + } +} + +/** + * Free the internal memory associated with the given static literals set. + */ +void +pm_static_literals_free(pm_static_literals_t *literals) { + pm_node_hash_free(&literals->integer_nodes); + pm_node_hash_free(&literals->float_nodes); + pm_node_hash_free(&literals->number_nodes); + pm_node_hash_free(&literals->string_nodes); + pm_node_hash_free(&literals->regexp_nodes); + pm_node_hash_free(&literals->symbol_nodes); +} + +/** + * A helper to determine if the given node is a static literal that is positive. + * This is used for formatting imaginary nodes. + */ +static bool +pm_static_literal_positive_p(const pm_node_t *node) { + switch (PM_NODE_TYPE(node)) { + case PM_FLOAT_NODE: + return ((const pm_float_node_t *) node)->value > 0; + case PM_INTEGER_NODE: + return !((const pm_integer_node_t *) node)->value.negative; + case PM_RATIONAL_NODE: + return !((const pm_rational_node_t *) node)->numerator.negative; + case PM_IMAGINARY_NODE: + return pm_static_literal_positive_p(((const pm_imaginary_node_t *) node)->numeric); + default: + assert(false && "unreachable"); + return false; + } +} + +/** + * Create a string-based representation of the given static literal. 
+ */ +static inline void +pm_static_literal_inspect_node(pm_buffer_t *buffer, const pm_static_literals_metadata_t *metadata, const pm_node_t *node) { + switch (PM_NODE_TYPE(node)) { + case PM_FALSE_NODE: + pm_buffer_append_string(buffer, "false", 5); + break; + case PM_FLOAT_NODE: { + const double value = ((const pm_float_node_t *) node)->value; + + if (PRISM_ISINF(value)) { + if (*node->location.start == '-') { + pm_buffer_append_byte(buffer, '-'); + } + pm_buffer_append_string(buffer, "Infinity", 8); + } else if (value == 0.0) { + if (*node->location.start == '-') { + pm_buffer_append_byte(buffer, '-'); + } + pm_buffer_append_string(buffer, "0.0", 3); + } else { + pm_buffer_append_format(buffer, "%g", value); + + // %g will not insert a .0 for 1e100 (we'll get back 1e+100). So + // we check for the decimal point and add it in here if it's not + // present. + if (pm_buffer_index(buffer, '.') == SIZE_MAX) { + size_t exponent_index = pm_buffer_index(buffer, 'e'); + size_t index = exponent_index == SIZE_MAX ? 
pm_buffer_length(buffer) : exponent_index; + pm_buffer_insert(buffer, index, ".0", 2); + } + } + + break; + } + case PM_IMAGINARY_NODE: { + const pm_node_t *numeric = ((const pm_imaginary_node_t *) node)->numeric; + pm_buffer_append_string(buffer, "(0", 2); + if (pm_static_literal_positive_p(numeric)) pm_buffer_append_byte(buffer, '+'); + pm_static_literal_inspect_node(buffer, metadata, numeric); + if (PM_NODE_TYPE_P(numeric, PM_RATIONAL_NODE)) { + pm_buffer_append_byte(buffer, '*'); + } + pm_buffer_append_string(buffer, "i)", 2); + break; + } + case PM_INTEGER_NODE: + pm_integer_string(buffer, &((const pm_integer_node_t *) node)->value); + break; + case PM_NIL_NODE: + pm_buffer_append_string(buffer, "nil", 3); + break; + case PM_RATIONAL_NODE: { + const pm_rational_node_t *rational = (const pm_rational_node_t *) node; + pm_buffer_append_byte(buffer, '('); + pm_integer_string(buffer, &rational->numerator); + pm_buffer_append_byte(buffer, '/'); + pm_integer_string(buffer, &rational->denominator); + pm_buffer_append_byte(buffer, ')'); + break; + } + case PM_REGULAR_EXPRESSION_NODE: { + const pm_string_t *unescaped = &((const pm_regular_expression_node_t *) node)->unescaped; + pm_buffer_append_byte(buffer, '/'); + pm_buffer_append_source(buffer, pm_string_source(unescaped), pm_string_length(unescaped), PM_BUFFER_ESCAPING_RUBY); + pm_buffer_append_byte(buffer, '/'); + + if (PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_MULTI_LINE)) pm_buffer_append_string(buffer, "m", 1); + if (PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_IGNORE_CASE)) pm_buffer_append_string(buffer, "i", 1); + if (PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED)) pm_buffer_append_string(buffer, "x", 1); + if (PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT)) pm_buffer_append_string(buffer, "n", 1); + + break; + } + case PM_SOURCE_ENCODING_NODE: + pm_buffer_append_format(buffer, "#<Encoding:%s>", metadata->encoding_name); + break; + case PM_SOURCE_FILE_NODE: { + const 
pm_string_t *filepath = &((const pm_source_file_node_t *) node)->filepath; + pm_buffer_append_byte(buffer, '"'); + pm_buffer_append_source(buffer, pm_string_source(filepath), pm_string_length(filepath), PM_BUFFER_ESCAPING_RUBY); + pm_buffer_append_byte(buffer, '"'); + break; + } + case PM_SOURCE_LINE_NODE: + pm_buffer_append_format(buffer, "%d", pm_newline_list_line_column(metadata->newline_list, node->location.start, metadata->start_line).line); + break; + case PM_STRING_NODE: { + const pm_string_t *unescaped = &((const pm_string_node_t *) node)->unescaped; + pm_buffer_append_byte(buffer, '"'); + pm_buffer_append_source(buffer, pm_string_source(unescaped), pm_string_length(unescaped), PM_BUFFER_ESCAPING_RUBY); + pm_buffer_append_byte(buffer, '"'); + break; + } + case PM_SYMBOL_NODE: { + const pm_string_t *unescaped = &((const pm_symbol_node_t *) node)->unescaped; + pm_buffer_append_byte(buffer, ':'); + pm_buffer_append_source(buffer, pm_string_source(unescaped), pm_string_length(unescaped), PM_BUFFER_ESCAPING_RUBY); + break; + } + case PM_TRUE_NODE: + pm_buffer_append_string(buffer, "true", 4); + break; + default: + assert(false && "unreachable"); + break; + } +} + +/** + * Create a string-based representation of the given static literal. + */ +void +pm_static_literal_inspect(pm_buffer_t *buffer, const pm_newline_list_t *newline_list, int32_t start_line, const char *encoding_name, const pm_node_t *node) { + pm_static_literal_inspect_node( + buffer, + &(pm_static_literals_metadata_t) { + .newline_list = newline_list, + .start_line = start_line, + .encoding_name = encoding_name + }, + node + ); +} diff --git a/prism/static_literals.h b/prism/static_literals.h new file mode 100644 index 0000000000..bd29761899 --- /dev/null +++ b/prism/static_literals.h @@ -0,0 +1,121 @@ +/** + * @file static_literals.h + * + * A set of static literal nodes that can be checked for duplicates. 
+ */ +#ifndef PRISM_STATIC_LITERALS_H +#define PRISM_STATIC_LITERALS_H + +#include "prism/defines.h" +#include "prism/ast.h" +#include "prism/util/pm_newline_list.h" + +#include <assert.h> +#include <stdbool.h> + +/** + * An internal hash table for a set of nodes. + */ +typedef struct { + /** The array of nodes in the hash table. */ + pm_node_t **nodes; + + /** The size of the hash table. */ + uint32_t size; + + /** The space that has been allocated in the hash table. */ + uint32_t capacity; +} pm_node_hash_t; + +/** + * Certain sets of nodes (hash keys and when clauses) check for duplicate nodes + * to alert the user of potential issues. To do this, we keep a set of the nodes + * that have been seen so far, and compare whenever we find a new node. + * + * We bucket the nodes based on their type to minimize the number of comparisons + * that need to be performed. + */ +typedef struct { + /** + * This is the set of IntegerNode and SourceLineNode instances. + */ + pm_node_hash_t integer_nodes; + + /** + * This is the set of FloatNode instances. + */ + pm_node_hash_t float_nodes; + + /** + * This is the set of RationalNode and ImaginaryNode instances. + */ + pm_node_hash_t number_nodes; + + /** + * This is the set of StringNode and SourceFileNode instances. + */ + pm_node_hash_t string_nodes; + + /** + * This is the set of RegularExpressionNode instances. + */ + pm_node_hash_t regexp_nodes; + + /** + * This is the set of SymbolNode instances. + */ + pm_node_hash_t symbol_nodes; + + /** + * A pointer to the last TrueNode instance that was inserted, or NULL. + */ + pm_node_t *true_node; + + /** + * A pointer to the last FalseNode instance that was inserted, or NULL. + */ + pm_node_t *false_node; + + /** + * A pointer to the last NilNode instance that was inserted, or NULL. + */ + pm_node_t *nil_node; + + /** + * A pointer to the last SourceEncodingNode instance that was inserted, or + * NULL. 
+ */ + pm_node_t *source_encoding_node; +} pm_static_literals_t; + +/** + * Add a node to the set of static literals. + * + * @param newline_list The list of newline offsets to use to calculate lines. + * @param start_line The line number that the parser starts on. + * @param literals The set of static literals to add the node to. + * @param node The node to add to the set. + * @param replace Whether to replace the previous node if one already exists. + * @return A pointer to the node that is being overwritten, if there is one. + */ +pm_node_t * pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line, pm_static_literals_t *literals, pm_node_t *node, bool replace); + +/** + * Free the internal memory associated with the given static literals set. + * + * @param literals The set of static literals to free. + */ +void pm_static_literals_free(pm_static_literals_t *literals); + +/** + * Create a string-based representation of the given static literal. + * + * @param buffer The buffer to write the string to. + * @param newline_list The list of newline offsets to use to calculate lines. + * @param start_line The line number that the parser starts on. + * @param encoding_name The name of the encoding of the source being parsed. + * @param node The node to create a string representation of. 
+ */ +void pm_static_literal_inspect(pm_buffer_t *buffer, const pm_newline_list_t *newline_list, int32_t start_line, const char *encoding_name, const pm_node_t *node); + +#endif diff --git a/prism/templates/ext/prism/api_node.c.erb b/prism/templates/ext/prism/api_node.c.erb new file mode 100644 index 0000000000..23af8886a7 --- /dev/null +++ b/prism/templates/ext/prism/api_node.c.erb @@ -0,0 +1,282 @@ +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" +#include "prism/extension.h" + +extern VALUE rb_cPrism; +extern VALUE rb_cPrismNode; +extern VALUE rb_cPrismSource; +extern VALUE rb_cPrismToken; +extern VALUE rb_cPrismLocation; + +<%- nodes.each do |node| -%> +static VALUE rb_cPrism<%= node.name %>; +<%- end -%> + +static VALUE +pm_location_new(const pm_parser_t *parser, const uint8_t *start, const uint8_t *end, VALUE source, bool freeze) { + if (freeze) { + VALUE location_argv[] = { + source, + LONG2FIX(start - parser->start), + LONG2FIX(end - start) + }; + + return rb_obj_freeze(rb_class_new_instance(3, location_argv, rb_cPrismLocation)); + } else { + uint64_t value = ((((uint64_t) (start - parser->start)) << 32) | ((uint32_t) (end - start))); + return ULL2NUM(value); + } +} + +VALUE +pm_token_new(const pm_parser_t *parser, const pm_token_t *token, rb_encoding *encoding, VALUE source, bool freeze) { + ID type = rb_intern(pm_token_type_name(token->type)); + VALUE location = pm_location_new(parser, token->start, token->end, source, freeze); + + VALUE slice = rb_enc_str_new((const char *) token->start, token->end - token->start, encoding); + if (freeze) rb_obj_freeze(slice); + + VALUE argv[] = { source, ID2SYM(type), slice, location }; + VALUE value = rb_class_new_instance(4, argv, rb_cPrismToken); + if (freeze) rb_obj_freeze(value); + + return value; +} + +static VALUE +pm_string_new(const pm_string_t *string, rb_encoding *encoding) { + return rb_obj_freeze(rb_enc_str_new((const char *) pm_string_source(string), 
pm_string_length(string), encoding)); +} + +VALUE +pm_integer_new(const pm_integer_t *integer) { + VALUE result; + if (integer->values == NULL) { + result = UINT2NUM(integer->value); + } else { + VALUE string = rb_str_new(NULL, integer->length * 8); + unsigned char *bytes = (unsigned char *) RSTRING_PTR(string); + + size_t offset = integer->length * 8; + for (size_t value_index = 0; value_index < integer->length; value_index++) { + uint32_t value = integer->values[value_index]; + + for (int index = 0; index < 8; index++) { + int byte = (value >> (4 * index)) & 0xf; + bytes[--offset] = byte < 10 ? byte + '0' : byte - 10 + 'a'; + } + } + + result = rb_funcall(string, rb_intern("to_i"), 1, UINT2NUM(16)); + } + + if (integer->negative) { + result = rb_funcall(result, rb_intern("-@"), 0); + } + + return result; +} + +// Create a Prism::Source object from the given parser, after pm_parse() was called. +VALUE +pm_source_new(const pm_parser_t *parser, rb_encoding *encoding, bool freeze) { + VALUE source_string = rb_enc_str_new((const char *) parser->start, parser->end - parser->start, encoding); + + VALUE offsets = rb_ary_new_capa(parser->newline_list.size); + for (size_t index = 0; index < parser->newline_list.size; index++) { + rb_ary_push(offsets, ULONG2NUM(parser->newline_list.offsets[index])); + } + + if (freeze) { + rb_obj_freeze(source_string); + rb_obj_freeze(offsets); + } + + VALUE source = rb_funcall(rb_cPrismSource, rb_intern("for"), 3, source_string, LONG2NUM(parser->start_line), offsets); + if (freeze) rb_obj_freeze(source); + + return source; +} + +typedef struct pm_node_stack_node { + struct pm_node_stack_node *prev; + const pm_node_t *visit; + bool visited; +} pm_node_stack_node_t; + +static void +pm_node_stack_push(pm_node_stack_node_t **stack, const pm_node_t *visit) { + pm_node_stack_node_t *node = xmalloc(sizeof(pm_node_stack_node_t)); + node->prev = *stack; + node->visit = visit; + node->visited = false; + *stack = node; +} + +static const pm_node_t * 
+pm_node_stack_pop(pm_node_stack_node_t **stack) { + pm_node_stack_node_t *current = *stack; + const pm_node_t *visit = current->visit; + + *stack = current->prev; + xfree(current); + + return visit; +} + +VALUE +pm_ast_new(const pm_parser_t *parser, const pm_node_t *node, rb_encoding *encoding, VALUE source, bool freeze) { + VALUE constants = rb_ary_new_capa(parser->constant_pool.size); + + for (uint32_t index = 0; index < parser->constant_pool.size; index++) { + pm_constant_t *constant = &parser->constant_pool.constants[index]; + int state = 0; + + VALUE string = rb_enc_str_new((const char *) constant->start, constant->length, encoding); + VALUE value = rb_protect(rb_str_intern, string, &state); + + if (state != 0) { + value = ID2SYM(rb_intern_const("?")); + rb_set_errinfo(Qnil); + } + + rb_ary_push(constants, value); + } + + pm_node_stack_node_t *node_stack = NULL; + pm_node_stack_push(&node_stack, node); + VALUE value_stack = rb_ary_new(); + + while (node_stack != NULL) { + if (!node_stack->visited) { + if (node_stack->visit == NULL) { + pm_node_stack_pop(&node_stack); + rb_ary_push(value_stack, Qnil); + continue; + } + + const pm_node_t *node = node_stack->visit; + node_stack->visited = true; + + switch (PM_NODE_TYPE(node)) { + <%- nodes.each do |node| -%> + <%- if node.fields.any? 
{ |field| [Prism::Template::NodeField, Prism::Template::OptionalNodeField, Prism::Template::NodeListField].include?(field.class) } -%> +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + case <%= node.type %>: { + pm_<%= node.human %>_t *cast = (pm_<%= node.human %>_t *) node; + <%- node.fields.each do |field| -%> + <%- case field -%> + <%- when Prism::Template::NodeField, Prism::Template::OptionalNodeField -%> + pm_node_stack_push(&node_stack, (pm_node_t *) cast-><%= field.name %>); + <%- when Prism::Template::NodeListField -%> + for (size_t index = 0; index < cast-><%= field.name %>.size; index++) { + pm_node_stack_push(&node_stack, (pm_node_t *) cast-><%= field.name %>.nodes[index]); + } + <%- end -%> + <%- end -%> + break; + } + <%- end -%> + <%- end -%> + default: + break; + } +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + } else { + const pm_node_t *node = pm_node_stack_pop(&node_stack); + + switch (PM_NODE_TYPE(node)) { + <%- nodes.each do |node| -%> +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + case <%= node.type %>: { + <%- if node.fields.any? 
{ |field| ![Prism::Template::NodeField, Prism::Template::OptionalNodeField].include?(field.class) } -%> + pm_<%= node.human %>_t *cast = (pm_<%= node.human %>_t *) node; + <%- end -%> + VALUE argv[<%= node.fields.length + 4 %>]; + + // source + argv[0] = source; + + // node_id + argv[1] = ULONG2NUM(node->node_id); + + // location + argv[2] = pm_location_new(parser, node->location.start, node->location.end, source, freeze); + + // flags + argv[3] = ULONG2NUM(node->flags); + <%- node.fields.each.with_index(4) do |field, index| -%> + + // <%= field.name %> + <%- case field -%> + <%- when Prism::Template::NodeField, Prism::Template::OptionalNodeField -%> +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + argv[<%= index %>] = rb_ary_pop(value_stack); + <%- when Prism::Template::NodeListField -%> +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + argv[<%= index %>] = rb_ary_new_capa(cast-><%= field.name %>.size); + for (size_t index = 0; index < cast-><%= field.name %>.size; index++) { + rb_ary_push(argv[<%= index %>], rb_ary_pop(value_stack)); + } + if (freeze) rb_obj_freeze(argv[<%= index %>]); + <%- when Prism::Template::StringField -%> +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + argv[<%= index %>] = pm_string_new(&cast-><%= field.name %>, encoding); + <%- when Prism::Template::ConstantField -%> +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + assert(cast-><%= field.name %> != 0); + argv[<%= index %>] = RARRAY_AREF(constants, cast-><%= field.name %> - 1); + <%- when Prism::Template::OptionalConstantField -%> + argv[<%= index %>] = cast-><%= field.name %> == 0 ? 
Qnil : RARRAY_AREF(constants, cast-><%= field.name %> - 1); + <%- when Prism::Template::ConstantListField -%> +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + argv[<%= index %>] = rb_ary_new_capa(cast-><%= field.name %>.size); + for (size_t index = 0; index < cast-><%= field.name %>.size; index++) { + assert(cast-><%= field.name %>.ids[index] != 0); + rb_ary_push(argv[<%= index %>], RARRAY_AREF(constants, cast-><%= field.name %>.ids[index] - 1)); + } + if (freeze) rb_obj_freeze(argv[<%= index %>]); + <%- when Prism::Template::LocationField -%> +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + argv[<%= index %>] = pm_location_new(parser, cast-><%= field.name %>.start, cast-><%= field.name %>.end, source, freeze); + <%- when Prism::Template::OptionalLocationField -%> +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + argv[<%= index %>] = cast-><%= field.name %>.start == NULL ? 
Qnil : pm_location_new(parser, cast-><%= field.name %>.start, cast-><%= field.name %>.end, source, freeze); + <%- when Prism::Template::UInt8Field -%> +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + argv[<%= index %>] = UINT2NUM(cast-><%= field.name %>); + <%- when Prism::Template::UInt32Field -%> +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + argv[<%= index %>] = ULONG2NUM(cast-><%= field.name %>); + <%- when Prism::Template::IntegerField -%> +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + argv[<%= index %>] = pm_integer_new(&cast-><%= field.name %>); + <%- when Prism::Template::DoubleField -%> +#line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" + argv[<%= index %>] = DBL2NUM(cast-><%= field.name %>); + <%- else -%> + <%- raise -%> + <%- end -%> + <%- end -%> + + VALUE value = rb_class_new_instance(<%= node.fields.length + 4 %>, argv, rb_cPrism<%= node.name %>); + if (freeze) rb_obj_freeze(value); + + rb_ary_push(value_stack, value); + break; + } + <%- end -%> + default: + rb_raise(rb_eRuntimeError, "unknown node type: %d", PM_NODE_TYPE(node)); + } + } + } + + return rb_ary_pop(value_stack); +} + +void +Init_prism_api_node(void) { + <%- nodes.each do |node| -%> + rb_cPrism<%= node.name %> = rb_define_class_under(rb_cPrism, "<%= node.name %>", rb_cPrismNode); + <%- end -%> +} diff --git a/prism/templates/include/prism/ast.h.erb b/prism/templates/include/prism/ast.h.erb new file mode 100644 index 0000000000..790cf9ebb8 --- /dev/null +++ b/prism/templates/include/prism/ast.h.erb @@ -0,0 +1,238 @@ +/** + * @file ast.h + * + * The abstract syntax tree. 
+ * + * -- + */ +#ifndef PRISM_AST_H +#define PRISM_AST_H + +#include "prism/defines.h" +#include "prism/util/pm_constant_pool.h" +#include "prism/util/pm_integer.h" +#include "prism/util/pm_string.h" + +#include <assert.h> +#include <stddef.h> +#include <stdint.h> + +/** + * This enum represents every type of token in the Ruby source. + */ +typedef enum pm_token_type { +<%- tokens.each do |token| -%> + /** <%= token.comment %> */ + PM_TOKEN_<%= token.name %><%= " = #{token.value}" if token.value %>, + +<%- end -%> + /** The maximum token value. */ + PM_TOKEN_MAXIMUM, +} pm_token_type_t; + +/** + * This struct represents a token in the Ruby source. We use it to track both + * type and location information. + */ +typedef struct { + /** The type of the token. */ + pm_token_type_t type; + + /** A pointer to the start location of the token in the source. */ + const uint8_t *start; + + /** A pointer to the end location of the token in the source. */ + const uint8_t *end; +} pm_token_t; + +/** + * This represents a range of bytes in the source string to which a node or + * token corresponds. + */ +typedef struct { + /** A pointer to the start location of the range in the source. */ + const uint8_t *start; + + /** A pointer to the end location of the range in the source. */ + const uint8_t *end; +} pm_location_t; + +struct pm_node; + +/** + * A list of nodes in the source, most often used for lists of children. + */ +typedef struct pm_node_list { + /** The number of nodes in the list. */ + size_t size; + + /** The capacity of the list that has been allocated. */ + size_t capacity; + + /** The nodes in the list. */ + struct pm_node **nodes; +} pm_node_list_t; + +/** + * This enum represents every type of node in the Ruby syntax tree. + */ +enum pm_node_type { +<%- nodes.each_with_index do |node, index| -%> + /** <%= node.name %> */ + <%= node.type %> = <%= index + 1 %>, + +<%- end -%> + /** A special kind of node used for compilation. 
*/ + PM_SCOPE_NODE +}; + +/** + * This is the type of node embedded in the node struct. We explicitly control + * the size of it here to avoid having the variable-width enum. + */ +typedef uint16_t pm_node_type_t; + +/** + * These are the flags embedded in the node struct. We explicitly control the + * size of it here to avoid having the variable-width enum. + */ +typedef uint16_t pm_node_flags_t; + +/** + * We store the flags enum in every node in the tree. Some flags are common to + * all nodes (the ones listed below). Others are specific to certain node types. + */ +static const pm_node_flags_t PM_NODE_FLAG_NEWLINE = 0x1; +static const pm_node_flags_t PM_NODE_FLAG_STATIC_LITERAL = 0x2; + +/** + * This is the base structure that represents a node in the syntax tree. It is + * embedded into every node type. + */ +typedef struct pm_node { + /** + * This represents the type of the node. It somewhat maps to the nodes that + * existed in the original grammar and ripper, but it's not a 1:1 mapping. + */ + pm_node_type_t type; + + /** + * This represents any flags on the node. Some are common to all nodes, and + * some are specific to the type of node. + */ + pm_node_flags_t flags; + + /** + * The unique identifier for this node, which is deterministic based on the + * source. It is used to identify unique nodes across parses. + */ + uint32_t node_id; + + /** + * This is the location of the node in the source. It's a range of bytes + * containing a start and an end. + */ + pm_location_t location; +} pm_node_t; + +/** + * Cast the given node to the base pm_node_t type. + */ +#define PM_NODE_UPCAST(node_) ((pm_node_t *) (node_)) + +/** + * Cast the type to an enum to allow the compiler to provide exhaustiveness + * checking. + */ +#define PM_NODE_TYPE(node_) ((enum pm_node_type) (node_)->type) + +/** + * Return true if the type of the given node matches the given type. 
+ */ +#define PM_NODE_TYPE_P(node_, type_) (PM_NODE_TYPE(node_) == (type_)) + +/** + * Return the flags associated with the given node. + */ +#define PM_NODE_FLAGS(node_) (PM_NODE_UPCAST(node_)->flags) + +/** + * Return true if the given flag is set on the given node. + */ +#define PM_NODE_FLAG_P(node_, flag_) ((PM_NODE_FLAGS(node_) & (flag_)) != 0) +<%- nodes.each do |node| -%> + +/** + * <%= node.name %> + * +<%- node.each_comment_line do |line| -%> + *<%= line %> +<%- end -%> + * + * Type: ::<%= node.type %> +<% if (node_flags = node.flags) %> + * Flags (#pm_<%= node_flags.human %>): +<%- node_flags.values.each do |value| -%> + * * ::PM_<%= node_flags.human.upcase %>_<%= value.name %> +<%- end -%> +<%- end -%> + * + * @extends pm_node_t + */ +typedef struct pm_<%= node.human %> { + /** The embedded base node. */ + pm_node_t base; + +<%- node.fields.each do |field| -%> + + /** + * <%= node.name %>#<%= field.name %> + <%- if field.comment -%> + * + <%- field.each_comment_line do |line| -%> + *<%= line %> + <%- end -%> + <%- end -%> + */ + <%= case field + when Prism::Template::NodeField, Prism::Template::OptionalNodeField then "struct #{field.c_type} *#{field.name}" + when Prism::Template::NodeListField then "struct pm_node_list #{field.name}" + when Prism::Template::ConstantField, Prism::Template::OptionalConstantField then "pm_constant_id_t #{field.name}" + when Prism::Template::ConstantListField then "pm_constant_id_list_t #{field.name}" + when Prism::Template::StringField then "pm_string_t #{field.name}" + when Prism::Template::LocationField, Prism::Template::OptionalLocationField then "pm_location_t #{field.name}" + when Prism::Template::UInt8Field then "uint8_t #{field.name}" + when Prism::Template::UInt32Field then "uint32_t #{field.name}" + when Prism::Template::IntegerField then "pm_integer_t #{field.name}" + when Prism::Template::DoubleField then "double #{field.name}" + else raise field.class.name + end + %>; +<%- end -%> +} pm_<%= node.human %>_t; +<%- 
end -%> +<%- flags.each do |flag| -%> + +/** + * <%= flag.comment %> + */ +typedef enum pm_<%= flag.human %> { + <%- flag.values.each_with_index do |value, index| -%> +<%= "\n" if index > 0 -%> + /** <%= value.comment %> */ + PM_<%= flag.human.upcase %>_<%= value.name %> = <%= 1 << (index + Prism::Template::COMMON_FLAGS_COUNT) %>, + <%- end -%> + + PM_<%= flag.human.upcase %>_LAST, +} pm_<%= flag.human %>_t; +<%- end -%> + +/** + * When we're serializing to Java, we want to skip serializing the location + * fields as they won't be used by JRuby or TruffleRuby. This boolean allows us + * to specify that through the environment. It will never be true except for in + * those build systems. + */ +#define PRISM_SERIALIZE_ONLY_SEMANTICS_FIELDS <%= Prism::Template::SERIALIZE_ONLY_SEMANTICS_FIELDS ? 1 : 0 %> + +#endif diff --git a/prism/templates/include/prism/diagnostic.h.erb b/prism/templates/include/prism/diagnostic.h.erb new file mode 100644 index 0000000000..07bbc8fae7 --- /dev/null +++ b/prism/templates/include/prism/diagnostic.h.erb @@ -0,0 +1,130 @@ +/** + * @file diagnostic.h + * + * A list of diagnostics generated during parsing. + */ +#ifndef PRISM_DIAGNOSTIC_H +#define PRISM_DIAGNOSTIC_H + +#include "prism/ast.h" +#include "prism/defines.h" +#include "prism/util/pm_list.h" + +#include <stdbool.h> +#include <stdlib.h> +#include <assert.h> + +/** + * The diagnostic IDs of all of the diagnostics, used to communicate the types + * of errors between the parser and the user. + */ +typedef enum { + // These are the error diagnostics. + <%- errors.each do |error| -%> + PM_ERR_<%= error.name %>, + <%- end -%> + + // These are the warning diagnostics. + <%- warnings.each do |warning| -%> + PM_WARN_<%= warning.name %>, + <%- end -%> +} pm_diagnostic_id_t; + +/** + * This struct represents a diagnostic generated during parsing. + * + * @extends pm_list_node_t + */ +typedef struct { + /** The embedded base node. 
*/ + pm_list_node_t node; + + /** The location of the diagnostic in the source. */ + pm_location_t location; + + /** The ID of the diagnostic. */ + pm_diagnostic_id_t diag_id; + + /** The message associated with the diagnostic. */ + const char *message; + + /** + * Whether or not the memory related to the message of this diagnostic is + * owned by this diagnostic. If it is, it needs to be freed when the + * diagnostic is freed. + */ + bool owned; + + /** + * The level of the diagnostic, see `pm_error_level_t` and + * `pm_warning_level_t` for possible values. + */ + uint8_t level; +} pm_diagnostic_t; + +/** + * The levels of errors generated during parsing. + */ +typedef enum { + /** For errors that should raise a syntax error. */ + PM_ERROR_LEVEL_SYNTAX = 0, + + /** For errors that should raise an argument error. */ + PM_ERROR_LEVEL_ARGUMENT = 1, + + /** For errors that should raise a load error. */ + PM_ERROR_LEVEL_LOAD = 2 +} pm_error_level_t; + +/** + * The levels of warnings generated during parsing. + */ +typedef enum { + /** For warnings which should be emitted if $VERBOSE != nil. */ + PM_WARNING_LEVEL_DEFAULT = 0, + + /** For warnings which should be emitted if $VERBOSE == true. */ + PM_WARNING_LEVEL_VERBOSE = 1 +} pm_warning_level_t; + +/** + * Get the human-readable name of the given diagnostic ID. + * + * @param diag_id The diagnostic ID. + * @return The human-readable name of the diagnostic ID. + */ +const char * pm_diagnostic_id_human(pm_diagnostic_id_t diag_id); + +/** + * Append a diagnostic to the given list of diagnostics that is using shared + * memory for its message. + * + * @param list The list to append to. + * @param start The start of the diagnostic. + * @param end The end of the diagnostic. + * @param diag_id The diagnostic ID. + * @return Whether the diagnostic was successfully appended. 
+ */ +bool pm_diagnostic_list_append(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id); + +/** + * Append a diagnostic to the given list of diagnostics that is using a format + * string for its message. + * + * @param list The list to append to. + * @param start The start of the diagnostic. + * @param end The end of the diagnostic. + * @param diag_id The diagnostic ID. + * @param ... The arguments to the format string for the message. + * @return Whether the diagnostic was successfully appended. + */ +bool pm_diagnostic_list_append_format(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id, ...); + +/** + * Deallocate the internal state of the given diagnostic list. + * + * @param list The list to deallocate. + */ +void pm_diagnostic_list_free(pm_list_t *list); + +#endif diff --git a/prism/templates/lib/prism/compiler.rb.erb b/prism/templates/lib/prism/compiler.rb.erb new file mode 100644 index 0000000000..66dbe666b9 --- /dev/null +++ b/prism/templates/lib/prism/compiler.rb.erb @@ -0,0 +1,43 @@ +module Prism + # A compiler is a visitor that returns the value of each node as it visits. + # This is as opposed to a visitor which will only walk the tree. This can be + # useful when you are trying to compile a tree into a different format. + # + # For example, to build a representation of the tree as s-expressions, you + # could write: + # + # class SExpressions < Prism::Compiler + # def visit_arguments_node(node) = [:arguments, super] + # def visit_call_node(node) = [:call, super] + # def visit_integer_node(node) = [:integer] + # def visit_program_node(node) = [:program, super] + # end + # + # Prism.parse("1 + 2").value.accept(SExpressions.new) + # # => [:program, [[[:call, [[:integer], [:arguments, [[:integer]]]]]]]] + # + class Compiler < Visitor + # Visit an individual node. + def visit(node) + node&.accept(self) + end + + # Visit a list of nodes. 
+ def visit_all(nodes) + nodes.map { |node| node&.accept(self) } + end + + # Visit the child nodes of the given node. + def visit_child_nodes(node) + node.each_child_node.map { |node| node.accept(self) } + end + + <%- nodes.each_with_index do |node, index| -%> +<%= "\n" if index != 0 -%> + # Compile a <%= node.name %> node + def visit_<%= node.human %>(node) + node.each_child_node.map { |node| node.accept(self) } + end + <%- end -%> + end +end diff --git a/prism/templates/lib/prism/dispatcher.rb.erb b/prism/templates/lib/prism/dispatcher.rb.erb new file mode 100644 index 0000000000..52478451c9 --- /dev/null +++ b/prism/templates/lib/prism/dispatcher.rb.erb @@ -0,0 +1,103 @@ +module Prism + # The dispatcher class fires events for nodes that are found while walking an + # AST to all registered listeners. It's useful for performing different types + # of analysis on the AST while only having to walk the tree once. + # + # To use the dispatcher, you would first instantiate it and register listeners + # for the events you're interested in: + # + # class OctalListener + # def on_integer_node_enter(node) + # if node.octal? && !node.slice.start_with?("0o") + # warn("Octal integers should be written with the 0o prefix") + # end + # end + # end + # + # listener = OctalListener.new + # dispatcher = Prism::Dispatcher.new + # dispatcher.register(listener, :on_integer_node_enter) + # + # Then, you can walk any number of trees and dispatch events to the listeners: + # + # result = Prism.parse("001 + 002 + 003") + # dispatcher.dispatch(result.value) + # + # Optionally, you can also use `#dispatch_once` to dispatch enter and leave + # events for a single node without recursing further down the tree. This can + # be useful in circumstances where you want to reuse the listeners you already + # have registered but want to stop walking the tree at a certain point. 
+ # + # integer = result.value.statements.body.first.receiver.receiver + # dispatcher.dispatch_once(integer) + # + class Dispatcher < Visitor + # attr_reader listeners: Hash[Symbol, Array[Listener]] + attr_reader :listeners + + # Initialize a new dispatcher. + def initialize + @listeners = {} + end + + # Register a listener for one or more events. + # + # def register: (Listener, *Symbol) -> void + def register(listener, *events) + register_events(listener, events) + end + + # Register all public methods of a listener that match the pattern + # `on_<node_name>_(enter|leave)`. + # + # def register_public_methods: (Listener) -> void + def register_public_methods(listener) + register_events(listener, listener.public_methods(false).grep(/\Aon_.+_(?:enter|leave)\z/)) + end + + # Register a listener for the given events. + private def register_events(listener, events) + events.each { |event| (listeners[event] ||= []) << listener } + end + + # Walks `root` dispatching events to all registered listeners. + # + # def dispatch: (Node) -> void + alias dispatch visit + + # Dispatches a single event for `node` to all registered listeners. + # + # def dispatch_once: (Node) -> void + def dispatch_once(node) + node.accept(DispatchOnce.new(listeners)) + end + <%- nodes.each do |node| -%> + + # Dispatch enter and leave events for <%= node.name %> nodes and continue + # walking the tree. + def visit_<%= node.human %>(node) + listeners[:on_<%= node.human %>_enter]&.each { |listener| listener.on_<%= node.human %>_enter(node) } + super + listeners[:on_<%= node.human %>_leave]&.each { |listener| listener.on_<%= node.human %>_leave(node) } + end + <%- end -%> + + class DispatchOnce < Visitor # :nodoc: + attr_reader :listeners + + def initialize(listeners) + @listeners = listeners + end + <%- nodes.each do |node| -%> + + # Dispatch enter and leave events for <%= node.name %> nodes. 
+ def visit_<%= node.human %>(node) + listeners[:on_<%= node.human %>_enter]&.each { |listener| listener.on_<%= node.human %>_enter(node) } + listeners[:on_<%= node.human %>_leave]&.each { |listener| listener.on_<%= node.human %>_leave(node) } + end + <%- end -%> + end + + private_constant :DispatchOnce + end +end diff --git a/prism/templates/lib/prism/dot_visitor.rb.erb b/prism/templates/lib/prism/dot_visitor.rb.erb new file mode 100644 index 0000000000..cd2998fe61 --- /dev/null +++ b/prism/templates/lib/prism/dot_visitor.rb.erb @@ -0,0 +1,189 @@ +require "cgi/escape" +require "cgi/util" unless defined?(CGI::EscapeExt) + +module Prism + # This visitor provides the ability to call Node#to_dot, which converts a + # subtree into a graphviz dot graph. + class DotVisitor < Visitor + class Field # :nodoc: + attr_reader :name, :value, :port + + def initialize(name, value, port) + @name = name + @value = value + @port = port + end + + def to_dot + if port + "<tr><td align=\"left\" colspan=\"2\" port=\"#{name}\">#{name}</td></tr>" + else + "<tr><td align=\"left\">#{name}</td><td>#{CGI.escapeHTML(value || raise)}</td></tr>" + end + end + end + + class Table # :nodoc: + attr_reader :name, :fields + + def initialize(name) + @name = name + @fields = [] + end + + def field(name, value = nil, port: false) + fields << Field.new(name, value, port) + end + + def to_dot + dot = <<~DOT + <table border="0" cellborder="1" cellspacing="0" cellpadding="4"> + <tr><td colspan="2"><b>#{name}</b></td></tr> + DOT + + if fields.any? 
+ "#{dot} #{fields.map(&:to_dot).join("\n ")}\n</table>" + else + "#{dot}</table>" + end + end + end + + class Digraph # :nodoc: + attr_reader :nodes, :waypoints, :edges + + def initialize + @nodes = [] + @waypoints = [] + @edges = [] + end + + def node(value) + nodes << value + end + + def waypoint(value) + waypoints << value + end + + def edge(value) + edges << value + end + + def to_dot + <<~DOT + digraph "Prism" { + node [ + fontname=\"Courier New\" + shape=plain + style=filled + fillcolor=gray95 + ]; + + #{nodes.map { |node| node.gsub(/\n/, "\n ") }.join("\n ")} + node [shape=point]; + #{waypoints.join("\n ")} + + #{edges.join("\n ")} + } + DOT + end + end + + private_constant :Field, :Table, :Digraph + + # The digraph that is being built. + attr_reader :digraph + + # Initialize a new dot visitor. + def initialize + @digraph = Digraph.new + end + + # Convert this visitor into a graphviz dot graph string. + def to_dot + digraph.to_dot + end + <%- nodes.each do |node| -%> + + # Visit a <%= node.name %> node. + def visit_<%= node.human %>(node) + table = Table.new("<%= node.name %>") + id = node_id(node) + <%- if (node_flags = node.flags) -%> + + # flags + table.field("flags", <%= node_flags.human %>_inspect(node)) + <%- end -%> + <%- node.fields.each do |field| -%> + + # <%= field.name %> + <%- case field -%> + <%- when Prism::Template::NodeField -%> + table.field("<%= field.name %>", port: true) + digraph.edge("#{id}:<%= field.name %> -> #{node_id(node.<%= field.name %>)};") + <%- when Prism::Template::OptionalNodeField -%> + unless (<%= field.name %> = node.<%= field.name %>).nil? + table.field("<%= field.name %>", port: true) + digraph.edge("#{id}:<%= field.name %> -> #{node_id(<%= field.name %>)};") + end + <%- when Prism::Template::NodeListField -%> + if node.<%= field.name %>.any? 
+ table.field("<%= field.name %>", port: true) + + waypoint = "#{id}_<%= field.name %>" + digraph.waypoint("#{waypoint};") + + digraph.edge("#{id}:<%= field.name %> -> #{waypoint};") + node.<%= field.name %>.each { |child| digraph.edge("#{waypoint} -> #{node_id(child)};") } + else + table.field("<%= field.name %>", "[]") + end + <%- when Prism::Template::StringField, Prism::Template::ConstantField, Prism::Template::OptionalConstantField, Prism::Template::UInt8Field, Prism::Template::UInt32Field, Prism::Template::ConstantListField, Prism::Template::IntegerField, Prism::Template::DoubleField -%> + table.field("<%= field.name %>", node.<%= field.name %>.inspect) + <%- when Prism::Template::LocationField -%> + table.field("<%= field.name %>", location_inspect(node.<%= field.name %>)) + <%- when Prism::Template::OptionalLocationField -%> + unless (<%= field.name %> = node.<%= field.name %>).nil? + table.field("<%= field.name %>", location_inspect(<%= field.name %>)) + end + <%- else -%> + <%- raise -%> + <%- end -%> + <%- end -%> + + digraph.nodes << <<~DOT + #{id} [ + label=<#{table.to_dot.gsub(/\n/, "\n ")}> + ]; + DOT + + super + end + <%- end -%> + + private + + # Generate a unique node ID for a node throughout the digraph. + def node_id(node) + "Node_#{node.object_id}" + end + + # Inspect a location to display the start and end line and column numbers. + def location_inspect(location) + "(#{location.start_line},#{location.start_column})-(#{location.end_line},#{location.end_column})" + end + <%- flags.each do |flag| -%> + + # Inspect a node that has <%= flag.human %> flags to display the flags as a + # comma-separated list. + def <%= flag.human %>_inspect(node) + flags = [] #: Array[String] + <%- flag.values.each do |value| -%> + flags << "<%= value.name.downcase %>" if node.<%= value.name.downcase %>? 
+ <%- end -%> + flags.join(", ") + end + <%- end -%> + end +end diff --git a/prism/templates/lib/prism/dsl.rb.erb b/prism/templates/lib/prism/dsl.rb.erb new file mode 100644 index 0000000000..e16ebb7110 --- /dev/null +++ b/prism/templates/lib/prism/dsl.rb.erb @@ -0,0 +1,133 @@ +module Prism + # The DSL module provides a set of methods that can be used to create prism + # nodes in a more concise manner. For example, instead of writing: + # + # source = Prism::Source.for("[1]") + # + # Prism::ArrayNode.new( + # source, + # 0, + # Prism::Location.new(source, 0, 3), + # 0, + # [ + # Prism::IntegerNode.new( + # source, + # 0, + # Prism::Location.new(source, 1, 1), + # Prism::IntegerBaseFlags::DECIMAL, + # 1 + # ) + # ], + # Prism::Location.new(source, 0, 1), + # Prism::Location.new(source, 2, 1) + # ) + # + # you could instead write: + # + # class Builder + # include Prism::DSL + # + # attr_reader :default_source + # + # def initialize + # @default_source = source("[1]") + # end + # + # def build + # array_node( + # location: location(start_offset: 0, length: 3), + # elements: [ + # integer_node( + # location: location(start_offset: 1, length: 1), + # flags: integer_base_flag(:decimal), + # value: 1 + # ) + # ], + # opening_loc: location(start_offset: 0, length: 1), + # closing_loc: location(start_offset: 2, length: 1) + # ) + # end + # end + # + # This is mostly helpful in the context of generating trees programmatically. + module DSL + # Provide all of these methods as module methods as well, to allow for + # building nodes like Prism::DSL.nil_node. + extend self + + # Create a new Source object. + def source(string) + Source.for(string) + end + + # Create a new Location object. + def location(source: default_source, start_offset: 0, length: 0) + Location.new(source, start_offset, length) + end + <%- nodes.each do |node| -%> + + # Create a new <%= node.name %> node. 
+ def <%= node.human %>(<%= ["source: default_source", "node_id: 0", "location: default_location", "flags: 0", *node.fields.map { |field| + case field + when Prism::Template::NodeField + kind = field.specific_kind || field.union_kind&.first + if kind.nil? + "#{field.name}: default_node(source, location)" + else + "#{field.name}: #{kind.gsub(/(?<=.)[A-Z]/, "_\\0").downcase}(source: source)" + end + when Prism::Template::ConstantField + "#{field.name}: :\"\"" + when Prism::Template::OptionalNodeField, Prism::Template::OptionalConstantField, Prism::Template::OptionalLocationField + "#{field.name}: nil" + when Prism::Template::NodeListField, Prism::Template::ConstantListField + "#{field.name}: []" + when Prism::Template::StringField + "#{field.name}: \"\"" + when Prism::Template::LocationField + "#{field.name}: location" + when Prism::Template::UInt8Field, Prism::Template::UInt32Field, Prism::Template::IntegerField + "#{field.name}: 0" + when Prism::Template::DoubleField + "#{field.name}: 0.0" + else + raise + end + }].join(", ") %>) + <%= node.name %>.new(<%= ["source", "node_id", "location", "flags", *node.fields.map(&:name)].join(", ") %>) + end + <%- end -%> + <%- flags.each do |flag| -%> + + # Retrieve the value of one of the <%= flag.name %> flags. + def <%= flag.human.chomp("s") %>(name) + case name + <%- flag.values.each do |value| -%> + when :<%= value.name.downcase %> then <%= flag.name %>::<%= value.name %> + <%- end -%> + else Kernel.raise ArgumentError, "invalid <%= flag.name %> flag: #{name.inspect}" + end + end + <%- end -%> + + private + + # The default source object that gets attached to nodes and locations if no + # source is specified. + def default_source + Source.for("") + end + + # The default location object that gets attached to nodes if no location is + # specified, which uses the given source. 
+ def default_location + Location.new(default_source, 0, 0) + end + + # The default node that gets attached to nodes if no node is specified for a + # required node field. + def default_node(source, location) + MissingNode.new(source, -1, location, 0) + end + end +end diff --git a/prism/templates/lib/prism/inspect_visitor.rb.erb b/prism/templates/lib/prism/inspect_visitor.rb.erb new file mode 100644 index 0000000000..3cfe615d85 --- /dev/null +++ b/prism/templates/lib/prism/inspect_visitor.rb.erb @@ -0,0 +1,131 @@ +module Prism + # This visitor is responsible for composing the strings that get returned by + # the various #inspect methods defined on each of the nodes. + class InspectVisitor < Visitor + # Most of the time, we can simply pass down the indent to the next node. + # However, when we are inside a list we want some extra special formatting + # when we hit an element in that list. In this case, we have a special + # command that replaces the subsequent indent with the given value. + class Replace # :nodoc: + attr_reader :value + + def initialize(value) + @value = value + end + end + + private_constant :Replace + + # The current prefix string. + attr_reader :indent + + # The list of commands that we need to execute in order to compose the + # final string. + attr_reader :commands + + # Initializes a new instance of the InspectVisitor. + def initialize(indent = +"") + @indent = indent + @commands = [] + end + + # Compose an inspect string for the given node. + def self.compose(node) + visitor = new + node.accept(visitor) + visitor.compose + end + + # Compose the final string. + def compose + buffer = +"" + replace = nil + + until commands.empty? 
+ # @type var command: String | node | Replace + # @type var indent: String + command, indent = *commands.shift + + case command + when String + buffer << (replace || indent) + buffer << command + replace = nil + when Node + visitor = InspectVisitor.new(indent) + command.accept(visitor) + @commands = [*visitor.commands, *@commands] + when Replace + replace = command.value + else + raise "Unknown command: #{command.inspect}" + end + end + + buffer + end + <%- nodes.each do |node| -%> + + # Inspect a <%= node.name %> node. + def visit_<%= node.human %>(node) + commands << [inspect_node(<%= node.name.inspect %>, node), indent] + <%- (fields = [node.flags || Prism::Template::Flags.empty, *node.fields]).each_with_index do |field, index| -%> + <%- pointer = index == fields.length - 1 ? "└── " : "├── " -%> + <%- preadd = index == fields.length - 1 ? " " : "│ " -%> + <%- case field -%> + <%- when Prism::Template::Flags -%> + flags = [("newline" if node.newline?), ("static_literal" if node.static_literal?), <%= field.values.map { |value| "(\"#{value.name.downcase}\" if node.#{value.name.downcase}?)" }.join(", ") %>].compact + commands << ["<%= pointer %>flags: #{flags.empty? ? "∅" : flags.join(", ")}\n", indent] + <%- when Prism::Template::NodeListField -%> + commands << ["<%= pointer %><%= field.name %>: (length: #{(<%= field.name %> = node.<%= field.name %>).length})\n", indent] + if <%= field.name %>.any? + <%= field.name %>[0...-1].each do |child| + commands << [Replace.new("#{indent}<%= preadd %>├── "), indent] + commands << [child, "#{indent}<%= preadd %>│ "] + end + commands << [Replace.new("#{indent}<%= preadd %>└── "), indent] + commands << [<%= field.name %>[-1], "#{indent}<%= preadd %> "] + end + <%- when Prism::Template::NodeField -%> + commands << ["<%= pointer %><%= field.name %>:\n", indent] + commands << [node.<%= field.name %>, "#{indent}<%= preadd %>"] + <%- when Prism::Template::OptionalNodeField -%> + if (<%= field.name %> = node.<%= field.name %>).nil? 
+ commands << ["<%= pointer %><%= field.name %>: ∅\n", indent] + else + commands << ["<%= pointer %><%= field.name %>:\n", indent] + commands << [<%= field.name %>, "#{indent}<%= preadd %>"] + end + <%- when Prism::Template::ConstantField, Prism::Template::ConstantListField, Prism::Template::StringField, Prism::Template::UInt8Field, Prism::Template::UInt32Field, Prism::Template::IntegerField, Prism::Template::DoubleField -%> + commands << ["<%= pointer %><%= field.name %>: #{node.<%= field.name %>.inspect}\n", indent] + <%- when Prism::Template::OptionalConstantField -%> + if (<%= field.name %> = node.<%= field.name %>).nil? + commands << ["<%= pointer %><%= field.name %>: ∅\n", indent] + else + commands << ["<%= pointer %><%= field.name %>: #{<%= field.name %>.inspect}\n", indent] + end + <%- when Prism::Template::LocationField, Prism::Template::OptionalLocationField -%> + commands << ["<%= pointer %><%= field.name %>: #{inspect_location(node.<%= field.name %>)}\n", indent] + <%- end -%> + <%- end -%> + end + <%- end -%> + + private + + # Compose a header for the given node. + def inspect_node(name, node) + location = node.location + "@ #{name} (location: (#{location.start_line},#{location.start_column})-(#{location.end_line},#{location.end_column}))\n" + end + + # Compose a string representing the given inner location field. + def inspect_location(location) + if location + "(#{location.start_line},#{location.start_column})-(#{location.end_line},#{location.end_column}) = #{location.slice.inspect}" + else + "∅" + end + end + end +end diff --git a/prism/templates/lib/prism/mutation_compiler.rb.erb b/prism/templates/lib/prism/mutation_compiler.rb.erb new file mode 100644 index 0000000000..565ee4e315 --- /dev/null +++ b/prism/templates/lib/prism/mutation_compiler.rb.erb @@ -0,0 +1,19 @@ +module Prism + # This visitor walks through the tree and copies each node as it is being + # visited. 
This is useful for consumers that want to mutate the tree, as you + # can change subtrees in place without affecting the rest of the tree. + class MutationCompiler < Compiler + <%- nodes.each_with_index do |node, index| -%> +<%= "\n" if index != 0 -%> + # Copy a <%= node.name %> node + def visit_<%= node.human %>(node) + <%- fields = node.fields.select { |field| [Prism::Template::NodeField, Prism::Template::OptionalNodeField, Prism::Template::NodeListField].include?(field.class) } -%> + <%- if fields.any? -%> + node.copy(<%= fields.map { |field| "#{field.name}: #{field.is_a?(Prism::Template::NodeListField) ? "visit_all" : "visit"}(node.#{field.name})" }.join(", ") %>) + <%- else -%> + node.copy + <%- end -%> + end + <%- end -%> + end +end diff --git a/prism/templates/lib/prism/node.rb.erb b/prism/templates/lib/prism/node.rb.erb new file mode 100644 index 0000000000..8225bfb328 --- /dev/null +++ b/prism/templates/lib/prism/node.rb.erb @@ -0,0 +1,527 @@ +module Prism + # This represents a node in the tree. It is the parent class of all of the + # various node types. + class Node + # A pointer to the source that this node was created from. + attr_reader :source + private :source + + # A unique identifier for this node. This is used in a very specific + # use case where you want to keep around a reference to a node without + # having to keep around the syntax tree in memory. This unique identifier + # will be consistent across multiple parses of the same source code. + attr_reader :node_id + + # Save this node using a saved source so that it can be retrieved later. + def save(repository) + repository.enter(node_id, :itself) + end + + # A Location instance that represents the location of this node in the + # source. + def location + location = @location + return location if location.is_a?(Location) + @location = Location.new(source, location >> 32, location & 0xFFFFFFFF) + end + + # Save the location using a saved source so that it can be retrieved later. 
+ def save_location(repository) + repository.enter(node_id, :location) + end + + # Delegates to the start_line of the associated location object. + def start_line + location.start_line + end + + # Delegates to the end_line of the associated location object. + def end_line + location.end_line + end + + # The start offset of the node in the source. This method is effectively a + # delegate method to the location object. + def start_offset + location = @location + location.is_a?(Location) ? location.start_offset : location >> 32 + end + + # The end offset of the node in the source. This method is effectively a + # delegate method to the location object. + def end_offset + location = @location + location.is_a?(Location) ? location.end_offset : ((location >> 32) + (location & 0xFFFFFFFF)) + end + + # Delegates to the start_character_offset of the associated location object. + def start_character_offset + location.start_character_offset + end + + # Delegates to the end_character_offset of the associated location object. + def end_character_offset + location.end_character_offset + end + + # Delegates to the cached_start_code_units_offset of the associated location + # object. + def cached_start_code_units_offset(cache) + location.cached_start_code_units_offset(cache) + end + + # Delegates to the cached_end_code_units_offset of the associated location + # object. + def cached_end_code_units_offset(cache) + location.cached_end_code_units_offset(cache) + end + + # Delegates to the start_column of the associated location object. + def start_column + location.start_column + end + + # Delegates to the end_column of the associated location object. + def end_column + location.end_column + end + + # Delegates to the start_character_column of the associated location object. + def start_character_column + location.start_character_column + end + + # Delegates to the end_character_column of the associated location object. 
+ def end_character_column + location.end_character_column + end + + # Delegates to the cached_start_code_units_column of the associated location + # object. + def cached_start_code_units_column(cache) + location.cached_start_code_units_column(cache) + end + + # Delegates to the cached_end_code_units_column of the associated location + # object. + def cached_end_code_units_column(cache) + location.cached_end_code_units_column(cache) + end + + # Delegates to the leading_comments of the associated location object. + def leading_comments + location.leading_comments + end + + # Delegates to the trailing_comments of the associated location object. + def trailing_comments + location.trailing_comments + end + + # Delegates to the comments of the associated location object. + def comments + location.comments + end + + # Returns all of the lines of the source code associated with this node. + def source_lines + location.source_lines + end + + # An alias for source_lines, used to mimic the API from + # RubyVM::AbstractSyntaxTree to make it easier to migrate. + alias script_lines source_lines + + # Slice the location of the node from the source. + def slice + location.slice + end + + # Slice the location of the node from the source, starting at the beginning + # of the line that the location starts on, ending at the end of the line + # that the location ends on. + def slice_lines + location.slice_lines + end + + # A bitset of flags for this node. There are certain flags that are common + # for all nodes, and then some nodes have specific flags. + attr_reader :flags + protected :flags + + # Returns true if the node has the newline flag set. + def newline? + flags.anybits?(NodeFlags::NEWLINE) + end + + # Returns true if the node has the static literal flag set. + def static_literal? + flags.anybits?(NodeFlags::STATIC_LITERAL) + end + + # Similar to inspect, but respects the current level of indentation given by + # the pretty print object. 
+ def pretty_print(q) + q.seplist(inspect.chomp.each_line, -> { q.breakable }) do |line| + q.text(line.chomp) + end + q.current_group.break + end + + # Convert this node into a graphviz dot graph string. + def to_dot + # @type self: node + DotVisitor.new.tap { |visitor| accept(visitor) }.to_dot + end + + # Returns a list of nodes that are descendants of this node that contain the + # given line and column. This is useful for locating a node that is selected + # based on the line and column of the source code. + # + # Important to note is that the column given to this method should be in + # bytes, as opposed to characters or code units. + def tunnel(line, column) + queue = [self] #: Array[Prism::node] + result = [] #: Array[Prism::node] + + search_offset = source.line_to_byte_offset(line) + column + + while (node = queue.shift) + result << node + + node.each_child_node do |child_node| + if child_node.start_offset <= search_offset && search_offset < child_node.end_offset + queue << child_node + break + end + end + end + + result + end + + # Returns the first node that matches the given block when visited in a + # breadth-first search. This is useful for finding a node that matches a + # particular condition. + # + # node.breadth_first_search { |node| node.node_id == node_id } + # + def breadth_first_search(&block) + queue = [self] #: Array[Prism::node] + + while (node = queue.shift) + return node if yield node + queue.concat(node.compact_child_nodes) + end + + nil + end + + # Returns a list of the fields that exist for this node class. Fields + # describe the structure of the node. This kind of reflection is useful for + # things like recursively visiting each node _and_ field in the tree. + def self.fields + # This method should only be called on subclasses of Node, not Node + # itself. 
+ raise NoMethodError, "undefined method `fields' for #{inspect}" if self == Node + + Reflection.fields_for(self) + end + + # -------------------------------------------------------------------------- + # :section: Node interface + # These methods are effectively abstract methods that must be implemented by + # the various subclasses of Node. They are here to make it easier to work + # with typecheckers. + # -------------------------------------------------------------------------- + + # Accepts a visitor and calls back into the specialized visit function. + def accept(visitor) + raise NoMethodError, "undefined method `accept' for #{inspect}" + end + + # Returns an array of child nodes, including `nil`s in the place of optional + # nodes that were not present. + def child_nodes + raise NoMethodError, "undefined method `child_nodes' for #{inspect}" + end + + alias deconstruct child_nodes + + # With a block given, yields each child node. Without a block, returns + # an enumerator that contains each child node. Excludes any `nil`s in + # the place of optional nodes that were not present. + def each_child_node + raise NoMethodError, "undefined method `each_child_node' for #{inspect}" + end + + # Returns an array of child nodes, excluding any `nil`s in the place of + # optional nodes that were not present. + def compact_child_nodes + raise NoMethodError, "undefined method `compact_child_nodes' for #{inspect}" + end + + # Returns an array of child nodes and locations that could potentially have + # comments attached to them. + def comment_targets + raise NoMethodError, "undefined method `comment_targets' for #{inspect}" + end + + # Returns a string representation of the node. + def inspect + raise NoMethodError, "undefined method `inspect' for #{inspect}" + end + + # Sometimes you want to check an instance of a node against a list of + # classes to see what kind of behavior to perform. 
Usually this is done by + # calling `[cls1, cls2].include?(node.class)` or putting the node into a + # case statement and doing `case node; when cls1; when cls2; end`. Both of + # these approaches are relatively slow because of the constant lookups, + # method calls, and/or array allocations. + # + # Instead, you can call #type, which will return to you a symbol that you + # can use for comparison. This is faster than the other approaches because + # it uses a single integer comparison, but also because if you're on CRuby + # you can take advantage of the fact that case statements with all symbol + # keys will use a jump table. + def type + raise NoMethodError, "undefined method `type' for #{inspect}" + end + + # Similar to #type, this method returns a symbol that you can use for + # splitting on the type of the node without having to do a long === chain. + # Note that like #type, it will still be slower than using == for a single + # class, but should be faster in a case statement or an array comparison. + def self.type + raise NoMethodError, "undefined method `type' for #{inspect}" + end + end + <%- nodes.each do |node| -%> + + <%- node.each_comment_line do |line| -%> + #<%= line %> + <%- end -%> + class <%= node.name -%> < Node + # Initialize a new <%= node.name %> node. + def initialize(<%= ["source", "node_id", "location", "flags", *node.fields.map(&:name)].join(", ") %>) + @source = source + @node_id = node_id + @location = location + @flags = flags + <%- node.fields.each do |field| -%> + <%- if Prism::Template::CHECK_FIELD_KIND && field.respond_to?(:check_field_kind) -%> + raise "<%= node.name %>#<%= field.name %> was of unexpected type:\n#{<%= field.name %>.inspect}" unless <%= field.check_field_kind %> + <%- end -%> + @<%= field.name %> = <%= field.name %> + <%- end -%> + end + + # def accept: (Visitor visitor) -> void + def accept(visitor) + visitor.visit_<%= node.human %>(self) + end + + # def child_nodes: () -> Array[Node?] 
+ def child_nodes + [<%= node.fields.map { |field| + case field + when Prism::Template::NodeField, Prism::Template::OptionalNodeField then field.name + when Prism::Template::NodeListField then "*#{field.name}" + end + }.compact.join(", ") %>] + end + + # def each_child_node: () { (Prism::node) -> void } -> void | () -> Enumerator[Prism::node] + def each_child_node + return to_enum(:each_child_node) unless block_given? + + <%- node.fields.each do |field| -%> + <%- case field -%> + <%- when Prism::Template::NodeField -%> + yield <%= field.name %> + <%- when Prism::Template::OptionalNodeField -%> + yield <%= field.name %> if <%= field.name %> + <%- when Prism::Template::NodeListField -%> + <%= field.name %>.each { |node| yield node } + <%- end -%> + <%- end -%> + end + + # def compact_child_nodes: () -> Array[Node] + def compact_child_nodes + <%- if node.fields.any? { |field| field.is_a?(Prism::Template::OptionalNodeField) } -%> + compact = [] #: Array[Prism::node] + <%- node.fields.each do |field| -%> + <%- case field -%> + <%- when Prism::Template::NodeField -%> + compact << <%= field.name %> + <%- when Prism::Template::OptionalNodeField -%> + compact << <%= field.name %> if <%= field.name %> + <%- when Prism::Template::NodeListField -%> + compact.concat(<%= field.name %>) + <%- end -%> + <%- end -%> + compact + <%- else -%> + [<%= node.fields.map { |field| + case field + when Prism::Template::NodeField then field.name + when Prism::Template::NodeListField then "*#{field.name}" + end + }.compact.join(", ") %>] + <%- end -%> + end + + # def comment_targets: () -> Array[Node | Location] + def comment_targets + [<%= node.fields.map { |field| + case field + when Prism::Template::NodeField, Prism::Template::LocationField then field.name + when Prism::Template::OptionalNodeField, Prism::Template::NodeListField, Prism::Template::OptionalLocationField then "*#{field.name}" + end + }.compact.join(", ") %>] #: Array[Prism::node | Location] + end + + # def copy: (<%= 
(["?node_id: Integer", "?location: Location", "?flags: Integer"] + node.fields.map { |field| "?#{field.name}: #{field.rbs_class}" }).join(", ") %>) -> <%= node.name %> + def copy(<%= (["node_id", "location", "flags"] + node.fields.map(&:name)).map { |field| "#{field}: self.#{field}" }.join(", ") %>) + <%= node.name %>.new(<%= ["source", "node_id", "location", "flags", *node.fields.map(&:name)].join(", ") %>) + end + + # def deconstruct: () -> Array[Node?] + alias deconstruct child_nodes + + # def deconstruct_keys: (Array[Symbol] keys) -> { <%= (["node_id: Integer", "location: Location"] + node.fields.map { |field| "#{field.name}: #{field.rbs_class}" }).join(", ") %> } + def deconstruct_keys(keys) + { <%= (["node_id: node_id", "location: location"] + node.fields.map { |field| "#{field.name}: #{field.name}" }).join(", ") %> } + end + <%- if (node_flags = node.flags) -%> + <%- node_flags.values.each do |value| -%> + + # def <%= value.name.downcase %>?: () -> bool + def <%= value.name.downcase %>? + flags.anybits?(<%= node_flags.name %>::<%= value.name %>) + end + <%- end -%> + <%- end -%> + <%- node.fields.each do |field| -%> + + <%- if field.comment.nil? -%> + # attr_reader <%= field.name %>: <%= field.rbs_class %> + <%- else -%> + <%- field.each_comment_line do |line| -%> + #<%= line %> + <%- end -%> + <%- end -%> + <%- case field -%> + <%- when Prism::Template::LocationField -%> + def <%= field.name %> + location = @<%= field.name %> + return location if location.is_a?(Location) + @<%= field.name %> = Location.new(source, location >> 32, location & 0xFFFFFFFF) + end + + # Save the <%= field.name %> location using the given saved source so that + # it can be retrieved later. 
+ def save_<%= field.name %>(repository) + repository.enter(node_id, :<%= field.name %>) + end + <%- when Prism::Template::OptionalLocationField -%> + def <%= field.name %> + location = @<%= field.name %> + case location + when nil + nil + when Location + location + else + @<%= field.name %> = Location.new(source, location >> 32, location & 0xFFFFFFFF) + end + end + + # Save the <%= field.name %> location using the given saved source so that + # it can be retrieved later. + def save_<%= field.name %>(repository) + repository.enter(node_id, :<%= field.name %>) unless @<%= field.name %>.nil? + end + <%- else -%> + attr_reader :<%= field.name %> + <%- end -%> + <%- end -%> + <%- node.fields.each do |field| -%> + <%- case field -%> + <%- when Prism::Template::LocationField -%> + <%- raise unless field.name.end_with?("_loc") -%> + <%- next if node.fields.any? { |other| other.name == field.name.delete_suffix("_loc") } -%> + + # def <%= field.name.delete_suffix("_loc") %>: () -> String + def <%= field.name.delete_suffix("_loc") %> + <%= field.name %>.slice + end + <%- when Prism::Template::OptionalLocationField -%> + <%- raise unless field.name.end_with?("_loc") -%> + <%- next if node.fields.any? { |other| other.name == field.name.delete_suffix("_loc") } -%> + + # def <%= field.name.delete_suffix("_loc") %>: () -> String? + def <%= field.name.delete_suffix("_loc") %> + <%= field.name %>&.slice + end + <%- end -%> + <%- end -%> + + # def inspect -> String + def inspect + InspectVisitor.compose(self) + end + + # Return a symbol representation of this node type. See `Node#type`. + def type + :<%= node.human %> + end + + # Return a symbol representation of this node type. See `Node::type`. + def self.type + :<%= node.human %> + end + + # Implements case-equality for the node. This is effectively == but without + # comparing the value of locations. Locations are checked only for presence. 
+ def ===(other) + other.is_a?(<%= node.name %>)<%= " &&" if (fields = [*node.flags, *node.fields]).any? %> + <%- fields.each_with_index do |field, index| -%> + <%- if field.is_a?(Prism::Template::LocationField) || field.is_a?(Prism::Template::OptionalLocationField) -%> + (<%= field.name %>.nil? == other.<%= field.name %>.nil?)<%= " &&" if index != fields.length - 1 %> + <%- elsif field.is_a?(Prism::Template::NodeListField) || field.is_a?(Prism::Template::ConstantListField) -%> + (<%= field.name %>.length == other.<%= field.name %>.length) && + <%= field.name %>.zip(other.<%= field.name %>).all? { |left, right| left === right }<%= " &&" if index != fields.length - 1 %> + <%- elsif field.is_a?(Prism::Template::Flags) -%> + (flags === other.flags)<%= " &&" if index != fields.length - 1 %> + <%- else -%> + (<%= field.name %> === other.<%= field.name %>)<%= " &&" if index != fields.length - 1 %> + <%- end -%> + <%- end -%> + end + end + <%- end -%> + <%- flags.each do |flag| -%> + + # <%= flag.comment %> + module <%= flag.name %> + <%- flag.values.each_with_index do |value, index| -%> + # <%= value.comment %> + <%= value.name %> = 1 << <%= index + Prism::Template::COMMON_FLAGS_COUNT %> +<%= "\n" if value != flag.values.last -%> + <%- end -%> + end + <%- end -%> + + # The flags that are common to all nodes. + module NodeFlags + # A flag to indicate that the node is a candidate to emit a :line event + # through tracepoint when compiled. + NEWLINE = 1 + + # A flag to indicate that the value that the node represents is a value that + # can be determined at parse-time. 
+ STATIC_LITERAL = 2 + end +end diff --git a/prism/templates/lib/prism/reflection.rb.erb b/prism/templates/lib/prism/reflection.rb.erb new file mode 100644 index 0000000000..6c8b2f4d25 --- /dev/null +++ b/prism/templates/lib/prism/reflection.rb.erb @@ -0,0 +1,136 @@ +module Prism + # The Reflection module provides the ability to reflect on the structure of + # the syntax tree itself, as opposed to looking at a single syntax tree. This + # is useful in metaprogramming contexts. + module Reflection + # A field represents a single piece of data on a node. It is the base class + # for all other field types. + class Field + # The name of the field. + attr_reader :name + + # Initializes the field with the given name. + def initialize(name) + @name = name + end + end + + # A node field represents a single child node in the syntax tree. It + # resolves to a Prism::Node in Ruby. + class NodeField < Field + end + + # An optional node field represents a single child node in the syntax tree + # that may or may not be present. It resolves to either a Prism::Node or nil + # in Ruby. + class OptionalNodeField < Field + end + + # A node list field represents a list of child nodes in the syntax tree. It + # resolves to an array of Prism::Node instances in Ruby. + class NodeListField < Field + end + + # A constant field represents a constant value on a node. Effectively, it + # represents an identifier found within the source. It resolves to a symbol + # in Ruby. + class ConstantField < Field + end + + # An optional constant field represents a constant value on a node that may + # or may not be present. It resolves to either a symbol or nil in Ruby. + class OptionalConstantField < Field + end + + # A constant list field represents a list of constant values on a node. It + # resolves to an array of symbols in Ruby. + class ConstantListField < Field + end + + # A string field represents a string value on a node. 
It almost always + # represents the unescaped value of a string-like literal. It resolves to a + # string in Ruby. + class StringField < Field + end + + # A location field represents the location of some part of the node in the + # source code. For example, the location of a keyword or an operator. It + # resolves to a Prism::Location in Ruby. + class LocationField < Field + end + + # An optional location field represents the location of some part of the + # node in the source code that may or may not be present. It resolves to + # either a Prism::Location or nil in Ruby. + class OptionalLocationField < Field + end + + # An integer field represents an integer value. It is used to represent the + # value of an integer literal, the depth of local variables, and the number + # of a numbered reference. It resolves to an Integer in Ruby. + class IntegerField < Field + end + + # A float field represents a double-precision floating point value. It is + # used exclusively to represent the value of a floating point literal. It + # resolves to a Float in Ruby. + class FloatField < Field + end + + # A flags field represents a bitset of flags on a node. It resolves to an + # integer in Ruby. Note that the flags cannot be accessed directly on the + # node because the integer is kept private. Instead, the various flags in + # the bitset should be accessed through their query methods. + class FlagsField < Field + # The names of the flags in the bitset. + attr_reader :flags + + # Initializes the flags field with the given name and flags. + def initialize(name, flags) + super(name) + @flags = flags + end + end + + # Returns the fields for the given node. 
+ def self.fields_for(node) + case node.type + <%- nodes.each do |node| -%> + when :<%= node.human %> + [<%= [*node.flags, *node.fields].map { |field| + case field + when Prism::Template::NodeField + "NodeField.new(:#{field.name})" + when Prism::Template::OptionalNodeField + "OptionalNodeField.new(:#{field.name})" + when Prism::Template::NodeListField + "NodeListField.new(:#{field.name})" + when Prism::Template::ConstantField + "ConstantField.new(:#{field.name})" + when Prism::Template::OptionalConstantField + "OptionalConstantField.new(:#{field.name})" + when Prism::Template::ConstantListField + "ConstantListField.new(:#{field.name})" + when Prism::Template::StringField + "StringField.new(:#{field.name})" + when Prism::Template::LocationField + "LocationField.new(:#{field.name})" + when Prism::Template::OptionalLocationField + "OptionalLocationField.new(:#{field.name})" + when Prism::Template::UInt8Field, Prism::Template::UInt32Field, Prism::Template::IntegerField + "IntegerField.new(:#{field.name})" + when Prism::Template::DoubleField + "FloatField.new(:#{field.name})" + when Prism::Template::Flags + "FlagsField.new(:flags, [#{field.values.map { |value| ":#{value.name.downcase}?" }.join(", ")}])" + else + raise field.class.name + end + }.join(", ") %>] + <%- end -%> + else + raise "Unknown node type: #{node.type.inspect}" + end + end + end +end diff --git a/prism/templates/lib/prism/serialize.rb.erb b/prism/templates/lib/prism/serialize.rb.erb new file mode 100644 index 0000000000..6902df5c01 --- /dev/null +++ b/prism/templates/lib/prism/serialize.rb.erb @@ -0,0 +1,602 @@ +require "stringio" +require_relative "polyfill/unpack1" + +module Prism + # A module responsible for deserializing parse results. + module Serialize + # The major version of prism that we are expecting to find in the serialized + # strings. + MAJOR_VERSION = 1 + + # The minor version of prism that we are expecting to find in the serialized + # strings. 
+ MINOR_VERSION = 8 + + # The patch version of prism that we are expecting to find in the serialized + # strings. + PATCH_VERSION = 0 + + # Deserialize the dumped output from a request to parse or parse_file. + # + # The formatting of the source of this method is purposeful to illustrate + # the structure of the serialized data. + def self.load_parse(input, serialized, freeze) + input = input.dup + source = Source.for(input) + loader = Loader.new(source, serialized) + + loader.load_header + encoding = loader.load_encoding + start_line = loader.load_varsint + offsets = loader.load_line_offsets(freeze) + + source.replace_start_line(start_line) + source.replace_offsets(offsets) + + comments = loader.load_comments(freeze) + magic_comments = loader.load_magic_comments(freeze) + data_loc = loader.load_optional_location_object(freeze) + errors = loader.load_errors(encoding, freeze) + warnings = loader.load_warnings(encoding, freeze) + cpool_base = loader.load_uint32 + cpool_size = loader.load_varuint + + constant_pool = ConstantPool.new(input, serialized, cpool_base, cpool_size) + + node = loader.load_node(constant_pool, encoding, freeze) + loader.load_constant_pool(constant_pool) + raise unless loader.eof? + + result = ParseResult.new(node, comments, magic_comments, data_loc, errors, warnings, source) + result.freeze if freeze + + input.force_encoding(encoding) + + # This is an extremely niche use-case where the file was marked as binary + # but it contained UTF-8-encoded characters. In that case we will actually + # put it back to UTF-8 to give the location APIs the best chance of being + # correct. + if !input.ascii_only? && input.encoding == Encoding::BINARY + input.force_encoding(Encoding::UTF_8) + input.force_encoding(Encoding::BINARY) unless input.valid_encoding? + end + + if freeze + input.freeze + source.deep_freeze + end + + result + end + + # Deserialize the dumped output from a request to lex or lex_file. 
+ # + # The formatting of the source of this method is purposeful to illustrate + # the structure of the serialized data. + def self.load_lex(input, serialized, freeze) + source = Source.for(input) + loader = Loader.new(source, serialized) + + tokens = loader.load_tokens + encoding = loader.load_encoding + start_line = loader.load_varsint + offsets = loader.load_line_offsets(freeze) + + source.replace_start_line(start_line) + source.replace_offsets(offsets) + + comments = loader.load_comments(freeze) + magic_comments = loader.load_magic_comments(freeze) + data_loc = loader.load_optional_location_object(freeze) + errors = loader.load_errors(encoding, freeze) + warnings = loader.load_warnings(encoding, freeze) + raise unless loader.eof? + + result = LexResult.new(tokens, comments, magic_comments, data_loc, errors, warnings, source) + + tokens.each do |token| + token[0].value.force_encoding(encoding) + + if freeze + token[0].deep_freeze + token.freeze + end + end + + if freeze + source.deep_freeze + tokens.freeze + result.freeze + end + + result + end + + # Deserialize the dumped output from a request to parse_comments or + # parse_file_comments. + # + # The formatting of the source of this method is purposeful to illustrate + # the structure of the serialized data. + def self.load_parse_comments(input, serialized, freeze) + source = Source.for(input) + loader = Loader.new(source, serialized) + + loader.load_header + loader.load_encoding + start_line = loader.load_varsint + + source.replace_start_line(start_line) + + result = loader.load_comments(freeze) + raise unless loader.eof? + + source.deep_freeze if freeze + result + end + + # Deserialize the dumped output from a request to parse_lex or + # parse_lex_file. + # + # The formatting of the source of this method is purposeful to illustrate + # the structure of the serialized data. 
+ def self.load_parse_lex(input, serialized, freeze) + source = Source.for(input) + loader = Loader.new(source, serialized) + + tokens = loader.load_tokens + loader.load_header + encoding = loader.load_encoding + start_line = loader.load_varsint + offsets = loader.load_line_offsets(freeze) + + source.replace_start_line(start_line) + source.replace_offsets(offsets) + + comments = loader.load_comments(freeze) + magic_comments = loader.load_magic_comments(freeze) + data_loc = loader.load_optional_location_object(freeze) + errors = loader.load_errors(encoding, freeze) + warnings = loader.load_warnings(encoding, freeze) + cpool_base = loader.load_uint32 + cpool_size = loader.load_varuint + + constant_pool = ConstantPool.new(input, serialized, cpool_base, cpool_size) + + node = loader.load_node(constant_pool, encoding, freeze) + loader.load_constant_pool(constant_pool) + raise unless loader.eof? + + value = [node, tokens] + result = ParseLexResult.new(value, comments, magic_comments, data_loc, errors, warnings, source) + + tokens.each do |token| + token[0].value.force_encoding(encoding) + + if freeze + token[0].deep_freeze + token.freeze + end + end + + if freeze + source.deep_freeze + tokens.freeze + value.freeze + result.freeze + end + + result + end + + class ConstantPool # :nodoc: + attr_reader :size + + def initialize(input, serialized, base, size) + @input = input + @serialized = serialized + @base = base + @size = size + @pool = Array.new(size, nil) + end + + def get(index, encoding) + @pool[index] ||= + begin + offset = @base + index * 8 + start = @serialized.unpack1("L", offset: offset) + length = @serialized.unpack1("L", offset: offset + 4) + + if start.nobits?(1 << 31) + @input.byteslice(start, length).force_encoding(encoding).to_sym + else + @serialized.byteslice(start & ((1 << 31) - 1), length).force_encoding(encoding).to_sym + end + end + end + end + + if RUBY_ENGINE == "truffleruby" + # StringIO is synchronized and that adds a high overhead on TruffleRuby. 
+ class FastStringIO # :nodoc: + attr_accessor :pos + + def initialize(string) + @string = string + @pos = 0 + end + + def getbyte + byte = @string.getbyte(@pos) + @pos += 1 + byte + end + + def read(n) + slice = @string.byteslice(@pos, n) + @pos += n + slice + end + + def eof? + @pos >= @string.bytesize + end + end + else + FastStringIO = ::StringIO # :nodoc: + end + + class Loader # :nodoc: + attr_reader :input, :io, :source + + def initialize(source, serialized) + @input = source.source.dup + raise unless serialized.encoding == Encoding::BINARY + @io = FastStringIO.new(serialized) + @source = source + define_load_node_lambdas if RUBY_ENGINE != "ruby" + end + + def eof? + io.getbyte + io.eof? + end + + def load_constant_pool(constant_pool) + trailer = 0 + + constant_pool.size.times do |index| + start, length = io.read(8).unpack("L2") + trailer += length if start.anybits?(1 << 31) + end + + io.read(trailer) + end + + def load_header + raise "Invalid serialization" if io.read(5) != "PRISM" + raise "Invalid serialization" if io.read(3).unpack("C3") != [MAJOR_VERSION, MINOR_VERSION, PATCH_VERSION] + raise "Invalid serialization (location fields must be included but are not)" if io.getbyte != 0 + end + + def load_encoding + encoding = Encoding.find(io.read(load_varuint)) + @input = input.force_encoding(encoding).freeze + encoding + end + + def load_line_offsets(freeze) + offsets = Array.new(load_varuint) { load_varuint } + offsets.freeze if freeze + offsets + end + + def load_comments(freeze) + comments = + Array.new(load_varuint) do + comment = + case load_varuint + when 0 then InlineComment.new(load_location_object(freeze)) + when 1 then EmbDocComment.new(load_location_object(freeze)) + end + + comment.freeze if freeze + comment + end + + comments.freeze if freeze + comments + end + + def load_magic_comments(freeze) + magic_comments = + Array.new(load_varuint) do + magic_comment = + MagicComment.new( + load_location_object(freeze), + load_location_object(freeze) + ) 
+ + magic_comment.freeze if freeze + magic_comment + end + + magic_comments.freeze if freeze + magic_comments + end + + DIAGNOSTIC_TYPES = [ + <%- errors.each do |error| -%> + <%= error.name.downcase.to_sym.inspect %>, + <%- end -%> + <%- warnings.each do |warning| -%> + <%= warning.name.downcase.to_sym.inspect %>, + <%- end -%> + ].freeze + + private_constant :DIAGNOSTIC_TYPES + + def load_error_level + level = io.getbyte + + case level + when 0 + :syntax + when 1 + :argument + when 2 + :load + else + raise "Unknown level: #{level}" + end + end + + def load_errors(encoding, freeze) + errors = + Array.new(load_varuint) do + error = + ParseError.new( + DIAGNOSTIC_TYPES.fetch(load_varuint), + load_embedded_string(encoding), + load_location_object(freeze), + load_error_level + ) + + error.freeze if freeze + error + end + + errors.freeze if freeze + errors + end + + def load_warning_level + level = io.getbyte + + case level + when 0 + :default + when 1 + :verbose + else + raise "Unknown level: #{level}" + end + end + + def load_warnings(encoding, freeze) + warnings = + Array.new(load_varuint) do + warning = + ParseWarning.new( + DIAGNOSTIC_TYPES.fetch(load_varuint), + load_embedded_string(encoding), + load_location_object(freeze), + load_warning_level + ) + + warning.freeze if freeze + warning + end + + warnings.freeze if freeze + warnings + end + + def load_tokens + tokens = [] + + while (type = TOKEN_TYPES.fetch(load_varuint)) + start = load_varuint + length = load_varuint + lex_state = load_varuint + + location = Location.new(@source, start, length) + token = Token.new(@source, type, location.slice, location) + + tokens << [token, lex_state] + end + + tokens + end + + # variable-length integer using https://en.wikipedia.org/wiki/LEB128 + # This is also what protobuf uses: https://protobuf.dev/programming-guides/encoding/#varints + def load_varuint + n = io.getbyte + if n < 128 + n + else + n -= 128 + shift = 0 + while (b = io.getbyte) >= 128 + n += (b - 128) << 
(shift += 7) + end + n + (b << (shift + 7)) + end + end + + def load_varsint + n = load_varuint + (n >> 1) ^ (-(n & 1)) + end + + def load_integer + negative = io.getbyte != 0 + length = load_varuint + + value = 0 + length.times { |index| value |= (load_varuint << (index * 32)) } + + value = -value if negative + value + end + + def load_double + io.read(8).unpack1("D") + end + + def load_uint32 + io.read(4).unpack1("L") + end + + def load_optional_node(constant_pool, encoding, freeze) + if io.getbyte != 0 + io.pos -= 1 + load_node(constant_pool, encoding, freeze) + end + end + + def load_embedded_string(encoding) + io.read(load_varuint).force_encoding(encoding).freeze + end + + def load_string(encoding) + case (type = io.getbyte) + when 1 + input.byteslice(load_varuint, load_varuint).force_encoding(encoding).freeze + when 2 + load_embedded_string(encoding) + else + raise "Unknown serialized string type: #{type}" + end + end + + def load_location_object(freeze) + location = Location.new(source, load_varuint, load_varuint) + location.freeze if freeze + location + end + + def load_location(freeze) + return load_location_object(freeze) if freeze + (load_varuint << 32) | load_varuint + end + + def load_optional_location(freeze) + load_location(freeze) if io.getbyte != 0 + end + + def load_optional_location_object(freeze) + load_location_object(freeze) if io.getbyte != 0 + end + + def load_constant(constant_pool, encoding) + index = load_varuint + constant_pool.get(index - 1, encoding) + end + + def load_optional_constant(constant_pool, encoding) + index = load_varuint + constant_pool.get(index - 1, encoding) if index != 0 + end + + if RUBY_ENGINE == "ruby" + def load_node(constant_pool, encoding, freeze) + type = io.getbyte + node_id = load_varuint + location = load_location(freeze) + value = case type + <%- nodes.each_with_index do |node, index| -%> + when <%= index + 1 %> then + <%- if node.needs_serialized_length? 
-%> + load_uint32 + <%- end -%> + <%= node.name %>.new(<%= ["source", "node_id", "location", "load_varuint", *node.fields.map { |field| + case field + when Prism::Template::NodeField then "load_node(constant_pool, encoding, freeze)" + when Prism::Template::OptionalNodeField then "load_optional_node(constant_pool, encoding, freeze)" + when Prism::Template::StringField then "load_string(encoding)" + when Prism::Template::NodeListField then "Array.new(load_varuint) { load_node(constant_pool, encoding, freeze) }.tap { |nodes| nodes.freeze if freeze }" + when Prism::Template::ConstantField then "load_constant(constant_pool, encoding)" + when Prism::Template::OptionalConstantField then "load_optional_constant(constant_pool, encoding)" + when Prism::Template::ConstantListField then "Array.new(load_varuint) { load_constant(constant_pool, encoding) }.tap { |constants| constants.freeze if freeze }" + when Prism::Template::LocationField then "load_location(freeze)" + when Prism::Template::OptionalLocationField then "load_optional_location(freeze)" + when Prism::Template::UInt8Field then "io.getbyte" + when Prism::Template::UInt32Field then "load_varuint" + when Prism::Template::IntegerField then "load_integer" + when Prism::Template::DoubleField then "load_double" + else raise + end + }].join(", ") -%>) + <%- end -%> + end + + value.freeze if freeze + value + end + else + def load_node(constant_pool, encoding, freeze) + @load_node_lambdas[io.getbyte].call(constant_pool, encoding, freeze) + end + + def define_load_node_lambdas + @load_node_lambdas = [ + nil, + <%- nodes.each do |node| -%> + -> (constant_pool, encoding, freeze) { + node_id = load_varuint + location = load_location(freeze) + <%- if node.needs_serialized_length? 
-%> + load_uint32 + <%- end -%> + value = <%= node.name %>.new(<%= ["source", "node_id", "location", "load_varuint", *node.fields.map { |field| + case field + when Prism::Template::NodeField then "load_node(constant_pool, encoding, freeze)" + when Prism::Template::OptionalNodeField then "load_optional_node(constant_pool, encoding, freeze)" + when Prism::Template::StringField then "load_string(encoding)" + when Prism::Template::NodeListField then "Array.new(load_varuint) { load_node(constant_pool, encoding, freeze) }" + when Prism::Template::ConstantField then "load_constant(constant_pool, encoding)" + when Prism::Template::OptionalConstantField then "load_optional_constant(constant_pool, encoding)" + when Prism::Template::ConstantListField then "Array.new(load_varuint) { load_constant(constant_pool, encoding) }" + when Prism::Template::LocationField then "load_location(freeze)" + when Prism::Template::OptionalLocationField then "load_optional_location(freeze)" + when Prism::Template::UInt8Field then "io.getbyte" + when Prism::Template::UInt32Field then "load_varuint" + when Prism::Template::IntegerField then "load_integer" + when Prism::Template::DoubleField then "load_double" + else raise + end + }].join(", ") -%>) + value.freeze if freeze + value + }, + <%- end -%> + ] + end + end + end + + # The token types that can be indexed by their enum values. + TOKEN_TYPES = [ + nil, + <%- tokens.each do |token| -%> + <%= token.name.to_sym.inspect %>, + <%- end -%> + ].freeze + + private_constant :MAJOR_VERSION, :MINOR_VERSION, :PATCH_VERSION + private_constant :ConstantPool, :FastStringIO, :Loader, :TOKEN_TYPES + end + + private_constant :Serialize +end diff --git a/prism/templates/lib/prism/visitor.rb.erb b/prism/templates/lib/prism/visitor.rb.erb new file mode 100644 index 0000000000..76f907724f --- /dev/null +++ b/prism/templates/lib/prism/visitor.rb.erb @@ -0,0 +1,55 @@ +module Prism + # A class that knows how to walk down the tree. 
None of the individual visit + # methods are implemented on this visitor, so it forces the consumer to + # implement each one that they need. For a default implementation that + # continues walking the tree, see the Visitor class. + class BasicVisitor + # Calls `accept` on the given node if it is not `nil`, which in turn should + # call back into this visitor by calling the appropriate `visit_*` method. + def visit(node) + # @type self: _Visitor + node&.accept(self) + end + + # Visits each node in `nodes` by calling `accept` on each one. + def visit_all(nodes) + # @type self: _Visitor + nodes.each { |node| node&.accept(self) } + end + + # Visits the child nodes of `node` by calling `accept` on each one. + def visit_child_nodes(node) + # @type self: _Visitor + node.each_child_node { |node| node.accept(self) } + end + end + + # A visitor is a class that provides a default implementation for every accept + # method defined on the nodes. This means it can walk a tree without the + # caller needing to define any special handling. This allows you to handle a + # subset of the tree, while still walking the whole tree. 
+ # + # For example, to find all of the method calls that call the `foo` method, you + # could write: + # + # class FooCalls < Prism::Visitor + # def visit_call_node(node) + # if node.name == :foo + # # Do something with the node + # end + # + # # Call super so that the visitor continues walking the tree + # super + # end + # end + # + class Visitor < BasicVisitor + <%- nodes.each_with_index do |node, index| -%> +<%= "\n" if index != 0 -%> + # Visit a <%= node.name %> node + def visit_<%= node.human %>(node) + node.each_child_node { |node| node.accept(self) } + end + <%- end -%> + end +end diff --git a/prism/templates/src/diagnostic.c.erb b/prism/templates/src/diagnostic.c.erb new file mode 100644 index 0000000000..121dd4b2b6 --- /dev/null +++ b/prism/templates/src/diagnostic.c.erb @@ -0,0 +1,526 @@ +#include "prism/diagnostic.h" + +#define PM_DIAGNOSTIC_ID_MAX <%= errors.length + warnings.length %> + +/** This struct holds the data for each diagnostic. */ +typedef struct { + /** The message associated with the diagnostic. */ + const char* message; + + /** The level associated with the diagnostic. */ + uint8_t level; +} pm_diagnostic_data_t; + +/** + * ## Message composition + * + * When composing an error message, use sentence fragments. + * + * Try describing the property of the code that caused the error, rather than + * the rule that is being violated. It may help to use a fragment that completes + * a sentence beginning, "the parser encountered (a) ...". If appropriate, add a + * description of the rule violation (or other helpful context) after a + * semicolon. + * + * For example:, instead of "control escape sequence cannot be doubled", prefer: + * + * > "invalid control escape sequence; control cannot be repeated" + * + * In some cases, where the failure is more general or syntax expectations are + * violated, it may make more sense to use a fragment that completes a sentence + * beginning, "the parser ...". 
+ * + * For example: + * + * > "expected an expression after `(`" + * > "cannot parse the expression" + * + * ## Message style guide + * + * - Use articles like "a", "an", and "the" when appropriate. + * - e.g., prefer "cannot parse the expression" to "cannot parse expression". + * - Use the common name for tokens and nodes. + * - e.g., prefer "keyword splat" to "assoc splat" + * - e.g., prefer "embedded document" to "embdoc" + * - Do not capitalize the initial word of the message. + * - Use backticks around token literals + * - e.g., "expected a `=>` between the hash key and value" + * - Do not use `.` or other punctuation at the end of the message. + * - Do not use contractions like "can't". Prefer "cannot" to "can not". + * - For tokens that can have multiple meanings, reference the token and its meaning. + * - e.g., "`*` splat argument" is clearer and more complete than "splat argument" or "`*` argument" + * + * ## Error names (PM_ERR_*) + * + * - When appropriate, prefer node name to token name. + * - e.g., prefer "SPLAT" to "STAR" in the context of argument parsing. + * - Prefer token name to common name. + * - e.g., prefer "STAR" to "ASTERISK". + * - Try to order the words in the name from more general to more specific. + * - e.g., "INVALID_NUMBER_DECIMAL" is better than "DECIMAL_INVALID_NUMBER". + * - When in doubt, look for similar patterns and name them so that they are grouped when lexically + * sorted. See PM_ERR_ARGUMENT_NO_FORWARDING_* for an example. + * + * ## Level + * + * For errors, the levels are: + * + * * `PM_ERROR_LEVEL_SYNTAX` - Errors that should raise SyntaxError. + * * `PM_ERROR_LEVEL_ARGUMENT` - Errors that should raise ArgumentError. + * * `PM_ERROR_LEVEL_LOAD` - Errors that should raise LoadError. + * + * For warnings, the levels are: + * + * * `PM_WARNING_LEVEL_DEFAULT` - Warnings that appear for `ruby -c -e 'code'`. + * * `PM_WARNING_LEVEL_VERBOSE` - Warnings that appear with `-w`, as in `ruby -w -c -e 'code'`. 
+ */ +static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { + // Special error that can be replaced + [PM_ERR_CANNOT_PARSE_EXPRESSION] = { "cannot parse the expression", PM_ERROR_LEVEL_SYNTAX }, + + // Errors that should raise argument errors + [PM_ERR_INVALID_ENCODING_MAGIC_COMMENT] = { "unknown or invalid encoding in the magic comment", PM_ERROR_LEVEL_ARGUMENT }, + + // Errors that should raise load errors + [PM_ERR_SCRIPT_NOT_FOUND] = { "no Ruby script found in input", PM_ERROR_LEVEL_LOAD }, + + // Errors that should raise syntax errors + [PM_ERR_ALIAS_ARGUMENT] = { "invalid argument being passed to `alias`; expected a bare word, symbol, constant, or global variable", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ALIAS_ARGUMENT_NUMBERED_REFERENCE] = { "invalid argument being passed to `alias`; can't make alias for the number variables", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_AMPAMPEQ_MULTI_ASSIGN] = { "unexpected `&&=` in a multiple assignment", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_AFTER_BLOCK] = { "unexpected argument after a block argument", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_AFTER_FORWARDING_ELLIPSES] = { "unexpected argument after `...`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_BARE_HASH] = { "unexpected bare hash argument", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_BLOCK_MULTI] = { "both block arg and actual block given; only one block is allowed", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_CONFLICT_AMPERSAND] = { "unexpected `&`; anonymous block parameter is also used within block", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_CONFLICT_STAR] = { "unexpected `*`; anonymous rest parameter is also used within block", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_CONFLICT_STAR_STAR] = { "unexpected `**`; anonymous keyword rest parameter is also used within block", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_FORMAL_CLASS] = { "invalid formal argument; formal argument cannot be a class variable", PM_ERROR_LEVEL_SYNTAX }, + 
[PM_ERR_ARGUMENT_FORMAL_CONSTANT] = { "invalid formal argument; formal argument cannot be a constant", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_FORMAL_GLOBAL] = { "invalid formal argument; formal argument cannot be a global variable", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_FORMAL_IVAR] = { "invalid formal argument; formal argument cannot be an instance variable", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_FORWARDING_UNBOUND] = { "unexpected `...` in an non-parenthesized call", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_NO_FORWARDING_AMPERSAND] = { "unexpected `&`; no anonymous block parameter", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES] = { "unexpected ... when the parent method is not forwarding", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_NO_FORWARDING_STAR] = { "unexpected `*`; no anonymous rest parameter", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_NO_FORWARDING_STAR_STAR] = { "unexpected `**`; no anonymous keyword rest parameter", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_SPLAT_AFTER_ASSOC_SPLAT] = { "unexpected `*` splat argument after a `**` keyword splat argument", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_SPLAT_AFTER_SPLAT] = { "unexpected `*` splat argument after a `*` splat argument", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_TERM_PAREN] = { "unexpected %s; expected a `)` to close the arguments", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_UNEXPECTED_BLOCK] = { "unexpected '{' after a method call without parenthesis", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARRAY_ELEMENT] = { "expected an element for the array", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARRAY_EXPRESSION] = { "expected an expression for the array element", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARRAY_EXPRESSION_AFTER_STAR] = { "expected an expression after `*` in the array", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARRAY_SEPARATOR] = { "unexpected %s; expected a `,` separator for the array elements", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARRAY_TERM] = { "unexpected %s; 
expected a `]` to close the array", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_BEGIN_LONELY_ELSE] = { "unexpected `else` in `begin` block; else without rescue is useless", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_BEGIN_TERM] = { "expected an `end` to close the `begin` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_BEGIN_UPCASE_BRACE] = { "expected a `{` after `BEGIN`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_BEGIN_UPCASE_TERM] = { "expected a `}` to close the `BEGIN` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_BEGIN_UPCASE_TOPLEVEL] = { "BEGIN is permitted only at toplevel", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_BLOCK_PARAM_LOCAL_VARIABLE] = { "expected a local variable name in the block parameters", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_BLOCK_PARAM_PIPE_TERM] = { "expected the block parameters to end with `|`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_BLOCK_TERM_BRACE] = { "expected a block beginning with `{` to end with `}`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_BLOCK_TERM_END] = { "expected a block beginning with `do` to end with `end`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CANNOT_PARSE_STRING_PART] = { "cannot parse the string part", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CASE_EXPRESSION_AFTER_CASE] = { "expected an expression after `case`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CASE_EXPRESSION_AFTER_WHEN] = { "expected an expression after `when`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CASE_MATCH_MISSING_PREDICATE] = { "expected a predicate for a case matching statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CASE_MISSING_CONDITIONS] = { "expected a `when` or `in` clause after `case`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CASE_TERM] = { "expected an `end` to close the `case` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CLASS_IN_METHOD] = { "unexpected class definition in method body", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CLASS_NAME] = { "unexpected constant path after `class`; class/module name must be CONSTANT", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CLASS_SUPERCLASS] = { "expected a superclass after `<`", 
PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CLASS_TERM] = { "expected an `end` to close the `class` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CLASS_UNEXPECTED_END] = { "unexpected `end`, expecting ';' or '\\n'", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CLASS_VARIABLE_BARE] = { "'@@' without identifiers is not allowed as a class variable name", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CONDITIONAL_ELSIF_PREDICATE] = { "expected a predicate expression for the `elsif` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CONDITIONAL_IF_PREDICATE] = { "expected a predicate expression for the `if` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CONDITIONAL_PREDICATE_TERM] = { "expected `then` or `;` or '\\n'", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CONDITIONAL_TERM] = { "expected an `end` to close the conditional clause", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CONDITIONAL_TERM_ELSE] = { "expected an `end` to close the `else` clause", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CONDITIONAL_UNLESS_PREDICATE] = { "expected a predicate expression for the `unless` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CONDITIONAL_UNTIL_PREDICATE] = { "expected a predicate expression for the `until` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CONDITIONAL_WHILE_PREDICATE] = { "expected a predicate expression for the `while` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT] = { "expected a constant after the `::` operator", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_DEF_ENDLESS] = { "could not parse the endless method body", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_DEF_ENDLESS_PARAMETERS] = { "could not parse the endless method parameters", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_DEF_ENDLESS_SETTER] = { "invalid method name; a setter method cannot be defined in an endless method definition", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_DEF_NAME] = { "unexpected %s; expected a method name", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_DEF_PARAMS_TERM] = { "expected a delimiter to close the parameters", PM_ERROR_LEVEL_SYNTAX }, + 
[PM_ERR_DEF_PARAMS_TERM_PAREN] = { "unexpected %s; expected a `)` to close the parameters", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_DEF_RECEIVER] = { "expected a receiver for the method definition", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_DEF_RECEIVER_TERM] = { "expected a `.` or `::` after the receiver in a method definition", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_DEF_TERM] = { "expected an `end` to close the `def` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_DEFINED_EXPRESSION] = { "expected an expression after `defined?`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EMBDOC_TERM] = { "embedded document meets end of file", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EMBEXPR_END] = { "expected a `}` to close the embedded expression", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EMBVAR_INVALID] = { "invalid embedded variable", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_END_UPCASE_BRACE] = { "expected a `{` after `END`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_END_UPCASE_TERM] = { "expected a `}` to close the `END` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ESCAPE_INVALID_CONTROL] = { "Invalid escape character syntax", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT] = { "invalid control escape sequence; control cannot be repeated", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ESCAPE_INVALID_HEXADECIMAL] = { "invalid hex escape sequence", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ESCAPE_INVALID_META] = { "Invalid escape character syntax", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ESCAPE_INVALID_META_REPEAT] = { "invalid meta escape sequence; meta cannot be repeated", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ESCAPE_INVALID_UNICODE] = { "invalid Unicode escape sequence", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ESCAPE_INVALID_UNICODE_CM_FLAGS] = { "invalid Unicode escape sequence; Unicode cannot be combined with control or meta flags", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ESCAPE_INVALID_UNICODE_LIST] = { "invalid Unicode list: %.*s", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL] = { "invalid Unicode escape sequence; 
Multiple codepoints at single character literal are disallowed", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ESCAPE_INVALID_UNICODE_LONG] = { "invalid Unicode escape sequence; maximum length is 6 digits", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ESCAPE_INVALID_UNICODE_SHORT] = { "too short escape sequence: %.*s", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ESCAPE_INVALID_UNICODE_TERM] = { "unterminated Unicode escape", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_ARGUMENT] = { "unexpected %s; expected an argument", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_EOL_AFTER_STATEMENT] = { "unexpected %s, expecting end-of-input", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ] = { "expected an expression after `&&=`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ] = { "expected an expression after `||=`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_EXPRESSION_AFTER_COMMA] = { "expected an expression after `,`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL] = { "expected an expression after `=`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_EXPRESSION_AFTER_LESS_LESS] = { "expected an expression after `<<`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_EXPRESSION_AFTER_LPAREN] = { "expected an expression after `(`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR] = { "unexpected %s; expected an expression after the operator", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT] = { "expected an expression after `*` splat in an argument", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH] = { "expected an expression after `**` in a hash", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_EXPRESSION_AFTER_STAR] = { "expected an expression after `*`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_FOR_DELIMITER] = { "unexpected %s; expected a 'do', newline, or ';' after the 'for' loop collection", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_IDENT_REQ_PARAMETER] = { "expected an identifier for the required 
parameter", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_IN_DELIMITER] = { "expected a delimiter after the patterns of an `in` clause", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_LPAREN_AFTER_NOT_LPAREN] = { "expected a `(` immediately after `not`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_LPAREN_AFTER_NOT_OTHER] = { "expected a `(` after `not`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_LPAREN_REQ_PARAMETER] = { "expected a `(` to start a required parameter", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_MESSAGE] = { "unexpected %s; expecting a message to send to the receiver", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_RBRACKET] = { "expected a matching `]`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_RPAREN] = { "expected a matching `)`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_RPAREN_AFTER_MULTI] = { "expected a `)` after multiple assignment", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_RPAREN_REQ_PARAMETER] = { "expected a `)` to end a required parameter", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_SINGLETON_CLASS_DELIMITER] = { "unexpected %s; expected a newline or a ';' after the singleton class", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_STRING_CONTENT] = { "expected string content after opening string delimiter", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_WHEN_DELIMITER] = { "expected a delimiter after the predicates of a `when` clause", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPRESSION_BARE_HASH] = { "unexpected bare hash in expression", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPRESSION_NOT_WRITABLE] = { "unexpected '='; target cannot be written", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPRESSION_NOT_WRITABLE_ENCODING] = { "Can't assign to __ENCODING__", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPRESSION_NOT_WRITABLE_FALSE] = { "Can't assign to false", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPRESSION_NOT_WRITABLE_FILE] = { "Can't assign to __FILE__", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPRESSION_NOT_WRITABLE_LINE] = { "Can't assign to __LINE__", PM_ERROR_LEVEL_SYNTAX }, + 
[PM_ERR_EXPRESSION_NOT_WRITABLE_NIL] = { "Can't assign to nil", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPRESSION_NOT_WRITABLE_NUMBERED] = { "Can't assign to numbered parameter %.2s", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPRESSION_NOT_WRITABLE_SELF] = { "Can't change the value of self", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPRESSION_NOT_WRITABLE_TRUE] = { "Can't assign to true", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_FLOAT_PARSE] = { "could not parse the float '%.*s'", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_FOR_COLLECTION] = { "expected a collection after the `in` in a `for` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_FOR_INDEX] = { "expected an index after `for`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_FOR_IN] = { "expected an `in` after the index in a `for` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_FOR_TERM] = { "expected an `end` to close the `for` loop", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_GLOBAL_VARIABLE_BARE] = { "'$' without identifiers is not allowed as a global variable name", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_HASH_EXPRESSION_AFTER_LABEL] = { "expected an expression after the label in a hash", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_HASH_KEY] = { "unexpected %s, expecting '}' or a key in the hash literal", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_HASH_ROCKET] = { "expected a `=>` between the hash key and value", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_HASH_TERM] = { "expected a `}` to close the hash literal", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_HASH_VALUE] = { "unexpected %s; expected a value in the hash literal", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_HEREDOC_IDENTIFIER] = { "unterminated here document identifier", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_HEREDOC_TERM] = { "unterminated heredoc; can't find string \"%.*s\" anywhere before EOF", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INCOMPLETE_QUESTION_MARK] = { "incomplete expression at `?`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INCOMPLETE_VARIABLE_CLASS_3_3] = { "`%.*s' is not allowed as a class variable name", PM_ERROR_LEVEL_SYNTAX }, + 
[PM_ERR_INCOMPLETE_VARIABLE_CLASS] = { "'%.*s' is not allowed as a class variable name", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INCOMPLETE_VARIABLE_INSTANCE_3_3] = { "`%.*s' is not allowed as an instance variable name", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INCOMPLETE_VARIABLE_INSTANCE] = { "'%.*s' is not allowed as an instance variable name", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INSTANCE_VARIABLE_BARE] = { "'@' without identifiers is not allowed as an instance variable name", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_BLOCK_EXIT] = { "Invalid %s", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_COMMA] = { "invalid comma", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_ESCAPE_CHARACTER] = { "Invalid escape character syntax", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_FLOAT_EXPONENT] = { "invalid exponent", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_LOCAL_VARIABLE_READ] = { "identifier %.*s is not valid to get", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_LOCAL_VARIABLE_WRITE] = { "identifier %.*s is not valid to set", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_NUMBER_BINARY] = { "invalid binary number; numeric literal without digits", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_NUMBER_DECIMAL] = { "invalid decimal number; numeric literal without digits", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_NUMBER_FRACTION] = { "unexpected fraction part after numeric literal", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_NUMBER_HEXADECIMAL] = { "invalid hexadecimal number; numeric literal without digits", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_NUMBER_OCTAL] = { "invalid octal number; numeric literal without digits", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_NUMBER_UNDERSCORE_INNER] = { "invalid underscore placement in number", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_NUMBER_UNDERSCORE_TRAILING] = { "trailing '_' in number", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_CHARACTER] = { "Invalid char '\\x%02X' in expression", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_MULTIBYTE_CHAR] = { "invalid 
multibyte char (%s)", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_MULTIBYTE_CHARACTER] = { "invalid multibyte character 0x%X", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_MULTIBYTE_ESCAPE] = { "invalid multibyte escape: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_PRINTABLE_CHARACTER] = { "invalid character `%c`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_PERCENT] = { "unknown type of %string", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_PERCENT_EOF] = { "unterminated quoted string meets end of file", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_RETRY_AFTER_ELSE] = { "Invalid retry after else", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_RETRY_AFTER_ENSURE] = { "Invalid retry after ensure", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_RETRY_WITHOUT_RESCUE] = { "Invalid retry without rescue", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_SYMBOL] = { "invalid symbol", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_VARIABLE_GLOBAL_3_3] = { "`%.*s' is not allowed as a global variable name", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_VARIABLE_GLOBAL] = { "'%.*s' is not allowed as a global variable name", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_INVALID_YIELD] = { "Invalid yield", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_IT_NOT_ALLOWED_NUMBERED] = { "'it' is not allowed when a numbered parameter is already used", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_IT_NOT_ALLOWED_ORDINARY] = { "'it' is not allowed when an ordinary parameter is defined", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_LAMBDA_OPEN] = { "expected a `do` keyword or a `{` to open the lambda block", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_LAMBDA_TERM_BRACE] = { "expected a lambda block beginning with `{` to end with `}`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_LAMBDA_TERM_END] = { "expected a lambda block beginning with `do` to end with `end`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_LIST_I_LOWER_ELEMENT] = { "expected a symbol in a `%i` list", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_LIST_I_LOWER_TERM] = { "unterminated list; expected a closing delimiter for 
the `%i`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_LIST_I_UPPER_ELEMENT] = { "expected a symbol in a `%I` list", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_LIST_I_UPPER_TERM] = { "unterminated list; expected a closing delimiter for the `%I`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_LIST_W_LOWER_ELEMENT] = { "expected a string in a `%w` list", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_LIST_W_LOWER_TERM] = { "unterminated list; expected a closing delimiter for the `%w`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_LIST_W_UPPER_ELEMENT] = { "expected a string in a `%W` list", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_LIST_W_UPPER_TERM] = { "unterminated list; expected a closing delimiter for the `%W`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_MALLOC_FAILED] = { "failed to allocate memory", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_MIXED_ENCODING] = { "UTF-8 mixed within %s source", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_MODULE_IN_METHOD] = { "unexpected module definition in method body", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_MODULE_NAME] = { "unexpected constant path after `module`; class/module name must be CONSTANT", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_MODULE_TERM] = { "expected an `end` to close the `module` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_MULTI_ASSIGN_MULTI_SPLATS] = { "multiple splats in multiple assignment", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_MULTI_ASSIGN_UNEXPECTED_REST] = { "unexpected '%.*s' resulting in multiple splats in multiple assignment", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_NESTING_TOO_DEEP] = { "nesting too deep", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_NO_LOCAL_VARIABLE] = { "%.*s: no such local variable", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_NON_ASSOCIATIVE_OPERATOR] = { "unexpected %s; %s is a non-associative operator", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_NOT_EXPRESSION] = { "expected an expression after `not`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_NUMBER_LITERAL_UNDERSCORE] = { "number literal ending with a `_`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_NUMBERED_PARAMETER_INNER_BLOCK] = { "numbered parameter is 
already used in inner block", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_NUMBERED_PARAMETER_IT] = { "numbered parameters are not allowed when 'it' is already used", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_NUMBERED_PARAMETER_ORDINARY] = { "numbered parameters are not allowed when an ordinary parameter is defined", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_NUMBERED_PARAMETER_OUTER_BLOCK] = { "numbered parameter is already used in outer block", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_OPERATOR_MULTI_ASSIGN] = { "unexpected operator for a multiple assignment", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_OPERATOR_WRITE_ARGUMENTS] = { "unexpected operator after a call with arguments", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_OPERATOR_WRITE_BLOCK] = { "unexpected operator after a call with a block", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_ASSOC_SPLAT_MULTI] = { "unexpected multiple `**` splat parameters", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_BLOCK_MULTI] = { "multiple block parameters; only one block is allowed", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_CIRCULAR] = { "circular argument reference - %.*s", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_FORWARDING_AFTER_REST] = { "... 
after rest argument", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_METHOD_NAME] = { "unexpected name for a parameter", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_NAME_DUPLICATED] = { "duplicated argument name", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_NO_DEFAULT] = { "expected a default value for the parameter", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_NO_DEFAULT_KW] = { "expected a default value for the keyword parameter", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_NUMBERED_RESERVED] = { "%.2s is reserved for numbered parameters", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_ORDER] = { "unexpected parameter order", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_SPLAT_MULTI] = { "unexpected multiple `*` splat parameters", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_STAR] = { "unexpected parameter `*`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_UNEXPECTED_FWD] = { "unexpected `...` in parameters", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_WILD_LOOSE_COMMA] = { "unexpected `,` in parameters", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PARAMETER_UNEXPECTED_NO_KW] = { "unexpected **nil; no keywords marker disallowed after keywords", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_ARRAY_MULTIPLE_RESTS] = { "unexpected multiple '*' rest patterns in an array pattern", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_CAPTURE_DUPLICATE] = { "duplicated variable name", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_CAPTURE_IN_ALTERNATIVE] = { "variable capture in alternative pattern", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_EXPRESSION_AFTER_BRACKET] = { "expected a pattern expression after the `[` operator", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_EXPRESSION_AFTER_COMMA] = { "expected a pattern expression after `,`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_EXPRESSION_AFTER_HROCKET] = { "expected a pattern expression after `=>`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_EXPRESSION_AFTER_IN] = { "expected a pattern expression after the `in` keyword", PM_ERROR_LEVEL_SYNTAX 
}, + [PM_ERR_PATTERN_EXPRESSION_AFTER_KEY] = { "expected a pattern expression after the key", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_EXPRESSION_AFTER_PAREN] = { "expected a pattern expression after the `(` operator", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_EXPRESSION_AFTER_PIN] = { "expected a pattern expression after the `^` pin operator", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_EXPRESSION_AFTER_PIPE] = { "expected a pattern expression after the `|` operator", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE] = { "expected a pattern expression after the range operator", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_EXPRESSION_AFTER_REST] = { "unexpected pattern expression after the `**` expression", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_FIND_MISSING_INNER] = { "find patterns need at least one required inner pattern", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_HASH_IMPLICIT] = { "unexpected implicit hash in pattern; use '{' to delineate", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_HASH_KEY] = { "unexpected %s; expected a key in the hash pattern", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_HASH_KEY_DUPLICATE] = { "duplicated key name", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_HASH_KEY_INTERPOLATED] = { "symbol literal with interpolation is not allowed", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_HASH_KEY_LABEL] = { "expected a label as the key in the hash pattern", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_HASH_KEY_LOCALS] = { "key must be valid as local variables", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_IDENT_AFTER_HROCKET] = { "expected an identifier after the `=>` operator", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_LABEL_AFTER_COMMA] = { "expected a label after the `,` in the hash pattern", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_REST] = { "unexpected rest pattern", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_TERM_BRACE] = { "expected a `}` to close the pattern expression", PM_ERROR_LEVEL_SYNTAX }, + 
[PM_ERR_PATTERN_TERM_BRACKET] = { "expected a `]` to close the pattern expression", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_TERM_PAREN] = { "expected a `)` to close the pattern expression", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN] = { "unexpected `||=` in a multiple assignment", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH] = { "regexp encoding option '%c' differs from source encoding '%s'", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING] = { "incompatible character encoding: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_INVALID_UNICODE_RANGE] = { "invalid Unicode range: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_PARSE_ERROR] = { "%s", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_UNKNOWN_OPTIONS] = { "unknown regexp %s - %.*s", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_TERM] = { "unterminated regexp meets end of file; expected a closing delimiter", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP] = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_RESCUE_EXPRESSION] = { "expected a rescued expression", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_RESCUE_MODIFIER_VALUE] = { "expected a value after the `rescue` modifier", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_RESCUE_TERM] = { "expected a closing delimiter for the `rescue` clause", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_RESCUE_VARIABLE] = { "expected an exception variable after `=>` in a rescue statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_RETURN_INVALID] = { "Invalid return in class/module body", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_SINGLETON_FOR_LITERALS] = { "cannot define singleton method for literals", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_STATEMENT_ALIAS] = { "unexpected an `alias` at a non-statement position", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_STATEMENT_POSTEXE_END] = 
{ "unexpected an `END` at a non-statement position", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_STATEMENT_PREEXE_BEGIN] = { "unexpected a `BEGIN` at a non-statement position", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_STATEMENT_UNDEF] = { "unexpected an `undef` at a non-statement position", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_STRING_CONCATENATION] = { "expected a string for concatenation", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_STRING_INTERPOLATED_TERM] = { "unterminated string; expected a closing delimiter for the interpolated string", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_STRING_LITERAL_EOF] = { "unterminated string meets end of file", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_STRING_LITERAL_TERM] = { "unexpected %s, expected a string literal terminator", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_SYMBOL_INVALID] = { "invalid symbol", PM_ERROR_LEVEL_SYNTAX }, // TODO expected symbol? prism.c ~9719 + [PM_ERR_SYMBOL_TERM_DYNAMIC] = { "unterminated quoted string; expected a closing delimiter for the dynamic symbol", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_SYMBOL_TERM_INTERPOLATED] = { "unterminated symbol; expected a closing delimiter for the interpolated symbol", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_TERNARY_COLON] = { "expected a `:` after the true expression of a ternary operator", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_TERNARY_EXPRESSION_FALSE] = { "expected an expression after `:` in the ternary operator", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_TERNARY_EXPRESSION_TRUE] = { "expected an expression after `?` in the ternary operator", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNARY_RECEIVER] = { "unexpected %s, expected a receiver for unary `%c`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNARY_DISALLOWED] = { "unexpected %s; unary calls are not allowed in this context", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNDEF_ARGUMENT] = { "invalid argument being passed to `undef`; expected a bare word, constant, or symbol argument", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNEXPECTED_BLOCK_ARGUMENT] = { "block argument should not be given", 
PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNEXPECTED_INDEX_BLOCK] = { "unexpected block arg given in index assignment; blocks are not allowed in index assignment expressions", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNEXPECTED_INDEX_KEYWORDS] = { "unexpected keyword arg given in index assignment; keywords are not allowed in index assignment expressions", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNEXPECTED_LABEL] = { "unexpected label", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNEXPECTED_MULTI_WRITE] = { "unexpected multiple assignment; multiple assignment is not allowed in this context", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNEXPECTED_PARAMETER_DEFAULT_VALUE] = { "unexpected %s; expected a default value for a parameter", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNEXPECTED_RANGE_OPERATOR] = { "unexpected range operator; .. and ... are non-associative and cannot be chained", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNEXPECTED_SAFE_NAVIGATION] = { "&. inside multiple assignment destination", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT] = { "unexpected %s, assuming it is closing the parent %s", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNEXPECTED_TOKEN_IGNORE] = { "unexpected %s, ignoring it", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNTIL_TERM] = { "expected an `end` to close the `until` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_VOID_EXPRESSION] = { "unexpected void value expression", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_WHILE_TERM] = { "expected an `end` to close the `while` statement", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_WRITE_TARGET_IN_METHOD] = { "dynamic constant assignment", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_WRITE_TARGET_READONLY] = { "Can't set variable %.*s", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_WRITE_TARGET_UNEXPECTED] = { "unexpected write target", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_XSTRING_TERM] = { "expected a closing delimiter for the `%x` or backtick string", PM_ERROR_LEVEL_SYNTAX }, + + // Warnings + [PM_WARN_AMBIGUOUS_BINARY_OPERATOR] = { "'%s' after local variable or literal is 
interpreted as binary operator even though it seems like %s", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_MINUS] = { "ambiguous first argument; put parentheses or a space even after `-` operator", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS] = { "ambiguous first argument; put parentheses or a space even after `+` operator", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_AMBIGUOUS_PREFIX_AMPERSAND] = { "ambiguous `&` has been interpreted as an argument prefix", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_AMBIGUOUS_PREFIX_STAR] = { "ambiguous `*` has been interpreted as an argument prefix", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_AMBIGUOUS_PREFIX_STAR_STAR] = { "ambiguous `**` has been interpreted as an argument prefix", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_AMBIGUOUS_SLASH] = { "ambiguous `/`; wrap regexp in parentheses or add a space after `/` operator", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_COMPARISON_AFTER_COMPARISON] = { "comparison '%.*s' after comparison", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_DOT_DOT_DOT_EOL] = { "... 
at EOL, should be parenthesized?", PM_WARNING_LEVEL_DEFAULT }, + [PM_WARN_DUPLICATED_HASH_KEY] = { "key %.*s is duplicated and overwritten on line %" PRIi32, PM_WARNING_LEVEL_DEFAULT }, + [PM_WARN_DUPLICATED_WHEN_CLAUSE] = { "'when' clause on line %" PRIi32 " duplicates 'when' clause on line %" PRIi32 " and is ignored", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_EQUAL_IN_CONDITIONAL_3_3] = { "found `= literal' in conditional, should be ==", PM_WARNING_LEVEL_DEFAULT }, + [PM_WARN_EQUAL_IN_CONDITIONAL] = { "found '= literal' in conditional, should be ==", PM_WARNING_LEVEL_DEFAULT }, + [PM_WARN_END_IN_METHOD] = { "END in method; use at_exit", PM_WARNING_LEVEL_DEFAULT }, + [PM_WARN_FLOAT_OUT_OF_RANGE] = { "Float %.*s%s out of range", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_IGNORED_FROZEN_STRING_LITERAL] = { "'frozen_string_literal' is ignored after any tokens", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_INDENTATION_MISMATCH] = { "mismatched indentations at '%.*s' with '%.*s' at %" PRIi32, PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_INTEGER_IN_FLIP_FLOP] = { "integer literal in flip-flop", PM_WARNING_LEVEL_DEFAULT }, + [PM_WARN_INVALID_CHARACTER] = { "invalid character syntax; use %s%s%s", PM_WARNING_LEVEL_DEFAULT }, + [PM_WARN_INVALID_MAGIC_COMMENT_VALUE] = { "invalid value for %.*s: %.*s", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_INVALID_NUMBERED_REFERENCE] = { "'%.*s' is too big for a number variable, always nil", PM_WARNING_LEVEL_DEFAULT }, + [PM_WARN_KEYWORD_EOL] = { "`%.*s` at the end of line without an expression", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_LITERAL_IN_CONDITION_DEFAULT] = { "%sliteral in %s", PM_WARNING_LEVEL_DEFAULT }, + [PM_WARN_LITERAL_IN_CONDITION_VERBOSE] = { "%sliteral in %s", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_SHAREABLE_CONSTANT_VALUE_LINE] = { "'shareable_constant_value' is ignored unless in comment-only line", PM_WARNING_LEVEL_VERBOSE }, + [PM_WARN_SHEBANG_CARRIAGE_RETURN] = { "shebang line ending with \\r may cause problems", PM_WARNING_LEVEL_DEFAULT 
},
+    [PM_WARN_UNEXPECTED_CARRIAGE_RETURN] = { "encountered \\r in middle of line, treated as a mere space", PM_WARNING_LEVEL_DEFAULT },
+    [PM_WARN_UNREACHABLE_STATEMENT] = { "statement not reached", PM_WARNING_LEVEL_VERBOSE },
+    [PM_WARN_UNUSED_LOCAL_VARIABLE] = { "assigned but unused variable - %.*s", PM_WARNING_LEVEL_VERBOSE },
+    [PM_WARN_VOID_STATEMENT] = { "possibly useless use of %.*s in void context", PM_WARNING_LEVEL_VERBOSE }
+};
+
+/**
+ * Get the human-readable name of the given diagnostic ID. The name is the
+ * lower-cased identifier of the error or warning, generated from the template's
+ * errors and warnings lists. An ID that matches no generated case trips the
+ * assertion below and, when assertions are compiled out, yields "".
+ */
+const char *
+pm_diagnostic_id_human(pm_diagnostic_id_t diag_id) {
+    switch (diag_id) {
+        <%- errors.each do |error| -%>
+        case PM_ERR_<%= error.name %>: return "<%= error.name.downcase %>";
+        <%- end -%>
+        <%- warnings.each do |warning| -%>
+        case PM_WARN_<%= warning.name %>: return "<%= warning.name.downcase %>";
+        <%- end -%>
+    }
+
+    assert(false && "unreachable");
+    return "";
+}
+
+/**
+ * Look up the message (possibly a printf-style format string) registered for
+ * the given diagnostic ID in the diagnostic_messages table. The ID must be
+ * below PM_DIAGNOSTIC_ID_MAX and must have a non-NULL table entry; both
+ * conditions are only checked through assert.
+ */
+static inline const char *
+pm_diagnostic_message(pm_diagnostic_id_t diag_id) {
+    assert(diag_id < PM_DIAGNOSTIC_ID_MAX);
+
+    const char *message = diagnostic_messages[diag_id].message;
+    assert(message);
+
+    return message;
+}
+
+/**
+ * Look up the level registered for the given diagnostic ID in the
+ * diagnostic_messages table, narrowed to a uint8_t. The ID must be below
+ * PM_DIAGNOSTIC_ID_MAX, which is only checked through assert.
+ */
+static inline uint8_t
+pm_diagnostic_level(pm_diagnostic_id_t diag_id) {
+    assert(diag_id < PM_DIAGNOSTIC_ID_MAX);
+
+    return (uint8_t) diagnostic_messages[diag_id].level;
+}
+
+/**
+ * Append a diagnostic to the given list of diagnostics.
+ *
+ * The diagnostic borrows the static message registered for diag_id, so it is
+ * marked as not owning its message. Returns false if the diagnostic could not
+ * be allocated, and true once it has been appended to the list.
+ */
+bool
+pm_diagnostic_list_append(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) {
+    pm_diagnostic_t *diagnostic = (pm_diagnostic_t *) xcalloc(1, sizeof(pm_diagnostic_t));
+    if (diagnostic == NULL) return false;
+
+    *diagnostic = (pm_diagnostic_t) {
+        .location = { start, end },
+        .diag_id = diag_id,
+        .message = pm_diagnostic_message(diag_id),
+        .owned = false,
+        .level = pm_diagnostic_level(diag_id)
+    };
+
+    pm_list_append(list, (pm_list_node_t *) diagnostic);
+    return true;
+}
+
+/**
+ * Append a diagnostic to the given list of diagnostics that is using a format
+ * string for its message.
+ *
+ * The message registered for diag_id is used as a printf-style format and the
+ * variadic arguments supply its values. The formatted text is stored in a
+ * freshly allocated buffer that the diagnostic owns. Returns false, leaving the
+ * list untouched, if formatting or either allocation fails; returns true once
+ * the diagnostic has been appended.
+ */
+bool
+pm_diagnostic_list_append_format(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id, ...) {
+    va_list arguments;
+    va_start(arguments, diag_id);
+
+    // First pass: measure the formatted message without writing anything.
+    const char *format = pm_diagnostic_message(diag_id);
+    int result = vsnprintf(NULL, 0, format, arguments);
+    va_end(arguments);
+
+    if (result < 0) {
+        return false;
+    }
+
+    pm_diagnostic_t *diagnostic = (pm_diagnostic_t *) xcalloc(1, sizeof(pm_diagnostic_t));
+    if (diagnostic == NULL) {
+        return false;
+    }
+
+    // Widen before adding room for the terminator so that a result of INT_MAX
+    // cannot overflow the signed addition.
+    size_t length = ((size_t) result) + 1;
+    char *message = (char *) xmalloc(length);
+    if (message == NULL) {
+        xfree(diagnostic);
+        return false;
+    }
+
+    // Second pass: the argument list must be restarted because its value is
+    // indeterminate after the first vsnprintf call.
+    va_start(arguments, diag_id);
+    result = vsnprintf(message, length, format, arguments);
+    va_end(arguments);
+
+    // Do not publish a message whose contents are indeterminate.
+    if (result < 0) {
+        xfree(message);
+        xfree(diagnostic);
+        return false;
+    }
+
+    *diagnostic = (pm_diagnostic_t) {
+        .location = { start, end },
+        .diag_id = diag_id,
+        .message = message,
+        .owned = true,
+        .level = pm_diagnostic_level(diag_id)
+    };
+
+    pm_list_append(list, (pm_list_node_t *) diagnostic);
+    return true;
+}
+
+/**
+ * Deallocate the internal state of the given diagnostic list.
+ */ +void +pm_diagnostic_list_free(pm_list_t *list) { + pm_diagnostic_t *node = (pm_diagnostic_t *) list->head; + + while (node != NULL) { + pm_diagnostic_t *next = (pm_diagnostic_t *) node->node.next; + + if (node->owned) xfree((void *) node->message); + xfree(node); + + node = next; + } +} diff --git a/prism/templates/src/node.c.erb b/prism/templates/src/node.c.erb new file mode 100644 index 0000000000..2357e55200 --- /dev/null +++ b/prism/templates/src/node.c.erb @@ -0,0 +1,333 @@ +#line <%= __LINE__ + 1 %> "prism/templates/src/<%= File.basename(__FILE__) %>" +#include "prism/node.h" + +/** + * Attempts to grow the node list to the next size. If there is already + * capacity in the list, this function does nothing. Otherwise it reallocates + * the list to be twice as large as it was before. If the reallocation fails, + * this function returns false, otherwise it returns true. + */ +static bool +pm_node_list_grow(pm_node_list_t *list, size_t size) { + size_t requested_size = list->size + size; + + // If the requested size caused overflow, return false. + if (requested_size < list->size) return false; + + // If the requested size is within the existing capacity, return true. + if (requested_size < list->capacity) return true; + + // Otherwise, reallocate the list to be twice as large as it was before. + size_t next_capacity = list->capacity == 0 ? 4 : list->capacity * 2; + + // If multiplying by 2 caused overflow, return false. + if (next_capacity < list->capacity) return false; + + // If we didn't get enough by doubling, keep doubling until we do. + while (requested_size > next_capacity) { + size_t double_capacity = next_capacity * 2; + + // Ensure we didn't overflow by multiplying by 2. 
+ if (double_capacity < next_capacity) return false; + next_capacity = double_capacity; + } + + pm_node_t **nodes = (pm_node_t **) xrealloc(list->nodes, sizeof(pm_node_t *) * next_capacity); + if (nodes == NULL) return false; + + list->nodes = nodes; + list->capacity = next_capacity; + return true; +} + +/** + * Append a new node onto the end of the node list. + */ +void +pm_node_list_append(pm_node_list_t *list, pm_node_t *node) { + if (pm_node_list_grow(list, 1)) { + list->nodes[list->size++] = node; + } +} + +/** + * Prepend a new node onto the beginning of the node list. + */ +void +pm_node_list_prepend(pm_node_list_t *list, pm_node_t *node) { + if (pm_node_list_grow(list, 1)) { + memmove(list->nodes + 1, list->nodes, list->size * sizeof(pm_node_t *)); + list->nodes[0] = node; + list->size++; + } +} + +/** + * Concatenate the given node list onto the end of the other node list. + */ +void +pm_node_list_concat(pm_node_list_t *list, pm_node_list_t *other) { + if (other->size > 0 && pm_node_list_grow(list, other->size)) { + memcpy(list->nodes + list->size, other->nodes, other->size * sizeof(pm_node_t *)); + list->size += other->size; + } +} + +/** + * Free the internal memory associated with the given node list. + */ +void +pm_node_list_free(pm_node_list_t *list) { + if (list->capacity > 0) { + xfree(list->nodes); + *list = (pm_node_list_t) { 0 }; + } +} + +PRISM_EXPORTED_FUNCTION void +pm_node_destroy(pm_parser_t *parser, pm_node_t *node); + +/** + * Destroy the nodes that are contained within the given node list. + */ +static void +pm_node_list_destroy(pm_parser_t *parser, pm_node_list_t *list) { + pm_node_t *node; + PM_NODE_LIST_FOREACH(list, index, node) pm_node_destroy(parser, node); + pm_node_list_free(list); +} + +/** + * Deallocate the space for a pm_node_t. Similarly to pm_node_alloc, we're not + * using the parser argument, but it's there to allow for the future possibility + * of pre-allocating larger memory pools. 
+ *
+ * Before the node itself is released, everything it holds is released too:
+ * child nodes and node lists are destroyed recursively, and string, constant
+ * id list, and integer fields are freed. Location, uint8, uint32, constant,
+ * and double fields need no cleanup. The switch below is generated from the
+ * node definitions, one case per node type.
+ */
+PRISM_EXPORTED_FUNCTION void
+pm_node_destroy(pm_parser_t *parser, pm_node_t *node) {
+    switch (PM_NODE_TYPE(node)) {
+        <%- nodes.each do |node| -%>
+#line <%= __LINE__ + 1 %> "prism/templates/src/<%= File.basename(__FILE__) %>"
+        case <%= node.type %>: {
+            <%- if node.fields.any? { |field| ![Prism::Template::LocationField, Prism::Template::OptionalLocationField, Prism::Template::UInt8Field, Prism::Template::UInt32Field, Prism::Template::ConstantField, Prism::Template::OptionalConstantField, Prism::Template::DoubleField].include?(field.class) } -%>
+            pm_<%= node.human %>_t *cast = (pm_<%= node.human %>_t *) node;
+            <%- end -%>
+            <%- node.fields.each do |field| -%>
+            <%- case field -%>
+            <%- when Prism::Template::LocationField, Prism::Template::OptionalLocationField, Prism::Template::UInt8Field, Prism::Template::UInt32Field, Prism::Template::ConstantField, Prism::Template::OptionalConstantField, Prism::Template::DoubleField -%>
+            <%- when Prism::Template::NodeField -%>
+            pm_node_destroy(parser, (pm_node_t *)cast-><%= field.name %>);
+            <%- when Prism::Template::OptionalNodeField -%>
+            if (cast-><%= field.name %> != NULL) {
+                pm_node_destroy(parser, (pm_node_t *)cast-><%= field.name %>);
+            }
+            <%- when Prism::Template::StringField -%>
+            pm_string_free(&cast-><%= field.name %>);
+            <%- when Prism::Template::NodeListField -%>
+            pm_node_list_destroy(parser, &cast-><%= field.name %>);
+            <%- when Prism::Template::ConstantListField -%>
+            pm_constant_id_list_free(&cast-><%= field.name %>);
+            <%- when Prism::Template::IntegerField -%>
+            pm_integer_free(&cast-><%= field.name %>);
+            <%- else -%>
+            <%- raise -%>
+            <%- end -%>
+            <%- end -%>
+            break;
+        }
+        <%- end -%>
+#line <%= __LINE__ + 1 %> "prism/templates/src/<%= File.basename(__FILE__) %>"
+        default:
+            assert(false && "unreachable");
+            break;
+    }
+    // The children have been released above; now release the node itself.
+    xfree(node);
+}
+
+/**
+ * Returns a string representation of the given node type.
+ * The string is the name of the node type's enum constant. An empty string is
+ * returned if the type matches none of the generated cases.
+ */
+PRISM_EXPORTED_FUNCTION const char *
+pm_node_type_to_str(pm_node_type_t node_type)
+{
+    switch (node_type) {
+<%- nodes.each do |node| -%>
+        case <%= node.type %>:
+            return "<%= node.type %>";
+<%- end -%>
+    }
+    return "";
+}
+
+/**
+ * Visit each of the nodes in this subtree using the given visitor callback. The
+ * callback function will be called for each node in the subtree. If it returns
+ * false, then that node's children will not be visited. If it returns true,
+ * then the children will be visited. The data parameter is treated as an opaque
+ * pointer and is passed to the visitor callback for consumers to use as they
+ * see fit.
+ */
+PRISM_EXPORTED_FUNCTION void
+pm_visit_node(const pm_node_t *node, bool (*visitor)(const pm_node_t *node, void *data), void *data) {
+    if (visitor(node, data)) pm_visit_child_nodes(node, visitor, data);
+}
+
+/**
+ * Visit the children of the given node with the given callback. This is the
+ * default behavior for walking the tree that is called from pm_visit_node if
+ * the callback returns true. Only node, optional node, and node list fields are
+ * visited, in the order the fields are declared; optional fields that are NULL
+ * are skipped. Node types without such fields, and scope nodes, visit nothing.
+ */
+PRISM_EXPORTED_FUNCTION void
+pm_visit_child_nodes(const pm_node_t *node, bool (*visitor)(const pm_node_t *node, void *data), void *data) {
+    switch (PM_NODE_TYPE(node)) {
+        <%- nodes.each do |node| -%>
+        <%- if (fields = node.fields.select { |field| field.is_a?(Prism::Template::NodeField) || field.is_a?(Prism::Template::OptionalNodeField) || field.is_a?(Prism::Template::NodeListField) }).any? -%>
+        case <%= node.type %>: {
+            const pm_<%= node.human %>_t *cast = (const pm_<%= node.human %>_t *) node;
+            <%- fields.each do |field| -%>
+
+            // Visit the <%= field.name %> field
+            <%- case field -%>
+            <%- when Prism::Template::NodeField -%>
+            pm_visit_node((const pm_node_t *) cast-><%= field.name %>, visitor, data);
+            <%- when Prism::Template::OptionalNodeField -%>
+            if (cast-><%= field.name %> != NULL) {
+                pm_visit_node((const pm_node_t *) cast-><%= field.name %>, visitor, data);
+            }
+            <%- when Prism::Template::NodeListField -%>
+            const pm_node_list_t *<%= field.name %> = &cast-><%= field.name %>;
+            for (size_t index = 0; index < <%= field.name %>->size; index++) {
+                pm_visit_node(<%= field.name %>->nodes[index], visitor, data);
+            }
+            <%- end -%>
+            <%- end -%>
+
+            break;
+        }
+        <%- else -%>
+        case <%= node.type %>:
+            break;
+        <%- end -%>
+        <%- end -%>
+        case PM_SCOPE_NODE:
+            break;
+    }
+}
+
+// We optionally support dumping to JSON. For systems that don't want or need
+// this functionality, it can be turned off with the PRISM_EXCLUDE_JSON define.
+#ifndef PRISM_EXCLUDE_JSON
+
+/**
+ * Dump the constant with the given ID as a double-quoted JSON string. The
+ * constant's bytes are looked up in the parser's constant pool and appended
+ * with JSON escaping.
+ */
+static void
+pm_dump_json_constant(pm_buffer_t *buffer, const pm_parser_t *parser, pm_constant_id_t constant_id) {
+    const pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, constant_id);
+    pm_buffer_append_byte(buffer, '"');
+    pm_buffer_append_source(buffer, constant->start, constant->length, PM_BUFFER_ESCAPING_JSON);
+    pm_buffer_append_byte(buffer, '"');
+}
+
+/**
+ * Dump the given location as a JSON object of the form {"start":N,"end":M},
+ * where both values are byte offsets measured from parser->start and narrowed
+ * to uint32_t.
+ */
+static void
+pm_dump_json_location(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_location_t *location) {
+    uint32_t start = (uint32_t) (location->start - parser->start);
+    uint32_t end = (uint32_t) (location->end - parser->start);
+    pm_buffer_append_format(buffer, "{\"start\":%" PRIu32 ",\"end\":%" PRIu32 "}", start, end);
+}
+
+/**
+ * Dump JSON to the given buffer.
+ *
+ * Each node is written as a JSON object holding its "type", its "location",
+ * and then one member per flag set and per field, in declaration order. Child
+ * nodes are dumped recursively, node and constant lists become arrays, absent
+ * optional values become null, and flags become an array of the names of the
+ * flags that are set. Scope nodes produce no output. The switch below is
+ * generated from the node definitions, one case per node type.
+ *
+ * NOTE(review): double fields are written with "%f", which would not produce
+ * valid JSON for a non-finite value; confirm such values cannot reach here.
+ */
+PRISM_EXPORTED_FUNCTION void
+pm_dump_json(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_node_t *node) {
+    switch (PM_NODE_TYPE(node)) {
+        <%- nodes.each do |node| -%>
+        case <%= node.type %>: {
+            pm_buffer_append_string(buffer, "{\"type\":\"<%= node.name %>\",\"location\":", <%= node.name.bytesize + 22 %>);
+
+            const pm_<%= node.human %>_t *cast = (const pm_<%= node.human %>_t *) node;
+            pm_dump_json_location(buffer, parser, &cast->base.location);
+            <%- [*node.flags, *node.fields].each_with_index do |field, index| -%>
+
+            // Dump the <%= field.name %> field
+            pm_buffer_append_byte(buffer, ',');
+            pm_buffer_append_string(buffer, "\"<%= field.name %>\":", <%= field.name.bytesize + 3 %>);
+            <%- case field -%>
+            <%- when Prism::Template::NodeField -%>
+            pm_dump_json(buffer, parser, (const pm_node_t *) cast-><%= field.name %>);
+            <%- when Prism::Template::OptionalNodeField -%>
+            if (cast-><%= field.name %> != NULL) {
+                pm_dump_json(buffer, parser, (const pm_node_t *) cast-><%= field.name %>);
+            } else {
+                pm_buffer_append_string(buffer, "null", 4);
+            }
+            <%- when Prism::Template::NodeListField -%>
+            const pm_node_list_t *<%= field.name %> = &cast-><%= field.name %>;
+            pm_buffer_append_byte(buffer, '[');
+
+            for (size_t index = 0; index < <%= field.name %>->size; index++) {
+                if (index != 0) pm_buffer_append_byte(buffer, ',');
+                pm_dump_json(buffer, parser, <%= field.name %>->nodes[index]);
+            }
+            pm_buffer_append_byte(buffer, ']');
+            <%- when Prism::Template::StringField -%>
+            const pm_string_t *<%= field.name %> = &cast-><%= field.name %>;
+            pm_buffer_append_byte(buffer, '"');
+            pm_buffer_append_source(buffer, pm_string_source(<%= field.name %>), pm_string_length(<%= field.name %>), PM_BUFFER_ESCAPING_JSON);
+            pm_buffer_append_byte(buffer, '"');
+            <%- when Prism::Template::ConstantField -%>
+            pm_dump_json_constant(buffer, parser, cast-><%= field.name %>);
+            <%- when Prism::Template::OptionalConstantField -%>
+            if (cast-><%= field.name %> != PM_CONSTANT_ID_UNSET) {
+                pm_dump_json_constant(buffer, parser, cast-><%= field.name %>);
+            } else {
+                pm_buffer_append_string(buffer, "null", 4);
+            }
+            <%- when Prism::Template::ConstantListField -%>
+            const pm_constant_id_list_t *<%= field.name %> = &cast-><%= field.name %>;
+            pm_buffer_append_byte(buffer, '[');
+
+            for (size_t index = 0; index < <%= field.name %>->size; index++) {
+                if (index != 0) pm_buffer_append_byte(buffer, ',');
+                pm_dump_json_constant(buffer, parser, <%= field.name %>->ids[index]);
+            }
+            pm_buffer_append_byte(buffer, ']');
+            <%- when Prism::Template::LocationField -%>
+            pm_dump_json_location(buffer, parser, &cast-><%= field.name %>);
+            <%- when Prism::Template::OptionalLocationField -%>
+            if (cast-><%= field.name %>.start != NULL) {
+                pm_dump_json_location(buffer, parser, &cast-><%= field.name %>);
+            } else {
+                pm_buffer_append_string(buffer, "null", 4);
+            }
+            <%- when Prism::Template::UInt8Field -%>
+            pm_buffer_append_format(buffer, "%" PRIu8, cast-><%= field.name %>);
+            <%- when Prism::Template::UInt32Field -%>
+            pm_buffer_append_format(buffer, "%" PRIu32, cast-><%= field.name %>);
+            <%- when Prism::Template::Flags -%>
+            size_t flags = 0;
+            pm_buffer_append_byte(buffer, '[');
+            <%- node.flags.values.each_with_index do |value, index| -%>
+            if (PM_NODE_FLAG_P(cast, PM_<%= node.flags.human.upcase %>_<%= value.name %>)) {
+                if (flags != 0) pm_buffer_append_byte(buffer, ',');
+                pm_buffer_append_string(buffer, "\"<%= value.name %>\"", <%= value.name.bytesize + 2 %>);
+                flags++;
+            }
+            <%- end -%>
+            pm_buffer_append_byte(buffer, ']');
+            <%- when Prism::Template::IntegerField -%>
+            pm_integer_string(buffer, &cast-><%= field.name %>);
+            <%- when Prism::Template::DoubleField -%>
+            pm_buffer_append_format(buffer, "%f", cast-><%= field.name %>);
+            <%- else -%>
+            <%- raise %>
+            <%- end -%>
+            <%- end -%>
+
+            pm_buffer_append_byte(buffer, '}');
+            break;
+        }
+        <%- end -%>
+        case PM_SCOPE_NODE:
+            break;
+    }
+}
+
+#endif
diff --git
a/prism/templates/src/prettyprint.c.erb b/prism/templates/src/prettyprint.c.erb
new file mode 100644
index 0000000000..639c2fecf3
--- /dev/null
+++ b/prism/templates/src/prettyprint.c.erb
@@ -0,0 +1,166 @@
+<%# encoding: ASCII -%>
+#include "prism/prettyprint.h"
+
+// We optionally support pretty printing nodes. For systems that don't want or
+// need this functionality, it can be turned off with the
+// PRISM_EXCLUDE_PRETTYPRINT define.
+#ifdef PRISM_EXCLUDE_PRETTYPRINT
+
+void pm_prettyprint(void) {}
+
+#else
+
+/**
+ * Append the given location to the output buffer in the form
+ * (line,column)-(line,column), where both positions are resolved through the
+ * parser's newline list and start line.
+ */
+static inline void
+prettyprint_location(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_location_t *location) {
+    pm_line_column_t start = pm_newline_list_line_column(&parser->newline_list, location->start, parser->start_line);
+    pm_line_column_t end = pm_newline_list_line_column(&parser->newline_list, location->end, parser->start_line);
+    pm_buffer_append_format(output_buffer, "(%" PRIi32 ",%" PRIu32 ")-(%" PRIi32 ",%" PRIu32 ")", start.line, start.column, end.line, end.column);
+}
+
+/**
+ * Append the constant with the given ID to the output buffer as a colon
+ * followed by the constant's bytes, looked up in the parser's constant pool.
+ */
+static inline void
+prettyprint_constant(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_constant_id_t constant_id) {
+    pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, constant_id);
+    pm_buffer_append_format(output_buffer, ":%.*s", (int) constant->length, constant->start);
+}
+
+/**
+ * Append a tree rendering of the given node to the output buffer. A header
+ * line with the node name and location comes first, followed by one "+-- name:"
+ * entry per flag set and per field. Child nodes are printed recursively; while
+ * a child is printed the prefix buffer is temporarily extended, and its length
+ * is restored afterwards. Scope nodes print nothing. The switch below is
+ * generated from the node definitions, one case per node type.
+ *
+ * NOTE(review): the prefix literals appended with a length of 4 appear shorter
+ * than 4 bytes in this copy of the template; whitespace may have been collapsed
+ * when the text was extracted. Confirm each literal is exactly 4 bytes wide.
+ */
+static void
+prettyprint_node(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_node_t *node, pm_buffer_t *prefix_buffer) {
+    switch (PM_NODE_TYPE(node)) {
+        case PM_SCOPE_NODE:
+            // We do not need to print a ScopeNode as it's not part of the AST.
+            return;
+        <%- nodes.each do |node| -%>
+        case <%= node.type %>: {
+            <%- if !node.flags.nil? || node.fields.any? -%>
+            pm_<%= node.human %>_t *cast = (pm_<%= node.human %>_t *) node;
+            <%- end -%>
+            pm_buffer_append_string(output_buffer, "@ <%= node.name %> (location: ", <%= node.name.length + 14 %>);
+            prettyprint_location(output_buffer, parser, &node->location);
+            pm_buffer_append_string(output_buffer, ")\n", 2);
+            <%- (fields = [*node.flags, *node.fields]).each_with_index do |field, index| -%>
+            <%- preadd = index == fields.length - 1 ? " " : "| " -%>
+
+            // <%= field.name %>
+            {
+                pm_buffer_concat(output_buffer, prefix_buffer);
+                pm_buffer_append_string(output_buffer, "+-- <%= field.name %>:", <%= 4 + field.name.length + 1 %>);
+                <%- case field -%>
+                <%- when Prism::Template::NodeField -%>
+                pm_buffer_append_byte(output_buffer, '\n');
+
+                size_t prefix_length = prefix_buffer->length;
+                pm_buffer_append_string(prefix_buffer, "<%= preadd %>", 4);
+                pm_buffer_concat(output_buffer, prefix_buffer);
+                prettyprint_node(output_buffer, parser, (pm_node_t *) cast-><%= field.name %>, prefix_buffer);
+                prefix_buffer->length = prefix_length;
+                <%- when Prism::Template::OptionalNodeField -%>
+                if (cast-><%= field.name %> == NULL) {
+                    pm_buffer_append_string(output_buffer, " nil\n", 5);
+                } else {
+                    pm_buffer_append_byte(output_buffer, '\n');
+
+                    size_t prefix_length = prefix_buffer->length;
+                    pm_buffer_append_string(prefix_buffer, "<%= preadd %>", 4);
+                    pm_buffer_concat(output_buffer, prefix_buffer);
+                    prettyprint_node(output_buffer, parser, (pm_node_t *) cast-><%= field.name %>, prefix_buffer);
+                    prefix_buffer->length = prefix_length;
+                }
+                <%- when Prism::Template::StringField -%>
+                pm_buffer_append_string(output_buffer, " \"", 2);
+                pm_buffer_append_source(output_buffer, pm_string_source(&cast-><%= field.name %>), pm_string_length(&cast-><%= field.name %>), PM_BUFFER_ESCAPING_RUBY);
+                pm_buffer_append_string(output_buffer, "\"\n", 2);
+                <%- when Prism::Template::NodeListField -%>
+                pm_buffer_append_format(output_buffer, " (length: %lu)\n", (unsigned long) (cast-><%= field.name %>.size));
+
+                size_t last_index = cast-><%= field.name %>.size;
+                for (uint32_t index = 0; index < last_index; index++) {
+                    size_t prefix_length = prefix_buffer->length;
+                    pm_buffer_append_string(prefix_buffer, "<%= preadd %>", 4);
+                    pm_buffer_concat(output_buffer, prefix_buffer);
+                    pm_buffer_append_string(output_buffer, "+-- ", 4);
+                    pm_buffer_append_string(prefix_buffer, (index == last_index - 1) ? " " : "| ", 4);
+                    prettyprint_node(output_buffer, parser, (pm_node_t *) cast-><%= field.name %>.nodes[index], prefix_buffer);
+                    prefix_buffer->length = prefix_length;
+                }
+                <%- when Prism::Template::ConstantField -%>
+                pm_buffer_append_byte(output_buffer, ' ');
+                prettyprint_constant(output_buffer, parser, cast-><%= field.name %>);
+                pm_buffer_append_byte(output_buffer, '\n');
+                <%- when Prism::Template::OptionalConstantField -%>
+                if (cast-><%= field.name %> == 0) {
+                    pm_buffer_append_string(output_buffer, " nil\n", 5);
+                } else {
+                    pm_buffer_append_byte(output_buffer, ' ');
+                    prettyprint_constant(output_buffer, parser, cast-><%= field.name %>);
+                    pm_buffer_append_byte(output_buffer, '\n');
+                }
+                <%- when Prism::Template::ConstantListField -%>
+                pm_buffer_append_string(output_buffer, " [", 2);
+                for (uint32_t index = 0; index < cast-><%= field.name %>.size; index++) {
+                    if (index != 0) pm_buffer_append_string(output_buffer, ", ", 2);
+                    prettyprint_constant(output_buffer, parser, cast-><%= field.name %>.ids[index]);
+                }
+                pm_buffer_append_string(output_buffer, "]\n", 2);
+                <%- when Prism::Template::LocationField -%>
+                pm_location_t *location = &cast-><%= field.name %>;
+                pm_buffer_append_byte(output_buffer, ' ');
+                prettyprint_location(output_buffer, parser, location);
+                pm_buffer_append_string(output_buffer, " = \"", 4);
+                pm_buffer_append_source(output_buffer, location->start, (size_t) (location->end - location->start), PM_BUFFER_ESCAPING_RUBY);
+                pm_buffer_append_string(output_buffer, "\"\n", 2);
+                <%- when Prism::Template::OptionalLocationField -%>
+                pm_location_t *location = &cast-><%= field.name %>;
+                if (location->start == NULL) {
+                    pm_buffer_append_string(output_buffer, " nil\n", 5);
+                } else {
+                    pm_buffer_append_byte(output_buffer, ' ');
+                    prettyprint_location(output_buffer, parser, location);
+                    pm_buffer_append_string(output_buffer, " = \"", 4);
+                    pm_buffer_append_source(output_buffer, location->start, (size_t) (location->end - location->start), PM_BUFFER_ESCAPING_RUBY);
+                    pm_buffer_append_string(output_buffer, "\"\n", 2);
+                }
+                <%- when Prism::Template::UInt8Field -%>
+                pm_buffer_append_format(output_buffer, " %" PRIu8 "\n", cast-><%= field.name %>);
+                <%- when Prism::Template::UInt32Field -%>
+                pm_buffer_append_format(output_buffer, " %" PRIu32 "\n", cast-><%= field.name %>);
+                <%- when Prism::Template::Flags -%>
+                bool found = false;
+                <%- field.values.each do |value| -%>
+                if (cast->base.flags & PM_<%= field.human.upcase %>_<%= value.name %>) {
+                    if (found) pm_buffer_append_byte(output_buffer, ',');
+                    pm_buffer_append_string(output_buffer, " <%= value.name.downcase %>", <%= value.name.bytesize + 1 %>);
+                    found = true;
+                }
+                <%- end -%>
+                if (!found) pm_buffer_append_string(output_buffer, " nil", 4);
+                pm_buffer_append_byte(output_buffer, '\n');
+                <%- when Prism::Template::IntegerField -%>
+                const pm_integer_t *integer = &cast-><%= field.name %>;
+                pm_buffer_append_byte(output_buffer, ' ');
+                pm_integer_string(output_buffer, integer);
+                pm_buffer_append_byte(output_buffer, '\n');
+                <%- when Prism::Template::DoubleField -%>
+                pm_buffer_append_format(output_buffer, " %f\n", cast-><%= field.name %>);
+                <%- else -%>
+                <%- raise -%>
+                <%- end -%>
+            }
+            <%- end -%>
+
+            break;
+        }
+        <%- end -%>
+    }
+}
+
+/**
+ * Pretty-prints the AST represented by the given node to the given buffer.
+ */ +PRISM_EXPORTED_FUNCTION void +pm_prettyprint(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_node_t *node) { + pm_buffer_t prefix_buffer = { 0 }; + prettyprint_node(output_buffer, parser, node, &prefix_buffer); + pm_buffer_free(&prefix_buffer); +} + +#endif diff --git a/prism/templates/src/serialize.c.erb b/prism/templates/src/serialize.c.erb new file mode 100644 index 0000000000..0f0aace445 --- /dev/null +++ b/prism/templates/src/serialize.c.erb @@ -0,0 +1,406 @@ +#include "prism.h" + +// We optionally support serializing to a binary string. For systems that don't +// want or need this functionality, it can be turned off with the +// PRISM_EXCLUDE_SERIALIZATION define. +#ifndef PRISM_EXCLUDE_SERIALIZATION + +#include <stdio.h> + +static inline uint32_t +pm_ptrdifft_to_u32(ptrdiff_t value) { + assert(value >= 0 && ((unsigned long) value) < UINT32_MAX); + return (uint32_t) value; +} + +static inline uint32_t +pm_sizet_to_u32(size_t value) { + assert(value < UINT32_MAX); + return (uint32_t) value; +} + +static void +pm_serialize_location(const pm_parser_t *parser, const pm_location_t *location, pm_buffer_t *buffer) { + assert(location->start); + assert(location->end); + assert(location->start <= location->end); + + pm_buffer_append_varuint(buffer, pm_ptrdifft_to_u32(location->start - parser->start)); + pm_buffer_append_varuint(buffer, pm_ptrdifft_to_u32(location->end - location->start)); +} + +static void +pm_serialize_string(const pm_parser_t *parser, const pm_string_t *string, pm_buffer_t *buffer) { + switch (string->type) { + case PM_STRING_SHARED: { + pm_buffer_append_byte(buffer, 1); + pm_buffer_append_varuint(buffer, pm_ptrdifft_to_u32(pm_string_source(string) - parser->start)); + pm_buffer_append_varuint(buffer, pm_sizet_to_u32(pm_string_length(string))); + break; + } + case PM_STRING_OWNED: + case PM_STRING_CONSTANT: { + uint32_t length = pm_sizet_to_u32(pm_string_length(string)); + pm_buffer_append_byte(buffer, 2); + 
pm_buffer_append_varuint(buffer, length); + pm_buffer_append_bytes(buffer, pm_string_source(string), length); + break; + } +#ifdef PRISM_HAS_MMAP + case PM_STRING_MAPPED: + assert(false && "Cannot serialize mapped strings."); + break; +#endif + } +} + +static void +pm_serialize_integer(const pm_integer_t *integer, pm_buffer_t *buffer) { + pm_buffer_append_byte(buffer, integer->negative ? 1 : 0); + if (integer->values == NULL) { + pm_buffer_append_varuint(buffer, pm_sizet_to_u32(1)); + pm_buffer_append_varuint(buffer, integer->value); + } else { + pm_buffer_append_varuint(buffer, pm_sizet_to_u32(integer->length)); + for (size_t i = 0; i < integer->length; i++) { + pm_buffer_append_varuint(buffer, integer->values[i]); + } + } +} + +static void +pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { + pm_buffer_append_byte(buffer, (uint8_t) PM_NODE_TYPE(node)); + + size_t offset = buffer->length; + + <%- if Prism::Template::INCLUDE_NODE_ID -%> + pm_buffer_append_varuint(buffer, node->node_id); + <%- end -%> + pm_serialize_location(parser, &node->location, buffer); + + switch (PM_NODE_TYPE(node)) { + // We do not need to serialize a ScopeNode ever as + // it is not part of the AST + case PM_SCOPE_NODE: + return; + <%- nodes.each do |node| -%> + case <%= node.type %>: { + <%- if node.needs_serialized_length? -%> + // serialize length + // encoding of location u32s make us need to save this offset. 
+ size_t length_offset = buffer->length; + pm_buffer_append_string(buffer, "\0\0\0\0", 4); /* consume 4 bytes, updated below */ + <%- end -%> + <%- unless Prism::Template::SERIALIZE_ONLY_SEMANTICS_FIELDS && !node.flags -%> + pm_buffer_append_varuint(buffer, (uint32_t) node->flags); + <%- end -%> + <%- node.fields.each do |field| -%> + <%- case field -%> + <%- when Prism::Template::NodeField -%> + pm_serialize_node(parser, (pm_node_t *)((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer); + <%- when Prism::Template::OptionalNodeField -%> + if (((pm_<%= node.human %>_t *)node)-><%= field.name %> == NULL) { + pm_buffer_append_byte(buffer, 0); + } else { + pm_serialize_node(parser, (pm_node_t *)((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer); + } + <%- when Prism::Template::StringField -%> + pm_serialize_string(parser, &((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer); + <%- when Prism::Template::NodeListField -%> + uint32_t <%= field.name %>_size = pm_sizet_to_u32(((pm_<%= node.human %>_t *)node)-><%= field.name %>.size); + pm_buffer_append_varuint(buffer, <%= field.name %>_size); + for (uint32_t index = 0; index < <%= field.name %>_size; index++) { + pm_serialize_node(parser, (pm_node_t *) ((pm_<%= node.human %>_t *)node)-><%= field.name %>.nodes[index], buffer); + } + <%- when Prism::Template::ConstantField, Prism::Template::OptionalConstantField -%> + pm_buffer_append_varuint(buffer, pm_sizet_to_u32(((pm_<%= node.human %>_t *)node)-><%= field.name %>)); + <%- when Prism::Template::ConstantListField -%> + uint32_t <%= field.name %>_size = pm_sizet_to_u32(((pm_<%= node.human %>_t *)node)-><%= field.name %>.size); + pm_buffer_append_varuint(buffer, <%= field.name %>_size); + for (uint32_t index = 0; index < <%= field.name %>_size; index++) { + pm_buffer_append_varuint(buffer, pm_sizet_to_u32(((pm_<%= node.human %>_t *)node)-><%= field.name %>.ids[index])); + } + <%- when Prism::Template::LocationField -%> + <%- if 
field.should_be_serialized? -%> + pm_serialize_location(parser, &((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer); + <%- end -%> + <%- when Prism::Template::OptionalLocationField -%> + <%- if field.should_be_serialized? -%> + if (((pm_<%= node.human %>_t *)node)-><%= field.name %>.start == NULL) { + pm_buffer_append_byte(buffer, 0); + } else { + pm_buffer_append_byte(buffer, 1); + pm_serialize_location(parser, &((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer); + } + <%- end -%> + <%- when Prism::Template::UInt8Field -%> + pm_buffer_append_byte(buffer, ((pm_<%= node.human %>_t *)node)-><%= field.name %>); + <%- when Prism::Template::UInt32Field -%> + pm_buffer_append_varuint(buffer, ((pm_<%= node.human %>_t *)node)-><%= field.name %>); + <%- when Prism::Template::IntegerField -%> + pm_serialize_integer(&((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer); + <%- when Prism::Template::DoubleField -%> + pm_buffer_append_double(buffer, ((pm_<%= node.human %>_t *)node)-><%= field.name %>); + <%- else -%> + <%- raise -%> + <%- end -%> + <%- end -%> + <%- if node.needs_serialized_length? 
-%> + // serialize length + uint32_t length = pm_sizet_to_u32(buffer->length - offset - sizeof(uint32_t)); + memcpy(buffer->value + length_offset, &length, sizeof(uint32_t)); + <%- end -%> + break; + } + <%- end -%> + } +} + +static void +pm_serialize_newline_list(pm_newline_list_t *list, pm_buffer_t *buffer) { + uint32_t size = pm_sizet_to_u32(list->size); + pm_buffer_append_varuint(buffer, size); + + for (uint32_t i = 0; i < size; i++) { + uint32_t offset = pm_sizet_to_u32(list->offsets[i]); + pm_buffer_append_varuint(buffer, offset); + } +} + +static void +pm_serialize_comment(pm_parser_t *parser, pm_comment_t *comment, pm_buffer_t *buffer) { + // serialize type + pm_buffer_append_byte(buffer, (uint8_t) comment->type); + + // serialize location + pm_serialize_location(parser, &comment->location, buffer); +} + +/** + * Serialize the given list of comments to the given buffer. + */ +void +pm_serialize_comment_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t *buffer) { + pm_buffer_append_varuint(buffer, pm_sizet_to_u32(pm_list_size(list))); + + pm_comment_t *comment; + for (comment = (pm_comment_t *) list->head; comment != NULL; comment = (pm_comment_t *) comment->node.next) { + pm_serialize_comment(parser, comment, buffer); + } +} + +static void +pm_serialize_magic_comment(pm_parser_t *parser, pm_magic_comment_t *magic_comment, pm_buffer_t *buffer) { + // serialize key location + pm_buffer_append_varuint(buffer, pm_ptrdifft_to_u32(magic_comment->key_start - parser->start)); + pm_buffer_append_varuint(buffer, pm_sizet_to_u32(magic_comment->key_length)); + + // serialize value location + pm_buffer_append_varuint(buffer, pm_ptrdifft_to_u32(magic_comment->value_start - parser->start)); + pm_buffer_append_varuint(buffer, pm_sizet_to_u32(magic_comment->value_length)); +} + +static void +pm_serialize_magic_comment_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t *buffer) { + pm_buffer_append_varuint(buffer, pm_sizet_to_u32(pm_list_size(list))); + + 
pm_magic_comment_t *magic_comment; + for (magic_comment = (pm_magic_comment_t *) list->head; magic_comment != NULL; magic_comment = (pm_magic_comment_t *) magic_comment->node.next) { + pm_serialize_magic_comment(parser, magic_comment, buffer); + } +} + +static void +pm_serialize_data_loc(const pm_parser_t *parser, pm_buffer_t *buffer) { + if (parser->data_loc.end == NULL) { + pm_buffer_append_byte(buffer, 0); + } else { + pm_buffer_append_byte(buffer, 1); + pm_serialize_location(parser, &parser->data_loc, buffer); + } +} + +static void +pm_serialize_diagnostic(pm_parser_t *parser, pm_diagnostic_t *diagnostic, pm_buffer_t *buffer) { + // serialize the type + pm_buffer_append_varuint(buffer, (uint32_t) diagnostic->diag_id); + + // serialize message + size_t message_length = strlen(diagnostic->message); + pm_buffer_append_varuint(buffer, pm_sizet_to_u32(message_length)); + pm_buffer_append_string(buffer, diagnostic->message, message_length); + + // serialize location + pm_serialize_location(parser, &diagnostic->location, buffer); + + pm_buffer_append_byte(buffer, diagnostic->level); +} + +static void +pm_serialize_diagnostic_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t *buffer) { + pm_buffer_append_varuint(buffer, pm_sizet_to_u32(pm_list_size(list))); + + pm_diagnostic_t *diagnostic; + for (diagnostic = (pm_diagnostic_t *) list->head; diagnostic != NULL; diagnostic = (pm_diagnostic_t *) diagnostic->node.next) { + pm_serialize_diagnostic(parser, diagnostic, buffer); + } +} + +/** + * Serialize the name of the encoding to the buffer. 
+ */ +void +pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer) { + size_t encoding_length = strlen(encoding->name); + pm_buffer_append_varuint(buffer, pm_sizet_to_u32(encoding_length)); + pm_buffer_append_string(buffer, encoding->name, encoding_length); +} + +static void +pm_serialize_metadata(pm_parser_t *parser, pm_buffer_t *buffer) { + pm_serialize_encoding(parser->encoding, buffer); + pm_buffer_append_varsint(buffer, parser->start_line); + pm_serialize_newline_list(&parser->newline_list, buffer); +<%- unless Prism::Template::SERIALIZE_ONLY_SEMANTICS_FIELDS -%> + pm_serialize_comment_list(parser, &parser->comment_list, buffer); +<%- end -%> + pm_serialize_magic_comment_list(parser, &parser->magic_comment_list, buffer); + pm_serialize_data_loc(parser, buffer); + pm_serialize_diagnostic_list(parser, &parser->error_list, buffer); + pm_serialize_diagnostic_list(parser, &parser->warning_list, buffer); +} + +#line <%= __LINE__ + 1 %> "prism/templates/src/<%= File.basename(__FILE__) %>" +/** + * Serialize the metadata, nodes, and constant pool. + */ +void +pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { + pm_serialize_metadata(parser, buffer); + + // Here we're going to leave space for the offset of the constant pool in + // the buffer. + size_t offset = buffer->length; + pm_buffer_append_zeroes(buffer, 4); + + // Next, encode the length of the constant pool. + pm_buffer_append_varuint(buffer, parser->constant_pool.size); + + // Now we're going to serialize the content of the node. + pm_serialize_node(parser, node, buffer); + + // Now we're going to serialize the offset of the constant pool back where + // we left space for it. + uint32_t length = pm_sizet_to_u32(buffer->length); + memcpy(buffer->value + offset, &length, sizeof(uint32_t)); + + // Now we're going to serialize the constant pool. 
+ offset = buffer->length; + pm_buffer_append_zeroes(buffer, parser->constant_pool.size * 8); + + for (uint32_t index = 0; index < parser->constant_pool.capacity; index++) { + pm_constant_pool_bucket_t *bucket = &parser->constant_pool.buckets[index]; + + // If we find a constant at this index, serialize it at the correct + // index in the buffer. + if (bucket->id != 0) { + pm_constant_t *constant = &parser->constant_pool.constants[bucket->id - 1]; + size_t buffer_offset = offset + ((((size_t)bucket->id) - 1) * 8); + + if (bucket->type == PM_CONSTANT_POOL_BUCKET_OWNED || bucket->type == PM_CONSTANT_POOL_BUCKET_CONSTANT) { + // Since this is an owned or constant constant, we are going to + // write its contents into the buffer after the constant pool. + // So effectively in place of the source offset, we have a + // buffer offset. We will add a leading 1 to indicate that this + // is a buffer offset. + uint32_t content_offset = pm_sizet_to_u32(buffer->length); + uint32_t owned_mask = 1U << 31; + + assert(content_offset < owned_mask); + content_offset |= owned_mask; + + memcpy(buffer->value + buffer_offset, &content_offset, 4); + pm_buffer_append_bytes(buffer, constant->start, constant->length); + } else { + // Since this is a shared constant, we are going to write its + // source offset directly into the buffer. + uint32_t source_offset = pm_ptrdifft_to_u32(constant->start - parser->start); + memcpy(buffer->value + buffer_offset, &source_offset, 4); + } + + // Now we can write the length of the constant into the buffer. 
+ uint32_t constant_length = pm_sizet_to_u32(constant->length); + memcpy(buffer->value + buffer_offset + 4, &constant_length, 4); + } + } +} + +static void +serialize_token(void *data, pm_parser_t *parser, pm_token_t *token) { + pm_buffer_t *buffer = (pm_buffer_t *) data; + + pm_buffer_append_varuint(buffer, token->type); + pm_buffer_append_varuint(buffer, pm_ptrdifft_to_u32(token->start - parser->start)); + pm_buffer_append_varuint(buffer, pm_ptrdifft_to_u32(token->end - token->start)); + pm_buffer_append_varuint(buffer, parser->lex_state); +} + +/** + * Lex the given source and serialize to the given buffer. + */ +PRISM_EXPORTED_FUNCTION void +pm_serialize_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data) { + pm_options_t options = { 0 }; + pm_options_read(&options, data); + + pm_parser_t parser; + pm_parser_init(&parser, source, size, &options); + + pm_lex_callback_t lex_callback = (pm_lex_callback_t) { + .data = (void *) buffer, + .callback = serialize_token, + }; + + parser.lex_callback = &lex_callback; + pm_node_t *node = pm_parse(&parser); + + // Append 0 to mark end of tokens. + pm_buffer_append_byte(buffer, 0); + + pm_serialize_metadata(&parser, buffer); + + pm_node_destroy(&parser, node); + pm_parser_free(&parser); + pm_options_free(&options); +} + +/** + * Parse and serialize both the AST and the tokens represented by the given + * source to the given buffer. 
+ */ +PRISM_EXPORTED_FUNCTION void +pm_serialize_parse_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data) { + pm_options_t options = { 0 }; + pm_options_read(&options, data); + + pm_parser_t parser; + pm_parser_init(&parser, source, size, &options); + + pm_lex_callback_t lex_callback = (pm_lex_callback_t) { + .data = (void *) buffer, + .callback = serialize_token, + }; + + parser.lex_callback = &lex_callback; + pm_node_t *node = pm_parse(&parser); + + pm_buffer_append_byte(buffer, 0); + pm_serialize(&parser, node, buffer); + + pm_node_destroy(&parser, node); + pm_parser_free(&parser); + pm_options_free(&options); +} + +#endif diff --git a/prism/templates/src/token_type.c.erb b/prism/templates/src/token_type.c.erb new file mode 100644 index 0000000000..f196393ee1 --- /dev/null +++ b/prism/templates/src/token_type.c.erb @@ -0,0 +1,369 @@ +#include <string.h> + +#include "prism/ast.h" + +/** + * Returns a string representation of the given token type. + */ +PRISM_EXPORTED_FUNCTION const char * +pm_token_type_name(pm_token_type_t token_type) { + switch (token_type) { +<%- tokens.each do |token| -%> + case PM_TOKEN_<%= token.name %>: + return "<%= token.name %>"; +<%- end -%> + case PM_TOKEN_MAXIMUM: + assert(false && "unreachable"); + return ""; + } + + // Provide a default, because some compilers can't determine that the above + // switch is exhaustive. + assert(false && "unreachable"); + return ""; +} + +/** + * Returns the human name of the given token type. 
+ */ +const char * +pm_token_type_human(pm_token_type_t token_type) { + switch (token_type) { + case PM_TOKEN_EOF: + return "end-of-input"; + case PM_TOKEN_MISSING: + return "missing token"; + case PM_TOKEN_NOT_PROVIDED: + return "not provided token"; + case PM_TOKEN_AMPERSAND: + return "'&'"; + case PM_TOKEN_AMPERSAND_AMPERSAND: + return "'&&'"; + case PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL: + return "'&&='"; + case PM_TOKEN_AMPERSAND_DOT: + return "'&.'"; + case PM_TOKEN_AMPERSAND_EQUAL: + return "'&='"; + case PM_TOKEN_BACKTICK: + return "'`'"; + case PM_TOKEN_BACK_REFERENCE: + return "back reference"; + case PM_TOKEN_BANG: + return "'!'"; + case PM_TOKEN_BANG_EQUAL: + return "'!='"; + case PM_TOKEN_BANG_TILDE: + return "'!~'"; + case PM_TOKEN_BRACE_LEFT: + return "'{'"; + case PM_TOKEN_BRACE_RIGHT: + return "'}'"; + case PM_TOKEN_BRACKET_LEFT: + return "'['"; + case PM_TOKEN_BRACKET_LEFT_ARRAY: + return "'['"; + case PM_TOKEN_BRACKET_LEFT_RIGHT: + return "'[]'"; + case PM_TOKEN_BRACKET_LEFT_RIGHT_EQUAL: + return "'[]='"; + case PM_TOKEN_BRACKET_RIGHT: + return "']'"; + case PM_TOKEN_CARET: + return "'^'"; + case PM_TOKEN_CARET_EQUAL: + return "'^='"; + case PM_TOKEN_CHARACTER_LITERAL: + return "character literal"; + case PM_TOKEN_CLASS_VARIABLE: + return "class variable"; + case PM_TOKEN_COLON: + return "':'"; + case PM_TOKEN_COLON_COLON: + return "'::'"; + case PM_TOKEN_COMMA: + return "','"; + case PM_TOKEN_COMMENT: + return "comment"; + case PM_TOKEN_CONSTANT: + return "constant"; + case PM_TOKEN_DOT: + return "'.'"; + case PM_TOKEN_DOT_DOT: + return ".."; + case PM_TOKEN_DOT_DOT_DOT: + return "..."; + case PM_TOKEN_EMBDOC_BEGIN: + return "'=begin'"; + case PM_TOKEN_EMBDOC_END: + return "'=end'"; + case PM_TOKEN_EMBDOC_LINE: + return "embedded documentation line"; + case PM_TOKEN_EMBEXPR_BEGIN: + return "'#{'"; + case PM_TOKEN_EMBEXPR_END: + return "'}'"; + case PM_TOKEN_EMBVAR: + return "'#'"; + case PM_TOKEN_EQUAL: + return "'='"; + case PM_TOKEN_EQUAL_EQUAL: 
+ return "'=='"; + case PM_TOKEN_EQUAL_EQUAL_EQUAL: + return "'==='"; + case PM_TOKEN_EQUAL_GREATER: + return "'=>'"; + case PM_TOKEN_EQUAL_TILDE: + return "'=~'"; + case PM_TOKEN_FLOAT: + return "float"; + case PM_TOKEN_FLOAT_IMAGINARY: + return "imaginary"; + case PM_TOKEN_FLOAT_RATIONAL: + return "rational"; + case PM_TOKEN_FLOAT_RATIONAL_IMAGINARY: + return "imaginary"; + case PM_TOKEN_GLOBAL_VARIABLE: + return "global variable"; + case PM_TOKEN_GREATER: + return "'>'"; + case PM_TOKEN_GREATER_EQUAL: + return "'>='"; + case PM_TOKEN_GREATER_GREATER: + return ">>"; + case PM_TOKEN_GREATER_GREATER_EQUAL: + return ">>="; + case PM_TOKEN_HEREDOC_END: + return "heredoc ending"; + case PM_TOKEN_HEREDOC_START: + return "heredoc beginning"; + case PM_TOKEN_IDENTIFIER: + return "local variable or method"; + case PM_TOKEN_IGNORED_NEWLINE: + return "ignored newline"; + case PM_TOKEN_INSTANCE_VARIABLE: + return "instance variable"; + case PM_TOKEN_INTEGER: + return "integer"; + case PM_TOKEN_INTEGER_IMAGINARY: + return "imaginary"; + case PM_TOKEN_INTEGER_RATIONAL: + return "rational"; + case PM_TOKEN_INTEGER_RATIONAL_IMAGINARY: + return "imaginary"; + case PM_TOKEN_KEYWORD_ALIAS: + return "'alias'"; + case PM_TOKEN_KEYWORD_AND: + return "'and'"; + case PM_TOKEN_KEYWORD_BEGIN: + return "'begin'"; + case PM_TOKEN_KEYWORD_BEGIN_UPCASE: + return "'BEGIN'"; + case PM_TOKEN_KEYWORD_BREAK: + return "'break'"; + case PM_TOKEN_KEYWORD_CASE: + return "'case'"; + case PM_TOKEN_KEYWORD_CLASS: + return "'class'"; + case PM_TOKEN_KEYWORD_DEF: + return "'def'"; + case PM_TOKEN_KEYWORD_DEFINED: + return "'defined?'"; + case PM_TOKEN_KEYWORD_DO: + return "'do'"; + case PM_TOKEN_KEYWORD_DO_LOOP: + return "'do'"; + case PM_TOKEN_KEYWORD_ELSE: + return "'else'"; + case PM_TOKEN_KEYWORD_ELSIF: + return "'elsif'"; + case PM_TOKEN_KEYWORD_END: + return "'end'"; + case PM_TOKEN_KEYWORD_END_UPCASE: + return "'END'"; + case PM_TOKEN_KEYWORD_ENSURE: + return "'ensure'"; + case 
PM_TOKEN_KEYWORD_FALSE: + return "'false'"; + case PM_TOKEN_KEYWORD_FOR: + return "'for'"; + case PM_TOKEN_KEYWORD_IF: + return "'if'"; + case PM_TOKEN_KEYWORD_IF_MODIFIER: + return "'if'"; + case PM_TOKEN_KEYWORD_IN: + return "'in'"; + case PM_TOKEN_KEYWORD_MODULE: + return "'module'"; + case PM_TOKEN_KEYWORD_NEXT: + return "'next'"; + case PM_TOKEN_KEYWORD_NIL: + return "'nil'"; + case PM_TOKEN_KEYWORD_NOT: + return "'not'"; + case PM_TOKEN_KEYWORD_OR: + return "'or'"; + case PM_TOKEN_KEYWORD_REDO: + return "'redo'"; + case PM_TOKEN_KEYWORD_RESCUE: + return "'rescue'"; + case PM_TOKEN_KEYWORD_RESCUE_MODIFIER: + return "'rescue' modifier"; + case PM_TOKEN_KEYWORD_RETRY: + return "'retry'"; + case PM_TOKEN_KEYWORD_RETURN: + return "'return'"; + case PM_TOKEN_KEYWORD_SELF: + return "'self'"; + case PM_TOKEN_KEYWORD_SUPER: + return "'super'"; + case PM_TOKEN_KEYWORD_THEN: + return "'then'"; + case PM_TOKEN_KEYWORD_TRUE: + return "'true'"; + case PM_TOKEN_KEYWORD_UNDEF: + return "'undef'"; + case PM_TOKEN_KEYWORD_UNLESS: + return "'unless'"; + case PM_TOKEN_KEYWORD_UNLESS_MODIFIER: + return "'unless'"; + case PM_TOKEN_KEYWORD_UNTIL: + return "'until'"; + case PM_TOKEN_KEYWORD_UNTIL_MODIFIER: + return "'until'"; + case PM_TOKEN_KEYWORD_WHEN: + return "'when'"; + case PM_TOKEN_KEYWORD_WHILE: + return "'while'"; + case PM_TOKEN_KEYWORD_WHILE_MODIFIER: + return "'while'"; + case PM_TOKEN_KEYWORD_YIELD: + return "'yield'"; + case PM_TOKEN_KEYWORD___ENCODING__: + return "'__ENCODING__'"; + case PM_TOKEN_KEYWORD___FILE__: + return "'__FILE__'"; + case PM_TOKEN_KEYWORD___LINE__: + return "'__LINE__'"; + case PM_TOKEN_LABEL: + return "label"; + case PM_TOKEN_LABEL_END: + return "label terminator"; + case PM_TOKEN_LAMBDA_BEGIN: + return "'{'"; + case PM_TOKEN_LESS: + return "'<'"; + case PM_TOKEN_LESS_EQUAL: + return "'<='"; + case PM_TOKEN_LESS_EQUAL_GREATER: + return "'<=>'"; + case PM_TOKEN_LESS_LESS: + return "<<"; + case PM_TOKEN_LESS_LESS_EQUAL: + return "<<="; + case 
PM_TOKEN_METHOD_NAME: + return "method name"; + case PM_TOKEN_MINUS: + return "'-'"; + case PM_TOKEN_MINUS_EQUAL: + return "'-='"; + case PM_TOKEN_MINUS_GREATER: + return "'->'"; + case PM_TOKEN_NEWLINE: + return "newline"; + case PM_TOKEN_NUMBERED_REFERENCE: + return "numbered reference"; + case PM_TOKEN_PARENTHESIS_LEFT: + return "'('"; + case PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES: + return "'('"; + case PM_TOKEN_PARENTHESIS_RIGHT: + return "')'"; + case PM_TOKEN_PERCENT: + return "'%'"; + case PM_TOKEN_PERCENT_EQUAL: + return "'%='"; + case PM_TOKEN_PERCENT_LOWER_I: + return "'%i'"; + case PM_TOKEN_PERCENT_LOWER_W: + return "'%w'"; + case PM_TOKEN_PERCENT_LOWER_X: + return "'%x'"; + case PM_TOKEN_PERCENT_UPPER_I: + return "'%I'"; + case PM_TOKEN_PERCENT_UPPER_W: + return "'%W'"; + case PM_TOKEN_PIPE: + return "'|'"; + case PM_TOKEN_PIPE_EQUAL: + return "'|='"; + case PM_TOKEN_PIPE_PIPE: + return "'||'"; + case PM_TOKEN_PIPE_PIPE_EQUAL: + return "'||='"; + case PM_TOKEN_PLUS: + return "'+'"; + case PM_TOKEN_PLUS_EQUAL: + return "'+='"; + case PM_TOKEN_QUESTION_MARK: + return "'?'"; + case PM_TOKEN_REGEXP_BEGIN: + return "regular expression beginning"; + case PM_TOKEN_REGEXP_END: + return "regular expression ending"; + case PM_TOKEN_SEMICOLON: + return "';'"; + case PM_TOKEN_SLASH: + return "'/'"; + case PM_TOKEN_SLASH_EQUAL: + return "'/='"; + case PM_TOKEN_STAR: + return "'*'"; + case PM_TOKEN_STAR_EQUAL: + return "'*='"; + case PM_TOKEN_STAR_STAR: + return "'**'"; + case PM_TOKEN_STAR_STAR_EQUAL: + return "'**='"; + case PM_TOKEN_STRING_BEGIN: + return "string literal"; + case PM_TOKEN_STRING_CONTENT: + return "string content"; + case PM_TOKEN_STRING_END: + return "string ending"; + case PM_TOKEN_SYMBOL_BEGIN: + return "symbol literal"; + case PM_TOKEN_TILDE: + return "'~'"; + case PM_TOKEN_UAMPERSAND: + return "'&'"; + case PM_TOKEN_UCOLON_COLON: + return "'::'"; + case PM_TOKEN_UDOT_DOT: + return "'..'"; + case PM_TOKEN_UDOT_DOT_DOT: + return "'...'"; + case 
PM_TOKEN_UMINUS: + return "'-'"; + case PM_TOKEN_UMINUS_NUM: + return "'-'"; + case PM_TOKEN_UPLUS: + return "'+'"; + case PM_TOKEN_USTAR: + return "*"; + case PM_TOKEN_USTAR_STAR: + return "**"; + case PM_TOKEN_WORDS_SEP: + return "string separator"; + case PM_TOKEN___END__: + return "'__END__'"; + case PM_TOKEN_MAXIMUM: + assert(false && "unreachable"); + return ""; + } + + // Provide a default, because some compilers can't determine that the above + // switch is exhaustive. + assert(false && "unreachable"); + return ""; +} diff --git a/prism/templates/template.rb b/prism/templates/template.rb new file mode 100755 index 0000000000..6c3efd7e6c --- /dev/null +++ b/prism/templates/template.rb @@ -0,0 +1,689 @@ +#!/usr/bin/env ruby +# typed: ignore + +require "erb" +require "fileutils" +require "yaml" + +module Prism + module Template + SERIALIZE_ONLY_SEMANTICS_FIELDS = ENV.fetch("PRISM_SERIALIZE_ONLY_SEMANTICS_FIELDS", false) + REMOVE_ON_ERROR_TYPES = SERIALIZE_ONLY_SEMANTICS_FIELDS + CHECK_FIELD_KIND = ENV.fetch("CHECK_FIELD_KIND", false) + + JAVA_BACKEND = ENV["PRISM_JAVA_BACKEND"] || "truffleruby" + JAVA_STRING_TYPE = JAVA_BACKEND == "jruby" ? "org.jruby.RubySymbol" : "String" + INCLUDE_NODE_ID = !SERIALIZE_ONLY_SEMANTICS_FIELDS || JAVA_BACKEND == "jruby" + + COMMON_FLAGS_COUNT = 2 + + class Error + attr_reader :name + + def initialize(name) + @name = name + end + end + + class Warning + attr_reader :name + + def initialize(name) + @name = name + end + end + + # This module contains methods for escaping characters in JavaDoc comments. + module JavaDoc + ESCAPES = { + "'" => "'", + "\"" => """, + "@" => "@", + "&" => "&", + "<" => "<", + ">" => ">" + }.freeze + + def self.escape(value) + value.gsub(/['&"<>@]/, ESCAPES) + end + end + + # A comment attached to a field or node. 
+ class ConfigComment + attr_reader :value + + def initialize(value) + @value = value + end + + def each_line(&block) + value.each_line { |line| yield line.prepend(" ").rstrip } + end + + def each_java_line(&block) + ConfigComment.new(JavaDoc.escape(value)).each_line(&block) + end + end + + # This represents a field on a node. It contains all of the necessary + # information to template out the code for that field. + class Field + attr_reader :name, :comment, :options + + def initialize(name:, comment: nil, **options) + @name = name + @comment = comment + @options = options + end + + def each_comment_line(&block) + ConfigComment.new(comment).each_line(&block) if comment + end + + def each_comment_java_line(&block) + ConfigComment.new(comment).each_java_line(&block) if comment + end + + def semantic_field? + true + end + + def should_be_serialized? + SERIALIZE_ONLY_SEMANTICS_FIELDS ? semantic_field? : true + end + end + + # Some node fields can be specialized if they point to a specific kind of + # node and not just a generic node. + class NodeKindField < Field + def initialize(kind:, **options) + @kind = kind + super(**options) + end + + def c_type + if specific_kind + "pm_#{specific_kind.gsub(/(?<=.)[A-Z]/, "_\\0").downcase}" + else + "pm_node" + end + end + + def ruby_type + specific_kind || "Node" + end + + def java_type + specific_kind || "Node" + end + + def java_cast + if specific_kind + "(Nodes.#{@kind}) " + else + "" + end + end + + def specific_kind + @kind unless @kind.is_a?(Array) + end + + def union_kind + @kind if @kind.is_a?(Array) + end + end + + # This represents a field on a node that is itself a node. We pass them as + # references and store them as references. 
+ class NodeField < NodeKindField + def rbs_class + if specific_kind + specific_kind + elsif union_kind + union_kind.join(" | ") + else + "Prism::node" + end + end + + def rbi_class + if specific_kind + "Prism::#{specific_kind}" + elsif union_kind + "T.any(#{union_kind.map { |kind| "Prism::#{kind}" }.join(", ")})" + else + "Prism::Node" + end + end + + def check_field_kind + if union_kind + "[#{union_kind.join(', ')}].include?(#{name}.class)" + else + "#{name}.is_a?(#{ruby_type})" + end + end + end + + # This represents a field on a node that is itself a node and can be + # optionally null. We pass them as references and store them as references. + class OptionalNodeField < NodeKindField + def rbs_class + if specific_kind + "#{specific_kind}?" + elsif union_kind + [*union_kind, "nil"].join(" | ") + else + "Prism::node?" + end + end + + def rbi_class + if specific_kind + "T.nilable(Prism::#{specific_kind})" + elsif union_kind + "T.nilable(T.any(#{union_kind.map { |kind| "Prism::#{kind}" }.join(", ")}))" + else + "T.nilable(Prism::Node)" + end + end + + def check_field_kind + if union_kind + "[#{union_kind.join(', ')}, NilClass].include?(#{name}.class)" + else + "#{name}.nil? || #{name}.is_a?(#{ruby_type})" + end + end + end + + # This represents a field on a node that is a list of nodes. We pass them as + # references and store them directly on the struct. + class NodeListField < NodeKindField + def rbs_class + if specific_kind + "Array[#{specific_kind}]" + elsif union_kind + "Array[#{union_kind.join(" | ")}]" + else + "Array[Prism::node]" + end + end + + def rbi_class + if specific_kind + "T::Array[Prism::#{specific_kind}]" + elsif union_kind + "T::Array[T.any(#{union_kind.map { |kind| "Prism::#{kind}" }.join(", ")})]" + else + "T::Array[Prism::Node]" + end + end + + def java_type + "#{super}[]" + end + + def check_field_kind + if union_kind + "#{name}.all? { |n| [#{union_kind.join(', ')}].include?(n.class) }" + else + "#{name}.all? 
{ |n| n.is_a?(#{ruby_type}) }" + end + end + end + + # This represents a field on a node that is the ID of a string interned + # through the parser's constant pool. + class ConstantField < Field + def rbs_class + "Symbol" + end + + def rbi_class + "Symbol" + end + + def java_type + JAVA_STRING_TYPE + end + end + + # This represents a field on a node that is the ID of a string interned + # through the parser's constant pool and can be optionally null. + class OptionalConstantField < Field + def rbs_class + "Symbol?" + end + + def rbi_class + "T.nilable(Symbol)" + end + + def java_type + JAVA_STRING_TYPE + end + end + + # This represents a field on a node that is a list of IDs that are associated + # with strings interned through the parser's constant pool. + class ConstantListField < Field + def rbs_class + "Array[Symbol]" + end + + def rbi_class + "T::Array[Symbol]" + end + + def java_type + "#{JAVA_STRING_TYPE}[]" + end + end + + # This represents a field on a node that is a string. + class StringField < Field + def rbs_class + "String" + end + + def rbi_class + "String" + end + + def java_type + "byte[]" + end + end + + # This represents a field on a node that is a location. + class LocationField < Field + def semantic_field? + false + end + + def rbs_class + "Location" + end + + def rbi_class + "Prism::Location" + end + + def java_type + "Location" + end + end + + # This represents a field on a node that is a location that is optional. + class OptionalLocationField < Field + def semantic_field? + false + end + + def rbs_class + "Location?" + end + + def rbi_class + "T.nilable(Prism::Location)" + end + + def java_type + "Location" + end + end + + # This represents an integer field. + class UInt8Field < Field + def rbs_class + "Integer" + end + + def rbi_class + "Integer" + end + + def java_type + "int" + end + end + + # This represents an integer field. 
+ class UInt32Field < Field + def rbs_class + "Integer" + end + + def rbi_class + "Integer" + end + + def java_type + "int" + end + end + + # This represents an arbitrarily-sized integer. When it gets to Ruby it will + # be an Integer. + class IntegerField < Field + def rbs_class + "Integer" + end + + def rbi_class + "Integer" + end + + def java_type + "Object" + end + end + + # This represents a double-precision floating point number. When it gets to + # Ruby it will be a Float. + class DoubleField < Field + def rbs_class + "Float" + end + + def rbi_class + "Float" + end + + def java_type + "double" + end + end + + # This class represents a node in the tree, configured by the config.yml file + # in YAML format. It contains information about the name of the node and the + # various child nodes it contains. + class NodeType + attr_reader :name, :type, :human, :flags, :fields, :newline, :comment + + def initialize(config, flags) + @name = config.fetch("name") + + type = @name.gsub(/(?<=.)[A-Z]/, "_\\0") + @type = "PM_#{type.upcase}" + @human = type.downcase + + @fields = + config.fetch("fields", []).map do |field| + type = field_type_for(field.fetch("type")) + + options = field.transform_keys(&:to_sym) + options.delete(:type) + + # If/when we have documentation on every field, this should be + # changed to use fetch instead of delete. + comment = options.delete(:comment) + + if kinds = options[:kind] + kinds = [kinds] unless kinds.is_a?(Array) + kinds = kinds.map do |kind| + case kind + when "non-void expression" + # the actual list of types would be way too long + "Node" + when "pattern expression" + # the list of all possible types is too long with 37+ different classes + "Node" + when Hash + kind = kind.fetch("on error") + REMOVE_ON_ERROR_TYPES ? 
nil : kind + else + kind + end + end.compact + if kinds.size == 1 + kinds = kinds.first + kinds = nil if kinds == "Node" + end + options[:kind] = kinds + else + if type < NodeKindField + raise "Missing kind in config.yml for field #{@name}##{options.fetch(:name)}" + end + end + + type.new(comment: comment, **options) + end + + @flags = config.key?("flags") ? flags.fetch(config.fetch("flags")) : nil + @newline = config.fetch("newline", true) + @comment = config.fetch("comment") + end + + def each_comment_line(&block) + ConfigComment.new(comment).each_line(&block) + end + + def each_comment_java_line(&block) + ConfigComment.new(comment).each_java_line(&block) + end + + def semantic_fields + @semantic_fields ||= @fields.select(&:semantic_field?) + end + + # Should emit serialized length of node so implementations can skip + # the node to enable lazy parsing. + def needs_serialized_length? + name == "DefNode" + end + + private + + def field_type_for(name) + case name + when "node" then NodeField + when "node?" then OptionalNodeField + when "node[]" then NodeListField + when "string" then StringField + when "constant" then ConstantField + when "constant?" then OptionalConstantField + when "constant[]" then ConstantListField + when "location" then LocationField + when "location?" then OptionalLocationField + when "uint8" then UInt8Field + when "uint32" then UInt32Field + when "integer" then IntegerField + when "double" then DoubleField + else raise("Unknown field type: #{name.inspect}") + end + end + end + + # This represents a token in the lexer. + class Token + attr_reader :name, :value, :comment + + def initialize(config) + @name = config.fetch("name") + @value = config["value"] + @comment = config.fetch("comment") + end + end + + # Represents a set of flags that should be internally represented with an enum. + class Flags + # Represents an individual flag within a set of flags. 
+ class Flag + attr_reader :name, :camelcase, :comment + + def initialize(config) + @name = config.fetch("name") + @camelcase = @name.split("_").map(&:capitalize).join + @comment = config.fetch("comment") + end + end + + attr_reader :name, :human, :values, :comment + + def initialize(config) + @name = config.fetch("name") + @human = @name.gsub(/(?<=.)[A-Z]/, "_\\0").downcase + @values = config.fetch("values").map { |flag| Flag.new(flag) } + @comment = config.fetch("comment") + end + + def self.empty + new("name" => "", "values" => [], "comment" => "") + end + end + + class << self + # This templates out a file using ERB with the given locals. The locals are + # derived from the config.yml file. + def render(name, write_to: nil) + filepath = "templates/#{name}.erb" + template = File.expand_path("../#{filepath}", __dir__) + + erb = read_template(template) + extension = File.extname(filepath.gsub(".erb", "")) + + heading = + case extension + when ".rb" + <<~HEADING + # frozen_string_literal: true + # :markup: markdown + + =begin + -- + This file is generated by the templates/template.rb script and should not be + modified manually. See #{filepath} + if you are looking to modify the template + ++ + =end + + HEADING + when ".rbs" + <<~HEADING + # This file is generated by the templates/template.rb script and should not be + # modified manually. See #{filepath} + # if you are looking to modify the template + + HEADING + when ".rbi" + <<~HEADING + # typed: strict + + =begin + This file is generated by the templates/template.rb script and should not be + modified manually. See #{filepath} + if you are looking to modify the template + =end + + HEADING + else + <<~HEADING + /* :markup: markdown */ + + /*----------------------------------------------------------------------------*/ + /* This file is generated by the templates/template.rb script and should not */ + /* be modified manually. 
See */ + /* #{filepath.ljust(74)} */ + /* if you are looking to modify the */ + /* template */ + /*----------------------------------------------------------------------------*/ + + HEADING + end + + write_to ||= File.expand_path("../#{name}", __dir__) + contents = heading + erb.result_with_hash(locals) + + if (extension == ".c" || extension == ".h") && !contents.ascii_only? + # Enforce that we only have ASCII characters here. This is necessary + # for non-UTF-8 locales that only allow ASCII characters in C source + # files. + contents.each_line.with_index(1) do |line, line_number| + raise "Non-ASCII character on line #{line_number} of #{write_to}" unless line.ascii_only? + end + end + + FileUtils.mkdir_p(File.dirname(write_to)) + File.write(write_to, contents) + end + + private + + def read_template(filepath) + template = File.read(filepath, encoding: Encoding::UTF_8) + erb = erb(template) + erb.filename = filepath + erb + end + + def erb(template) + ERB.new(template, trim_mode: "-") + end + + def locals + @locals ||= + begin + config = YAML.load_file(File.expand_path("../config.yml", __dir__)) + flags = config.fetch("flags").to_h { |flags| [flags["name"], Flags.new(flags)] } + + { + errors: config.fetch("errors").map { |name| Error.new(name) }, + warnings: config.fetch("warnings").map { |name| Warning.new(name) }, + nodes: config.fetch("nodes").map { |node| NodeType.new(node, flags) }.sort_by(&:name), + tokens: config.fetch("tokens").map { |token| Token.new(token) }, + flags: flags.values + } + end + end + end + + TEMPLATES = [ + "ext/prism/api_node.c", + "include/prism/ast.h", + "include/prism/diagnostic.h", + "javascript/src/deserialize.js", + "javascript/src/nodes.js", + "javascript/src/visitor.js", + "java/org/prism/Loader.java", + "java/org/prism/Nodes.java", + "java/org/prism/AbstractNodeVisitor.java", + "lib/prism/compiler.rb", + "lib/prism/dispatcher.rb", + "lib/prism/dot_visitor.rb", + "lib/prism/dsl.rb", + "lib/prism/inspect_visitor.rb", + 
"lib/prism/mutation_compiler.rb", + "lib/prism/node.rb", + "lib/prism/reflection.rb", + "lib/prism/serialize.rb", + "lib/prism/visitor.rb", + "src/diagnostic.c", + "src/node.c", + "src/prettyprint.c", + "src/serialize.c", + "src/token_type.c", + "rbi/prism/dsl.rbi", + "rbi/prism/node.rbi", + "rbi/prism/visitor.rbi", + "sig/prism.rbs", + "sig/prism/dsl.rbs", + "sig/prism/mutation_compiler.rbs", + "sig/prism/node.rbs", + "sig/prism/visitor.rbs", + "sig/prism/_private/dot_visitor.rbs" + ] + end +end + +if __FILE__ == $0 + if ARGV.empty? + Prism::Template::TEMPLATES.each { |filepath| Prism::Template.render(filepath) } + else # ruby/ruby + name, write_to = ARGV + Prism::Template.render(name, write_to: write_to) + end +end diff --git a/prism/util/pm_buffer.c b/prism/util/pm_buffer.c new file mode 100644 index 0000000000..2136a7c43e --- /dev/null +++ b/prism/util/pm_buffer.c @@ -0,0 +1,357 @@ +#include "prism/util/pm_buffer.h" + +/** + * Return the size of the pm_buffer_t struct. + */ +size_t +pm_buffer_sizeof(void) { + return sizeof(pm_buffer_t); +} + +/** + * Initialize a pm_buffer_t with the given capacity. + */ +bool +pm_buffer_init_capacity(pm_buffer_t *buffer, size_t capacity) { + buffer->length = 0; + buffer->capacity = capacity; + + buffer->value = (char *) xmalloc(capacity); + return buffer->value != NULL; +} + +/** + * Initialize a pm_buffer_t with its default values. + */ +bool +pm_buffer_init(pm_buffer_t *buffer) { + return pm_buffer_init_capacity(buffer, 1024); +} + +/** + * Return the value of the buffer. + */ +char * +pm_buffer_value(const pm_buffer_t *buffer) { + return buffer->value; +} + +/** + * Return the length of the buffer. + */ +size_t +pm_buffer_length(const pm_buffer_t *buffer) { + return buffer->length; +} + +/** + * Append the given amount of space to the buffer. 
+ */ +static inline bool +pm_buffer_append_length(pm_buffer_t *buffer, size_t length) { + size_t next_length = buffer->length + length; + + if (next_length > buffer->capacity) { + if (buffer->capacity == 0) { + buffer->capacity = 1; + } + + while (next_length > buffer->capacity) { + buffer->capacity *= 2; + } + + buffer->value = xrealloc(buffer->value, buffer->capacity); + if (buffer->value == NULL) return false; + } + + buffer->length = next_length; + return true; +} + +/** + * Append a generic pointer to memory to the buffer. + */ +static inline void +pm_buffer_append(pm_buffer_t *buffer, const void *source, size_t length) { + size_t cursor = buffer->length; + if (pm_buffer_append_length(buffer, length)) { + memcpy(buffer->value + cursor, source, length); + } +} + +/** + * Append the given amount of space as zeroes to the buffer. + */ +void +pm_buffer_append_zeroes(pm_buffer_t *buffer, size_t length) { + size_t cursor = buffer->length; + if (pm_buffer_append_length(buffer, length)) { + memset(buffer->value + cursor, 0, length); + } +} + +/** + * Append a formatted string to the buffer. + */ +void +pm_buffer_append_format(pm_buffer_t *buffer, const char *format, ...) { + va_list arguments; + va_start(arguments, format); + int result = vsnprintf(NULL, 0, format, arguments); + va_end(arguments); + + if (result < 0) return; + size_t length = (size_t) (result + 1); + + size_t cursor = buffer->length; + if (pm_buffer_append_length(buffer, length)) { + va_start(arguments, format); + vsnprintf(buffer->value + cursor, length, format, arguments); + va_end(arguments); + buffer->length--; + } +} + +/** + * Append a string to the buffer. + */ +void +pm_buffer_append_string(pm_buffer_t *buffer, const char *value, size_t length) { + pm_buffer_append(buffer, value, length); +} + +/** + * Append a list of bytes to the buffer. 
+ */ +void +pm_buffer_append_bytes(pm_buffer_t *buffer, const uint8_t *value, size_t length) { + pm_buffer_append(buffer, (const char *) value, length); +} + +/** + * Append a single byte to the buffer. + */ +void +pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value) { + const void *source = &value; + pm_buffer_append(buffer, source, sizeof(uint8_t)); +} + +/** + * Append a 32-bit unsigned integer to the buffer as a variable-length integer. + */ +void +pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value) { + if (value < 128) { + pm_buffer_append_byte(buffer, (uint8_t) value); + } else { + uint32_t n = value; + while (n >= 128) { + pm_buffer_append_byte(buffer, (uint8_t) (n | 128)); + n >>= 7; + } + pm_buffer_append_byte(buffer, (uint8_t) n); + } +} + +/** + * Append a 32-bit signed integer to the buffer as a variable-length integer. + */ +void +pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value) { + uint32_t unsigned_int = ((uint32_t)(value) << 1) ^ ((uint32_t)(value >> 31)); + pm_buffer_append_varuint(buffer, unsigned_int); +} + +/** + * Append a double to the buffer. + */ +void +pm_buffer_append_double(pm_buffer_t *buffer, double value) { + const void *source = &value; + pm_buffer_append(buffer, source, sizeof(double)); +} + +/** + * Append a unicode codepoint to the buffer. 
+ */ +bool +pm_buffer_append_unicode_codepoint(pm_buffer_t *buffer, uint32_t value) { + if (value <= 0x7F) { + pm_buffer_append_byte(buffer, (uint8_t) value); // 0xxxxxxx + return true; + } else if (value <= 0x7FF) { + uint8_t bytes[] = { + (uint8_t) (0xC0 | ((value >> 6) & 0x3F)), // 110xxxxx + (uint8_t) (0x80 | (value & 0x3F)) // 10xxxxxx + }; + + pm_buffer_append_bytes(buffer, bytes, 2); + return true; + } else if (value <= 0xFFFF) { + uint8_t bytes[] = { + (uint8_t) (0xE0 | ((value >> 12) & 0x3F)), // 1110xxxx + (uint8_t) (0x80 | ((value >> 6) & 0x3F)), // 10xxxxxx + (uint8_t) (0x80 | (value & 0x3F)) // 10xxxxxx + }; + + pm_buffer_append_bytes(buffer, bytes, 3); + return true; + } else if (value <= 0x10FFFF) { + uint8_t bytes[] = { + (uint8_t) (0xF0 | ((value >> 18) & 0x3F)), // 11110xxx + (uint8_t) (0x80 | ((value >> 12) & 0x3F)), // 10xxxxxx + (uint8_t) (0x80 | ((value >> 6) & 0x3F)), // 10xxxxxx + (uint8_t) (0x80 | (value & 0x3F)) // 10xxxxxx + }; + + pm_buffer_append_bytes(buffer, bytes, 4); + return true; + } else { + return false; + } +} + +/** + * Append a slice of source code to the buffer. 
+ */ +void +pm_buffer_append_source(pm_buffer_t *buffer, const uint8_t *source, size_t length, pm_buffer_escaping_t escaping) { + for (size_t index = 0; index < length; index++) { + const uint8_t byte = source[index]; + + if ((byte <= 0x06) || (byte >= 0x0E && byte <= 0x1F) || (byte >= 0x7F)) { + if (escaping == PM_BUFFER_ESCAPING_RUBY) { + pm_buffer_append_format(buffer, "\\x%02X", byte); + } else { + pm_buffer_append_format(buffer, "\\u%04X", byte); + } + } else { + switch (byte) { + case '\a': + if (escaping == PM_BUFFER_ESCAPING_RUBY) { + pm_buffer_append_string(buffer, "\\a", 2); + } else { + pm_buffer_append_format(buffer, "\\u%04X", byte); + } + break; + case '\b': + pm_buffer_append_string(buffer, "\\b", 2); + break; + case '\t': + pm_buffer_append_string(buffer, "\\t", 2); + break; + case '\n': + pm_buffer_append_string(buffer, "\\n", 2); + break; + case '\v': + if (escaping == PM_BUFFER_ESCAPING_RUBY) { + pm_buffer_append_string(buffer, "\\v", 2); + } else { + pm_buffer_append_format(buffer, "\\u%04X", byte); + } + break; + case '\f': + pm_buffer_append_string(buffer, "\\f", 2); + break; + case '\r': + pm_buffer_append_string(buffer, "\\r", 2); + break; + case '"': + pm_buffer_append_string(buffer, "\\\"", 2); + break; + case '#': { + if (escaping == PM_BUFFER_ESCAPING_RUBY && index + 1 < length) { + const uint8_t next_byte = source[index + 1]; + if (next_byte == '{' || next_byte == '@' || next_byte == '$') { + pm_buffer_append_byte(buffer, '\\'); + } + } + + pm_buffer_append_byte(buffer, '#'); + break; + } + case '\\': + pm_buffer_append_string(buffer, "\\\\", 2); + break; + default: + pm_buffer_append_byte(buffer, byte); + break; + } + } + } +} + +/** + * Prepend the given string to the buffer. 
+ */ +void +pm_buffer_prepend_string(pm_buffer_t *buffer, const char *value, size_t length) { + size_t cursor = buffer->length; + if (pm_buffer_append_length(buffer, length)) { + memmove(buffer->value + length, buffer->value, cursor); + memcpy(buffer->value, value, length); + } +} + +/** + * Concatenate one buffer onto another. + */ +void +pm_buffer_concat(pm_buffer_t *destination, const pm_buffer_t *source) { + if (source->length > 0) { + pm_buffer_append(destination, source->value, source->length); + } +} + +/** + * Clear the buffer by reducing its size to 0. This does not free the allocated + * memory, but it does allow the buffer to be reused. + */ +void +pm_buffer_clear(pm_buffer_t *buffer) { + buffer->length = 0; +} + +/** + * Strip the whitespace from the end of the buffer. + */ +void +pm_buffer_rstrip(pm_buffer_t *buffer) { + while (buffer->length > 0 && pm_char_is_whitespace((uint8_t) buffer->value[buffer->length - 1])) { + buffer->length--; + } +} + +/** + * Checks if the buffer includes the given value. + */ +size_t +pm_buffer_index(const pm_buffer_t *buffer, char value) { + const char *first = memchr(buffer->value, value, buffer->length); + return (first == NULL) ? SIZE_MAX : (size_t) (first - buffer->value); +} + +/** + * Insert the given string into the buffer at the given index. + */ +void +pm_buffer_insert(pm_buffer_t *buffer, size_t index, const char *value, size_t length) { + assert(index <= buffer->length); + + if (index == buffer->length) { + pm_buffer_append_string(buffer, value, length); + } else { + pm_buffer_append_zeroes(buffer, length); + memmove(buffer->value + index + length, buffer->value + index, buffer->length - length - index); + memcpy(buffer->value + index, value, length); + } +} + +/** + * Free the memory associated with the buffer. 
+ */ +void +pm_buffer_free(pm_buffer_t *buffer) { + xfree(buffer->value); +} diff --git a/prism/util/pm_buffer.h b/prism/util/pm_buffer.h new file mode 100644 index 0000000000..cb80f8b3ce --- /dev/null +++ b/prism/util/pm_buffer.h @@ -0,0 +1,236 @@ +/** + * @file pm_buffer.h + * + * A wrapper around a contiguous block of allocated memory. + */ +#ifndef PRISM_BUFFER_H +#define PRISM_BUFFER_H + +#include "prism/defines.h" +#include "prism/util/pm_char.h" + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +/** + * A pm_buffer_t is a simple memory buffer that stores data in a contiguous + * block of memory. + */ +typedef struct { + /** The length of the buffer in bytes. */ + size_t length; + + /** The capacity of the buffer in bytes that has been allocated. */ + size_t capacity; + + /** A pointer to the start of the buffer. */ + char *value; +} pm_buffer_t; + +/** + * Return the size of the pm_buffer_t struct. + * + * @returns The size of the pm_buffer_t struct. + */ +PRISM_EXPORTED_FUNCTION size_t pm_buffer_sizeof(void); + +/** + * Initialize a pm_buffer_t with the given capacity. + * + * @param buffer The buffer to initialize. + * @param capacity The capacity of the buffer. + * @returns True if the buffer was initialized successfully, false otherwise. + */ +bool pm_buffer_init_capacity(pm_buffer_t *buffer, size_t capacity); + +/** + * Initialize a pm_buffer_t with its default values. + * + * @param buffer The buffer to initialize. + * @returns True if the buffer was initialized successfully, false otherwise. + * + * \public \memberof pm_buffer_t + */ +PRISM_EXPORTED_FUNCTION bool pm_buffer_init(pm_buffer_t *buffer); + +/** + * Return the value of the buffer. + * + * @param buffer The buffer to get the value of. + * @returns The value of the buffer. + * + * \public \memberof pm_buffer_t + */ +PRISM_EXPORTED_FUNCTION char * pm_buffer_value(const pm_buffer_t *buffer); + +/** + * Return the length of the buffer. 
+ * + * @param buffer The buffer to get the length of. + * @returns The length of the buffer. + * + * \public \memberof pm_buffer_t + */ +PRISM_EXPORTED_FUNCTION size_t pm_buffer_length(const pm_buffer_t *buffer); + +/** + * Append the given amount of space as zeroes to the buffer. + * + * @param buffer The buffer to append to. + * @param length The amount of space to append and zero. + */ +void pm_buffer_append_zeroes(pm_buffer_t *buffer, size_t length); + +/** + * Append a formatted string to the buffer. + * + * @param buffer The buffer to append to. + * @param format The format string to append. + * @param ... The arguments to the format string. + */ +void pm_buffer_append_format(pm_buffer_t *buffer, const char *format, ...) PRISM_ATTRIBUTE_FORMAT(2, 3); + +/** + * Append a string to the buffer. + * + * @param buffer The buffer to append to. + * @param value The string to append. + * @param length The length of the string to append. + */ +void pm_buffer_append_string(pm_buffer_t *buffer, const char *value, size_t length); + +/** + * Append a list of bytes to the buffer. + * + * @param buffer The buffer to append to. + * @param value The bytes to append. + * @param length The length of the bytes to append. + */ +void pm_buffer_append_bytes(pm_buffer_t *buffer, const uint8_t *value, size_t length); + +/** + * Append a single byte to the buffer. + * + * @param buffer The buffer to append to. + * @param value The byte to append. + */ +void pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value); + +/** + * Append a 32-bit unsigned integer to the buffer as a variable-length integer. + * + * @param buffer The buffer to append to. + * @param value The integer to append. + */ +void pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value); + +/** + * Append a 32-bit signed integer to the buffer as a variable-length integer. + * + * @param buffer The buffer to append to. + * @param value The integer to append. 
+ */ +void pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value); + +/** + * Append a double to the buffer. + * + * @param buffer The buffer to append to. + * @param value The double to append. + */ +void pm_buffer_append_double(pm_buffer_t *buffer, double value); + +/** + * Append a unicode codepoint to the buffer. + * + * @param buffer The buffer to append to. + * @param value The character to append. + * @returns True if the codepoint was valid and appended successfully, false + * otherwise. + */ +bool pm_buffer_append_unicode_codepoint(pm_buffer_t *buffer, uint32_t value); + +/** + * The different types of escaping that can be performed by the buffer when + * appending a slice of Ruby source code. + */ +typedef enum { + PM_BUFFER_ESCAPING_RUBY, + PM_BUFFER_ESCAPING_JSON +} pm_buffer_escaping_t; + +/** + * Append a slice of source code to the buffer. + * + * @param buffer The buffer to append to. + * @param source The source code to append. + * @param length The length of the source code to append. + * @param escaping The type of escaping to perform. + */ +void pm_buffer_append_source(pm_buffer_t *buffer, const uint8_t *source, size_t length, pm_buffer_escaping_t escaping); + +/** + * Prepend the given string to the buffer. + * + * @param buffer The buffer to prepend to. + * @param value The string to prepend. + * @param length The length of the string to prepend. + */ +void pm_buffer_prepend_string(pm_buffer_t *buffer, const char *value, size_t length); + +/** + * Concatenate one buffer onto another. + * + * @param destination The buffer to concatenate onto. + * @param source The buffer to concatenate. + */ +void pm_buffer_concat(pm_buffer_t *destination, const pm_buffer_t *source); + +/** + * Clear the buffer by reducing its size to 0. This does not free the allocated + * memory, but it does allow the buffer to be reused. + * + * @param buffer The buffer to clear. 
+ */ +void pm_buffer_clear(pm_buffer_t *buffer); + +/** + * Strip the whitespace from the end of the buffer. + * + * @param buffer The buffer to strip. + */ +void pm_buffer_rstrip(pm_buffer_t *buffer); + +/** + * Checks if the buffer includes the given value. + * + * @param buffer The buffer to check. + * @param value The value to check for. + * @returns The index of the first occurrence of the value in the buffer, or + * SIZE_MAX if the value is not found. + */ +size_t pm_buffer_index(const pm_buffer_t *buffer, char value); + +/** + * Insert the given string into the buffer at the given index. + * + * @param buffer The buffer to insert into. + * @param index The index to insert at. + * @param value The string to insert. + * @param length The length of the string to insert. + */ +void pm_buffer_insert(pm_buffer_t *buffer, size_t index, const char *value, size_t length); + +/** + * Free the memory associated with the buffer. + * + * @param buffer The buffer to free. + * + * \public \memberof pm_buffer_t + */ +PRISM_EXPORTED_FUNCTION void pm_buffer_free(pm_buffer_t *buffer); + +#endif diff --git a/prism/util/pm_char.c b/prism/util/pm_char.c new file mode 100644 index 0000000000..a51dc11645 --- /dev/null +++ b/prism/util/pm_char.c @@ -0,0 +1,318 @@ +#include "prism/util/pm_char.h" + +#define PRISM_CHAR_BIT_WHITESPACE (1 << 0) +#define PRISM_CHAR_BIT_INLINE_WHITESPACE (1 << 1) +#define PRISM_CHAR_BIT_REGEXP_OPTION (1 << 2) + +#define PRISM_NUMBER_BIT_BINARY_DIGIT (1 << 0) +#define PRISM_NUMBER_BIT_BINARY_NUMBER (1 << 1) +#define PRISM_NUMBER_BIT_OCTAL_DIGIT (1 << 2) +#define PRISM_NUMBER_BIT_OCTAL_NUMBER (1 << 3) +#define PRISM_NUMBER_BIT_DECIMAL_DIGIT (1 << 4) +#define PRISM_NUMBER_BIT_DECIMAL_NUMBER (1 << 5) +#define PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT (1 << 6) +#define PRISM_NUMBER_BIT_HEXADECIMAL_NUMBER (1 << 7) + +static const uint8_t pm_byte_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 0, 0, // 0x + 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3x + 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 4x + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, // 5x + 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 6x + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +static const uint8_t pm_number_table[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 1x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 2x + 0xff, 0xff, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xf0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 3x + 0x00, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 4x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, // 5x + 0x00, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 6x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 7x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 8x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 9x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Ax + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Bx + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Cx + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Dx + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Ex + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Fx +}; + +/** + * Returns the number of characters at the start of the string that match the + * given kind. Disallows searching past the given maximum number of characters. + */ +static inline size_t +pm_strspn_char_kind(const uint8_t *string, ptrdiff_t length, uint8_t kind) { + if (length <= 0) return 0; + + size_t size = 0; + size_t maximum = (size_t) length; + + while (size < maximum && (pm_byte_table[string[size]] & kind)) size++; + return size; +} + +/** + * Returns the number of characters at the start of the string that are + * whitespace. Disallows searching past the given maximum number of characters. + */ +size_t +pm_strspn_whitespace(const uint8_t *string, ptrdiff_t length) { + return pm_strspn_char_kind(string, length, PRISM_CHAR_BIT_WHITESPACE); +} + +/** + * Returns the number of characters at the start of the string that are + * whitespace while also tracking the location of each newline. Disallows + * searching past the given maximum number of characters. 
+ */ +size_t +pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_newline_list_t *newline_list) { + if (length <= 0) return 0; + + size_t size = 0; + size_t maximum = (size_t) length; + + while (size < maximum && (pm_byte_table[string[size]] & PRISM_CHAR_BIT_WHITESPACE)) { + if (string[size] == '\n') { + pm_newline_list_append(newline_list, string + size); + } + + size++; + } + + return size; +} + +/** + * Returns the number of characters at the start of the string that are inline + * whitespace. Disallows searching past the given maximum number of characters. + */ +size_t +pm_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length) { + return pm_strspn_char_kind(string, length, PRISM_CHAR_BIT_INLINE_WHITESPACE); +} + +/** + * Returns the number of characters at the start of the string that are regexp + * options. Disallows searching past the given maximum number of characters. + */ +size_t +pm_strspn_regexp_option(const uint8_t *string, ptrdiff_t length) { + return pm_strspn_char_kind(string, length, PRISM_CHAR_BIT_REGEXP_OPTION); +} + +/** + * Returns true if the given character matches the given kind. + */ +static inline bool +pm_char_is_char_kind(const uint8_t b, uint8_t kind) { + return (pm_byte_table[b] & kind) != 0; +} + +/** + * Returns true if the given character is a whitespace character. + */ +bool +pm_char_is_whitespace(const uint8_t b) { + return pm_char_is_char_kind(b, PRISM_CHAR_BIT_WHITESPACE); +} + +/** + * Returns true if the given character is an inline whitespace character. + */ +bool +pm_char_is_inline_whitespace(const uint8_t b) { + return pm_char_is_char_kind(b, PRISM_CHAR_BIT_INLINE_WHITESPACE); +} + +/** + * Scan through the string and return the number of characters at the start of + * the string that match the given kind. Disallows searching past the given + * maximum number of characters. 
+ */ +static inline size_t +pm_strspn_number_kind(const uint8_t *string, ptrdiff_t length, uint8_t kind) { + if (length <= 0) return 0; + + size_t size = 0; + size_t maximum = (size_t) length; + + while (size < maximum && (pm_number_table[string[size]] & kind)) size++; + return size; +} + +/** + * Scan through the string and return the number of characters at the start of + * the string that match the given kind. Disallows searching past the given + * maximum number of characters. + * + * Additionally, report the location of the last invalid underscore character + * found in the string through the out invalid parameter. + */ +static inline size_t +pm_strspn_number_kind_underscores(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid, uint8_t kind) { + if (length <= 0) return 0; + + size_t size = 0; + size_t maximum = (size_t) length; + + bool underscore = false; + while (size < maximum && (pm_number_table[string[size]] & kind)) { + if (string[size] == '_') { + if (underscore) *invalid = string + size; + underscore = true; + } else { + underscore = false; + } + + size++; + } + + if (size > 0 && string[size - 1] == '_') *invalid = string + size - 1; + return size; +} + +/** + * Returns the number of characters at the start of the string that are binary + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t +pm_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) { + return pm_strspn_number_kind_underscores(string, length, invalid, PRISM_NUMBER_BIT_BINARY_NUMBER); +} + +/** + * Returns the number of characters at the start of the string that are octal + * digits or underscores. Disallows searching past the given maximum number of + * characters. 
+ * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t +pm_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) { + return pm_strspn_number_kind_underscores(string, length, invalid, PRISM_NUMBER_BIT_OCTAL_NUMBER); +} + +/** + * Returns the number of characters at the start of the string that are decimal + * digits. Disallows searching past the given maximum number of characters. + */ +size_t +pm_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length) { + return pm_strspn_number_kind(string, length, PRISM_NUMBER_BIT_DECIMAL_DIGIT); +} + +/** + * Returns the number of characters at the start of the string that are decimal + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore + */ +size_t +pm_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) { + return pm_strspn_number_kind_underscores(string, length, invalid, PRISM_NUMBER_BIT_DECIMAL_NUMBER); +} + +/** + * Returns the number of characters at the start of the string that are + * hexadecimal digits. Disallows searching past the given maximum number of + * characters. + */ +size_t +pm_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length) { + return pm_strspn_number_kind(string, length, PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT); +} + +/** + * Returns the number of characters at the start of the string that are + * hexadecimal digits or underscores. Disallows searching past the given maximum + * number of characters. 
+ * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t +pm_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) { + return pm_strspn_number_kind_underscores(string, length, invalid, PRISM_NUMBER_BIT_HEXADECIMAL_NUMBER); +} + +/** + * Returns true if the given character matches the given kind. + */ +static inline bool +pm_char_is_number_kind(const uint8_t b, uint8_t kind) { + return (pm_number_table[b] & kind) != 0; +} + +/** + * Returns true if the given character is a binary digit. + */ +bool +pm_char_is_binary_digit(const uint8_t b) { + return pm_char_is_number_kind(b, PRISM_NUMBER_BIT_BINARY_DIGIT); +} + +/** + * Returns true if the given character is an octal digit. + */ +bool +pm_char_is_octal_digit(const uint8_t b) { + return pm_char_is_number_kind(b, PRISM_NUMBER_BIT_OCTAL_DIGIT); +} + +/** + * Returns true if the given character is a decimal digit. + */ +bool +pm_char_is_decimal_digit(const uint8_t b) { + return pm_char_is_number_kind(b, PRISM_NUMBER_BIT_DECIMAL_DIGIT); +} + +/** + * Returns true if the given character is a hexadecimal digit. 
+ */ +bool +pm_char_is_hexadecimal_digit(const uint8_t b) { + return pm_char_is_number_kind(b, PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT); +} + +#undef PRISM_CHAR_BIT_WHITESPACE +#undef PRISM_CHAR_BIT_INLINE_WHITESPACE +#undef PRISM_CHAR_BIT_REGEXP_OPTION + +#undef PRISM_NUMBER_BIT_BINARY_DIGIT +#undef PRISM_NUMBER_BIT_BINARY_NUMBER +#undef PRISM_NUMBER_BIT_OCTAL_DIGIT +#undef PRISM_NUMBER_BIT_OCTAL_NUMBER +#undef PRISM_NUMBER_BIT_DECIMAL_DIGIT +#undef PRISM_NUMBER_BIT_DECIMAL_NUMBER +#undef PRISM_NUMBER_BIT_HEXADECIMAL_NUMBER +#undef PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT diff --git a/prism/util/pm_char.h b/prism/util/pm_char.h new file mode 100644 index 0000000000..deeafd6321 --- /dev/null +++ b/prism/util/pm_char.h @@ -0,0 +1,204 @@ +/** + * @file pm_char.h + * + * Functions for working with characters and strings. + */ +#ifndef PRISM_CHAR_H +#define PRISM_CHAR_H + +#include "prism/defines.h" +#include "prism/util/pm_newline_list.h" + +#include <stdbool.h> +#include <stddef.h> + +/** + * Returns the number of characters at the start of the string that are + * whitespace. Disallows searching past the given maximum number of characters. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @return The number of characters at the start of the string that are + * whitespace. + */ +size_t pm_strspn_whitespace(const uint8_t *string, ptrdiff_t length); + +/** + * Returns the number of characters at the start of the string that are + * whitespace while also tracking the location of each newline. Disallows + * searching past the given maximum number of characters. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @param newline_list The list of newlines to populate. + * @return The number of characters at the start of the string that are + * whitespace. 
+ */ +size_t pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_newline_list_t *newline_list); + +/** + * Returns the number of characters at the start of the string that are inline + * whitespace. Disallows searching past the given maximum number of characters. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @return The number of characters at the start of the string that are inline + * whitespace. + */ +size_t pm_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length); + +/** + * Returns the number of characters at the start of the string that are decimal + * digits. Disallows searching past the given maximum number of characters. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @return The number of characters at the start of the string that are decimal + * digits. + */ +size_t pm_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length); + +/** + * Returns the number of characters at the start of the string that are + * hexadecimal digits. Disallows searching past the given maximum number of + * characters. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @return The number of characters at the start of the string that are + * hexadecimal digits. + */ +size_t pm_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length); + +/** + * Returns the number of characters at the start of the string that are octal + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + * + * @param string The string to search. + * @param length The maximum number of characters to search. 
+ * @param invalid The pointer to set to the index of the first invalid + * underscore. + * @return The number of characters at the start of the string that are octal + * digits or underscores. + */ +size_t pm_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/** + * Returns the number of characters at the start of the string that are decimal + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @param invalid The pointer to set to the index of the first invalid + * underscore. + * @return The number of characters at the start of the string that are decimal + * digits or underscores. + */ +size_t pm_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/** + * Returns the number of characters at the start of the string that are + * hexadecimal digits or underscores. Disallows searching past the given maximum + * number of characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @param invalid The pointer to set to the index of the first invalid + * underscore. + * @return The number of characters at the start of the string that are + * hexadecimal digits or underscores. + */ +size_t pm_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/** + * Returns the number of characters at the start of the string that are regexp + * options. 
Disallows searching past the given maximum number of characters. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @return The number of characters at the start of the string that are regexp + * options. + */ +size_t pm_strspn_regexp_option(const uint8_t *string, ptrdiff_t length); + +/** + * Returns the number of characters at the start of the string that are binary + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @param invalid The pointer to set to the index of the first invalid + * underscore. + * @return The number of characters at the start of the string that are binary + * digits or underscores. + */ +size_t pm_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/** + * Returns true if the given character is a whitespace character. + * + * @param b The character to check. + * @return True if the given character is a whitespace character. + */ +bool pm_char_is_whitespace(const uint8_t b); + +/** + * Returns true if the given character is an inline whitespace character. + * + * @param b The character to check. + * @return True if the given character is an inline whitespace character. + */ +bool pm_char_is_inline_whitespace(const uint8_t b); + +/** + * Returns true if the given character is a binary digit. + * + * @param b The character to check. + * @return True if the given character is a binary digit. + */ +bool pm_char_is_binary_digit(const uint8_t b); + +/** + * Returns true if the given character is an octal digit. + * + * @param b The character to check. + * @return True if the given character is an octal digit. 
+ */ +bool pm_char_is_octal_digit(const uint8_t b); + +/** + * Returns true if the given character is a decimal digit. + * + * @param b The character to check. + * @return True if the given character is a decimal digit. + */ +bool pm_char_is_decimal_digit(const uint8_t b); + +/** + * Returns true if the given character is a hexadecimal digit. + * + * @param b The character to check. + * @return True if the given character is a hexadecimal digit. + */ +bool pm_char_is_hexadecimal_digit(const uint8_t b); + +#endif diff --git a/prism/util/pm_constant_pool.c b/prism/util/pm_constant_pool.c new file mode 100644 index 0000000000..922ce6a18c --- /dev/null +++ b/prism/util/pm_constant_pool.c @@ -0,0 +1,342 @@ +#include "prism/util/pm_constant_pool.h" + +/** + * Initialize a list of constant ids. + */ +void +pm_constant_id_list_init(pm_constant_id_list_t *list) { + list->ids = NULL; + list->size = 0; + list->capacity = 0; +} + +/** + * Initialize a list of constant ids with a given capacity. + */ +void +pm_constant_id_list_init_capacity(pm_constant_id_list_t *list, size_t capacity) { + if (capacity) { + list->ids = xcalloc(capacity, sizeof(pm_constant_id_t)); + if (list->ids == NULL) abort(); + } else { + list->ids = NULL; + } + + list->size = 0; + list->capacity = capacity; +} + +/** + * Append a constant id to a list of constant ids. Returns false if any + * potential reallocations fail. + */ +bool +pm_constant_id_list_append(pm_constant_id_list_t *list, pm_constant_id_t id) { + if (list->size >= list->capacity) { + list->capacity = list->capacity == 0 ? 8 : list->capacity * 2; + list->ids = (pm_constant_id_t *) xrealloc(list->ids, sizeof(pm_constant_id_t) * list->capacity); + if (list->ids == NULL) return false; + } + + list->ids[list->size++] = id; + return true; +} + +/** + * Insert a constant id into a list of constant ids at the specified index. 
+ */ +void +pm_constant_id_list_insert(pm_constant_id_list_t *list, size_t index, pm_constant_id_t id) { + assert(index < list->capacity); + assert(list->ids[index] == PM_CONSTANT_ID_UNSET); + + list->ids[index] = id; + list->size++; +} + +/** + * Checks if the current constant id list includes the given constant id. + */ +bool +pm_constant_id_list_includes(pm_constant_id_list_t *list, pm_constant_id_t id) { + for (size_t index = 0; index < list->size; index++) { + if (list->ids[index] == id) return true; + } + return false; +} + +/** + * Free the memory associated with a list of constant ids. + */ +void +pm_constant_id_list_free(pm_constant_id_list_t *list) { + if (list->ids != NULL) { + xfree(list->ids); + } +} + +/** + * A relatively simple hash function (djb2) that is used to hash strings. We are + * optimizing here for simplicity and speed. + */ +static inline uint32_t +pm_constant_pool_hash(const uint8_t *start, size_t length) { + // This is a prime number used as the initial value for the hash function. + uint32_t value = 5381; + + for (size_t index = 0; index < length; index++) { + value = ((value << 5) + value) + start[index]; + } + + return value; +} + +/** + * https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + */ +static uint32_t +next_power_of_two(uint32_t v) { + // Avoid underflow in subtraction on next line. + if (v == 0) { + // 1 is the nearest power of 2 to 0 (2^0) + return 1; + } + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; +} + +#ifndef NDEBUG +static bool +is_power_of_two(uint32_t size) { + return (size & (size - 1)) == 0; +} +#endif + +/** + * Resize a constant pool to a given capacity. 
+ */ +static inline bool +pm_constant_pool_resize(pm_constant_pool_t *pool) { + assert(is_power_of_two(pool->capacity)); + + uint32_t next_capacity = pool->capacity * 2; + if (next_capacity < pool->capacity) return false; + + const uint32_t mask = next_capacity - 1; + const size_t element_size = sizeof(pm_constant_pool_bucket_t) + sizeof(pm_constant_t); + + void *next = xcalloc(next_capacity, element_size); + if (next == NULL) return false; + + pm_constant_pool_bucket_t *next_buckets = next; + pm_constant_t *next_constants = (void *)(((char *) next) + next_capacity * sizeof(pm_constant_pool_bucket_t)); + + // For each bucket in the current constant pool, find the index in the + // next constant pool, and insert it. + for (uint32_t index = 0; index < pool->capacity; index++) { + pm_constant_pool_bucket_t *bucket = &pool->buckets[index]; + + // If an id is set on this constant, then we know we have content here. + // In this case we need to insert it into the next constant pool. + if (bucket->id != PM_CONSTANT_ID_UNSET) { + uint32_t next_index = bucket->hash & mask; + + // This implements linear scanning to find the next available slot + // in case this index is already taken. We don't need to bother + // comparing the values since we know that the hash is unique. + while (next_buckets[next_index].id != PM_CONSTANT_ID_UNSET) { + next_index = (next_index + 1) & mask; + } + + // Here we copy over the entire bucket, which includes the id so + // that they are consistent between resizes. + next_buckets[next_index] = *bucket; + } + } + + // The constants are stable with respect to hash table resizes. + memcpy(next_constants, pool->constants, pool->size * sizeof(pm_constant_t)); + + // pool->constants and pool->buckets are allocated out of the same chunk + // of memory, with the buckets coming first. 
+ xfree(pool->buckets); + pool->constants = next_constants; + pool->buckets = next_buckets; + pool->capacity = next_capacity; + return true; +} + +/** + * Initialize a new constant pool with a given capacity. + */ +bool +pm_constant_pool_init(pm_constant_pool_t *pool, uint32_t capacity) { + const uint32_t maximum = (~((uint32_t) 0)); + if (capacity >= ((maximum / 2) + 1)) return false; + + capacity = next_power_of_two(capacity); + const size_t element_size = sizeof(pm_constant_pool_bucket_t) + sizeof(pm_constant_t); + void *memory = xcalloc(capacity, element_size); + if (memory == NULL) return false; + + pool->buckets = memory; + pool->constants = (void *)(((char *)memory) + capacity * sizeof(pm_constant_pool_bucket_t)); + pool->size = 0; + pool->capacity = capacity; + return true; +} + +/** + * Return a pointer to the constant indicated by the given constant id. + */ +pm_constant_t * +pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool, pm_constant_id_t constant_id) { + assert(constant_id != PM_CONSTANT_ID_UNSET && constant_id <= pool->size); + return &pool->constants[constant_id - 1]; +} + +/** + * Find a constant in a constant pool. Returns the id of the constant, or 0 if + * the constant is not found. + */ +pm_constant_id_t +pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length) { + assert(is_power_of_two(pool->capacity)); + const uint32_t mask = pool->capacity - 1; + + uint32_t hash = pm_constant_pool_hash(start, length); + uint32_t index = hash & mask; + pm_constant_pool_bucket_t *bucket; + + while (bucket = &pool->buckets[index], bucket->id != PM_CONSTANT_ID_UNSET) { + pm_constant_t *constant = &pool->constants[bucket->id - 1]; + if ((constant->length == length) && memcmp(constant->start, start, length) == 0) { + return bucket->id; + } + + index = (index + 1) & mask; + } + + return PM_CONSTANT_ID_UNSET; +} + +/** + * Insert a constant into a constant pool and return its index in the pool. 
+ */ +static inline pm_constant_id_t +pm_constant_pool_insert(pm_constant_pool_t *pool, const uint8_t *start, size_t length, pm_constant_pool_bucket_type_t type) { + if (pool->size >= (pool->capacity / 4 * 3)) { + if (!pm_constant_pool_resize(pool)) return PM_CONSTANT_ID_UNSET; + } + + assert(is_power_of_two(pool->capacity)); + const uint32_t mask = pool->capacity - 1; + + uint32_t hash = pm_constant_pool_hash(start, length); + uint32_t index = hash & mask; + pm_constant_pool_bucket_t *bucket; + + while (bucket = &pool->buckets[index], bucket->id != PM_CONSTANT_ID_UNSET) { + // If there is a collision, then we need to check if the content is the + // same as the content we are trying to insert. If it is, then we can + // return the id of the existing constant. + pm_constant_t *constant = &pool->constants[bucket->id - 1]; + + if ((constant->length == length) && memcmp(constant->start, start, length) == 0) { + // Since we have found a match, we need to check if this is + // attempting to insert a shared or an owned constant. We want to + // prefer shared constants since they don't require allocations. + if (type == PM_CONSTANT_POOL_BUCKET_OWNED) { + // If we're attempting to insert an owned constant and we have + // an existing constant, then either way we don't want the given + // memory. Either it's duplicated with the existing constant or + // it's not necessary because we have a shared version. + xfree((void *) start); + } else if (bucket->type == PM_CONSTANT_POOL_BUCKET_OWNED) { + // If we're attempting to insert a shared constant and the + // existing constant is owned, then we can free the owned + // constant and replace it with the shared constant. + xfree((void *) constant->start); + constant->start = start; + bucket->type = (unsigned int) (type & 0x3); + } + + return bucket->id; + } + + index = (index + 1) & mask; + } + + // IDs are allocated starting at 1, since the value 0 denotes a non-existent + // constant. 
+ uint32_t id = ++pool->size; + assert(pool->size < ((uint32_t) (1 << 30))); + + *bucket = (pm_constant_pool_bucket_t) { + .id = (unsigned int) (id & 0x3fffffff), + .type = (unsigned int) (type & 0x3), + .hash = hash + }; + + pool->constants[id - 1] = (pm_constant_t) { + .start = start, + .length = length, + }; + + return id; +} + +/** + * Insert a constant into a constant pool. Returns the id of the constant, or + * PM_CONSTANT_ID_UNSET if any potential calls to resize fail. + */ +pm_constant_id_t +pm_constant_pool_insert_shared(pm_constant_pool_t *pool, const uint8_t *start, size_t length) { + return pm_constant_pool_insert(pool, start, length, PM_CONSTANT_POOL_BUCKET_DEFAULT); +} + +/** + * Insert a constant into a constant pool from memory that is now owned by the + * constant pool. Returns the id of the constant, or PM_CONSTANT_ID_UNSET if any + * potential calls to resize fail. + */ +pm_constant_id_t +pm_constant_pool_insert_owned(pm_constant_pool_t *pool, uint8_t *start, size_t length) { + return pm_constant_pool_insert(pool, start, length, PM_CONSTANT_POOL_BUCKET_OWNED); +} + +/** + * Insert a constant into a constant pool from memory that is constant. Returns + * the id of the constant, or PM_CONSTANT_ID_UNSET if any potential calls to + * resize fail. + */ +pm_constant_id_t +pm_constant_pool_insert_constant(pm_constant_pool_t *pool, const uint8_t *start, size_t length) { + return pm_constant_pool_insert(pool, start, length, PM_CONSTANT_POOL_BUCKET_CONSTANT); +} + +/** + * Free the memory associated with a constant pool. + */ +void +pm_constant_pool_free(pm_constant_pool_t *pool) { + // For each constant in the current constant pool, free the contents if the + // contents are owned. + for (uint32_t index = 0; index < pool->capacity; index++) { + pm_constant_pool_bucket_t *bucket = &pool->buckets[index]; + + // If an id is set on this constant, then we know we have content here. 
+ if (bucket->id != PM_CONSTANT_ID_UNSET && bucket->type == PM_CONSTANT_POOL_BUCKET_OWNED) { + pm_constant_t *constant = &pool->constants[bucket->id - 1]; + xfree((void *) constant->start); + } + } + + xfree(pool->buckets); +} diff --git a/prism/util/pm_constant_pool.h b/prism/util/pm_constant_pool.h new file mode 100644 index 0000000000..6df23f8f50 --- /dev/null +++ b/prism/util/pm_constant_pool.h @@ -0,0 +1,218 @@ +/** + * @file pm_constant_pool.h + * + * A data structure that stores a set of strings. + * + * Each string is assigned a unique id, which can be used to compare strings for + * equality. This comparison ends up being much faster than strcmp, since it + * only requires a single integer comparison. + */ +#ifndef PRISM_CONSTANT_POOL_H +#define PRISM_CONSTANT_POOL_H + +#include "prism/defines.h" + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +/** + * When we allocate constants into the pool, we reserve 0 to mean that the slot + * is not yet filled. This constant is reused in other places to indicate the + * lack of a constant id. + */ +#define PM_CONSTANT_ID_UNSET 0 + +/** + * A constant id is a unique identifier for a constant in the constant pool. + */ +typedef uint32_t pm_constant_id_t; + +/** + * A list of constant IDs. Usually used to represent a set of locals. + */ +typedef struct { + /** The number of constant ids in the list. */ + size_t size; + + /** The number of constant ids that have been allocated in the list. */ + size_t capacity; + + /** The constant ids in the list. */ + pm_constant_id_t *ids; +} pm_constant_id_list_t; + +/** + * Initialize a list of constant ids. + * + * @param list The list to initialize. + */ +void pm_constant_id_list_init(pm_constant_id_list_t *list); + +/** + * Initialize a list of constant ids with a given capacity. + * + * @param list The list to initialize. + * @param capacity The initial capacity of the list. 
+ */ +void pm_constant_id_list_init_capacity(pm_constant_id_list_t *list, size_t capacity); + +/** + * Append a constant id to a list of constant ids. Returns false if any + * potential reallocations fail. + * + * @param list The list to append to. + * @param id The id to append. + * @return Whether the append succeeded. + */ +bool pm_constant_id_list_append(pm_constant_id_list_t *list, pm_constant_id_t id); + +/** + * Insert a constant id into a list of constant ids at the specified index. + * + * @param list The list to insert into. + * @param index The index at which to insert. + * @param id The id to insert. + */ +void pm_constant_id_list_insert(pm_constant_id_list_t *list, size_t index, pm_constant_id_t id); + +/** + * Checks if the current constant id list includes the given constant id. + * + * @param list The list to check. + * @param id The id to check for. + * @return Whether the list includes the given id. + */ +bool pm_constant_id_list_includes(pm_constant_id_list_t *list, pm_constant_id_t id); + +/** + * Free the memory associated with a list of constant ids. + * + * @param list The list to free. + */ +void pm_constant_id_list_free(pm_constant_id_list_t *list); + +/** + * The type of bucket in the constant pool hash map. This determines how the + * bucket should be freed. + */ +typedef unsigned int pm_constant_pool_bucket_type_t; + +/** By default, each constant is a slice of the source. */ +static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_DEFAULT = 0; + +/** An owned constant is one for which memory has been allocated. */ +static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_OWNED = 1; + +/** A constant constant is known at compile time. */ +static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_CONSTANT = 2; + +/** A bucket in the hash map. */ +typedef struct { + /** The incremental ID used for indexing back into the pool. 
*/ + unsigned int id: 30; + + /** The type of the bucket, which determines how to free it. */ + pm_constant_pool_bucket_type_t type: 2; + + /** The hash of the bucket. */ + uint32_t hash; +} pm_constant_pool_bucket_t; + +/** A constant in the pool which effectively stores a string. */ +typedef struct { + /** A pointer to the start of the string. */ + const uint8_t *start; + + /** The length of the string. */ + size_t length; +} pm_constant_t; + +/** The overall constant pool, which stores constants found while parsing. */ +typedef struct { + /** The buckets in the hash map. */ + pm_constant_pool_bucket_t *buckets; + + /** The constants that are stored in the buckets. */ + pm_constant_t *constants; + + /** The number of buckets in the hash map. */ + uint32_t size; + + /** The number of buckets that have been allocated in the hash map. */ + uint32_t capacity; +} pm_constant_pool_t; + +/** + * Initialize a new constant pool with a given capacity. + * + * @param pool The pool to initialize. + * @param capacity The initial capacity of the pool. + * @return Whether the initialization succeeded. + */ +bool pm_constant_pool_init(pm_constant_pool_t *pool, uint32_t capacity); + +/** + * Return a pointer to the constant indicated by the given constant id. + * + * @param pool The pool to get the constant from. + * @param constant_id The id of the constant to get. + * @return A pointer to the constant. + */ +pm_constant_t * pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool, pm_constant_id_t constant_id); + +/** + * Find a constant in a constant pool. Returns the id of the constant, or 0 if + * the constant is not found. + * + * @param pool The pool to find the constant in. + * @param start A pointer to the start of the constant. + * @param length The length of the constant. + * @return The id of the constant. 
+ */ +pm_constant_id_t pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length); + +/** + * Insert a constant into a constant pool that is a slice of a source string. + * Returns the id of the constant, or 0 if any potential calls to resize fail. + * + * @param pool The pool to insert the constant into. + * @param start A pointer to the start of the constant. + * @param length The length of the constant. + * @return The id of the constant. + */ +pm_constant_id_t pm_constant_pool_insert_shared(pm_constant_pool_t *pool, const uint8_t *start, size_t length); + +/** + * Insert a constant into a constant pool from memory that is now owned by the + * constant pool. Returns the id of the constant, or 0 if any potential calls to + * resize fail. + * + * @param pool The pool to insert the constant into. + * @param start A pointer to the start of the constant. + * @param length The length of the constant. + * @return The id of the constant. + */ +pm_constant_id_t pm_constant_pool_insert_owned(pm_constant_pool_t *pool, uint8_t *start, size_t length); + +/** + * Insert a constant into a constant pool from memory that is constant. Returns + * the id of the constant, or 0 if any potential calls to resize fail. + * + * @param pool The pool to insert the constant into. + * @param start A pointer to the start of the constant. + * @param length The length of the constant. + * @return The id of the constant. + */ +pm_constant_id_t pm_constant_pool_insert_constant(pm_constant_pool_t *pool, const uint8_t *start, size_t length); + +/** + * Free the memory associated with a constant pool. + * + * @param pool The pool to free. 
+ */ +void pm_constant_pool_free(pm_constant_pool_t *pool); + +#endif diff --git a/prism/util/pm_integer.c b/prism/util/pm_integer.c new file mode 100644 index 0000000000..4170ecc58d --- /dev/null +++ b/prism/util/pm_integer.c @@ -0,0 +1,670 @@ +#include "prism/util/pm_integer.h" + +/** + * Pull out the length and values from the integer, regardless of the form in + * which the length/values are stored. + */ +#define INTEGER_EXTRACT(integer, length_variable, values_variable) \ + if ((integer)->values == NULL) { \ + length_variable = 1; \ + values_variable = &(integer)->value; \ + } else { \ + length_variable = (integer)->length; \ + values_variable = (integer)->values; \ + } + +/** + * Adds two positive pm_integer_t with the given base. + * Return pm_integer_t with values allocated. Not normalized. + */ +static void +big_add(pm_integer_t *destination, pm_integer_t *left, pm_integer_t *right, uint64_t base) { + size_t left_length; + uint32_t *left_values; + INTEGER_EXTRACT(left, left_length, left_values) + + size_t right_length; + uint32_t *right_values; + INTEGER_EXTRACT(right, right_length, right_values) + + size_t length = left_length < right_length ? right_length : left_length; + uint32_t *values = (uint32_t *) xmalloc(sizeof(uint32_t) * (length + 1)); + if (values == NULL) return; + + uint64_t carry = 0; + for (size_t index = 0; index < length; index++) { + uint64_t sum = carry + (index < left_length ? left_values[index] : 0) + (index < right_length ? right_values[index] : 0); + values[index] = (uint32_t) (sum % base); + carry = sum / base; + } + + if (carry > 0) { + values[length] = (uint32_t) carry; + length++; + } + + *destination = (pm_integer_t) { length, values, 0, false }; +} + +/** + * Internal use for karatsuba_multiply. Calculates `a - b - c` with the given + * base. Assume a, b, c, a - b - c all to be positive. + * Return pm_integer_t with values allocated. Not normalized. 
+ */ +static void +big_sub2(pm_integer_t *destination, pm_integer_t *a, pm_integer_t *b, pm_integer_t *c, uint64_t base) { + size_t a_length; + uint32_t *a_values; + INTEGER_EXTRACT(a, a_length, a_values) + + size_t b_length; + uint32_t *b_values; + INTEGER_EXTRACT(b, b_length, b_values) + + size_t c_length; + uint32_t *c_values; + INTEGER_EXTRACT(c, c_length, c_values) + + uint32_t *values = (uint32_t*) xmalloc(sizeof(uint32_t) * a_length); + int64_t carry = 0; + + for (size_t index = 0; index < a_length; index++) { + int64_t sub = ( + carry + + a_values[index] - + (index < b_length ? b_values[index] : 0) - + (index < c_length ? c_values[index] : 0) + ); + + if (sub >= 0) { + values[index] = (uint32_t) sub; + carry = 0; + } else { + sub += 2 * (int64_t) base; + values[index] = (uint32_t) ((uint64_t) sub % base); + carry = sub / (int64_t) base - 2; + } + } + + while (a_length > 1 && values[a_length - 1] == 0) a_length--; + *destination = (pm_integer_t) { a_length, values, 0, false }; +} + +/** + * Multiply two positive integers with the given base using karatsuba algorithm. + * Return pm_integer_t with values allocated. Not normalized. 
+ */ +static void +karatsuba_multiply(pm_integer_t *destination, pm_integer_t *left, pm_integer_t *right, uint64_t base) { + size_t left_length; + uint32_t *left_values; + INTEGER_EXTRACT(left, left_length, left_values) + + size_t right_length; + uint32_t *right_values; + INTEGER_EXTRACT(right, right_length, right_values) + + if (left_length > right_length) { + size_t temporary_length = left_length; + left_length = right_length; + right_length = temporary_length; + + uint32_t *temporary_values = left_values; + left_values = right_values; + right_values = temporary_values; + } + + if (left_length <= 10) { + size_t length = left_length + right_length; + uint32_t *values = (uint32_t *) xcalloc(length, sizeof(uint32_t)); + if (values == NULL) return; + + for (size_t left_index = 0; left_index < left_length; left_index++) { + uint32_t carry = 0; + for (size_t right_index = 0; right_index < right_length; right_index++) { + uint64_t product = (uint64_t) left_values[left_index] * right_values[right_index] + values[left_index + right_index] + carry; + values[left_index + right_index] = (uint32_t) (product % base); + carry = (uint32_t) (product / base); + } + values[left_index + right_length] = carry; + } + + while (length > 1 && values[length - 1] == 0) length--; + *destination = (pm_integer_t) { length, values, 0, false }; + return; + } + + if (left_length * 2 <= right_length) { + uint32_t *values = (uint32_t *) xcalloc(left_length + right_length, sizeof(uint32_t)); + + for (size_t start_offset = 0; start_offset < right_length; start_offset += left_length) { + size_t end_offset = start_offset + left_length; + if (end_offset > right_length) end_offset = right_length; + + pm_integer_t sliced_left = { + .length = left_length, + .values = left_values, + .value = 0, + .negative = false + }; + + pm_integer_t sliced_right = { + .length = end_offset - start_offset, + .values = right_values + start_offset, + .value = 0, + .negative = false + }; + + pm_integer_t product; + 
karatsuba_multiply(&product, &sliced_left, &sliced_right, base); + + uint32_t carry = 0; + for (size_t index = 0; index < product.length; index++) { + uint64_t sum = (uint64_t) values[start_offset + index] + product.values[index] + carry; + values[start_offset + index] = (uint32_t) (sum % base); + carry = (uint32_t) (sum / base); + } + + if (carry > 0) values[start_offset + product.length] += carry; + pm_integer_free(&product); + } + + *destination = (pm_integer_t) { left_length + right_length, values, 0, false }; + return; + } + + size_t half = left_length / 2; + pm_integer_t x0 = { half, left_values, 0, false }; + pm_integer_t x1 = { left_length - half, left_values + half, 0, false }; + pm_integer_t y0 = { half, right_values, 0, false }; + pm_integer_t y1 = { right_length - half, right_values + half, 0, false }; + + pm_integer_t z0 = { 0 }; + karatsuba_multiply(&z0, &x0, &y0, base); + + pm_integer_t z2 = { 0 }; + karatsuba_multiply(&z2, &x1, &y1, base); + + // For simplicity to avoid considering negative values, + // use `z1 = (x0 + x1) * (y0 + y1) - z0 - z2` instead of original karatsuba algorithm. 
+ pm_integer_t x01 = { 0 }; + big_add(&x01, &x0, &x1, base); + + pm_integer_t y01 = { 0 }; + big_add(&y01, &y0, &y1, base); + + pm_integer_t xy = { 0 }; + karatsuba_multiply(&xy, &x01, &y01, base); + + pm_integer_t z1; + big_sub2(&z1, &xy, &z0, &z2, base); + + size_t length = left_length + right_length; + uint32_t *values = (uint32_t*) xcalloc(length, sizeof(uint32_t)); + + assert(z0.values != NULL); + memcpy(values, z0.values, sizeof(uint32_t) * z0.length); + + assert(z2.values != NULL); + memcpy(values + 2 * half, z2.values, sizeof(uint32_t) * z2.length); + + uint32_t carry = 0; + for(size_t index = 0; index < z1.length; index++) { + uint64_t sum = (uint64_t) carry + values[index + half] + z1.values[index]; + values[index + half] = (uint32_t) (sum % base); + carry = (uint32_t) (sum / base); + } + + for(size_t index = half + z1.length; carry > 0; index++) { + uint64_t sum = (uint64_t) carry + values[index]; + values[index] = (uint32_t) (sum % base); + carry = (uint32_t) (sum / base); + } + + while (length > 1 && values[length - 1] == 0) length--; + pm_integer_free(&z0); + pm_integer_free(&z1); + pm_integer_free(&z2); + pm_integer_free(&x01); + pm_integer_free(&y01); + pm_integer_free(&xy); + + *destination = (pm_integer_t) { length, values, 0, false }; +} + +/** + * The values of a hexadecimal digit, where the index is the ASCII character. + * Note that there's an odd exception here where _ is mapped to 0. This is + * because it's possible for us to end up trying to parse a number that has + * already had an error attached to it, and we want to provide _something_ to + * the user. 
/** Sixteen consecutive table entries that are not valid digits. */
#define PM_INTEGER_INVALID_DIGIT_ROW \
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

/**
 * Lookup table from a byte to its value as a hexadecimal digit, or -1 when
 * the byte is not a digit. The underscore (0x5F) is deliberately mapped to 0:
 * a literal that already had an error reported may still reach the parser,
 * and mapping it to 0 lets parsing produce some value instead of failing.
 */
static const int8_t pm_integer_parse_digit_values[256] = {
    PM_INTEGER_INVALID_DIGIT_ROW,                                   // 0x00 - 0x0F
    PM_INTEGER_INVALID_DIGIT_ROW,                                   // 0x10 - 0x1F
    PM_INTEGER_INVALID_DIGIT_ROW,                                   // 0x20 - 0x2F
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1, // 0x30 - 0x3F: '0'..'9'
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x40 - 0x4F: 'A'..'F'
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0, // 0x50 - 0x5F: '_'
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x60 - 0x6F: 'a'..'f'
    PM_INTEGER_INVALID_DIGIT_ROW,                                   // 0x70 - 0x7F
    PM_INTEGER_INVALID_DIGIT_ROW,                                   // 0x80 - 0x8F
    PM_INTEGER_INVALID_DIGIT_ROW,                                   // 0x90 - 0x9F
    PM_INTEGER_INVALID_DIGIT_ROW,                                   // 0xA0 - 0xAF
    PM_INTEGER_INVALID_DIGIT_ROW,                                   // 0xB0 - 0xBF
    PM_INTEGER_INVALID_DIGIT_ROW,                                   // 0xC0 - 0xCF
    PM_INTEGER_INVALID_DIGIT_ROW,                                   // 0xD0 - 0xDF
    PM_INTEGER_INVALID_DIGIT_ROW,                                   // 0xE0 - 0xEF
    PM_INTEGER_INVALID_DIGIT_ROW,                                   // 0xF0 - 0xFF
};

#undef PM_INTEGER_INVALID_DIGIT_ROW

/**
 * Look up the numeric value of a single digit character. Passing a byte that
 * is not a digit (table entry -1) trips the assertion.
 */
static uint8_t
pm_integer_parse_digit(const uint8_t character) {
    const int8_t digit = pm_integer_parse_digit_values[character];
    assert(digit != -1 && "invalid digit");

    return (uint8_t) digit;
}

/**
 * Create a pm_integer_t from uint64_t with the given base. It is assumed that
 * the memory for the pm_integer_t pointer has been zeroed.
 */
+ */ +static void +pm_integer_from_uint64(pm_integer_t *integer, uint64_t value, uint64_t base) { + if (value < base) { + integer->value = (uint32_t) value; + return; + } + + size_t length = 0; + uint64_t length_value = value; + while (length_value > 0) { + length++; + length_value /= base; + } + + uint32_t *values = (uint32_t *) xmalloc(sizeof(uint32_t) * length); + if (values == NULL) return; + + for (size_t value_index = 0; value_index < length; value_index++) { + values[value_index] = (uint32_t) (value % base); + value /= base; + } + + integer->length = length; + integer->values = values; +} + +/** + * Normalize pm_integer_t. + * Heading zero values will be removed. If the integer fits into uint32_t, + * values is set to NULL, length is set to 0, and value field will be used. + */ +static void +pm_integer_normalize(pm_integer_t *integer) { + if (integer->values == NULL) { + return; + } + + while (integer->length > 1 && integer->values[integer->length - 1] == 0) { + integer->length--; + } + + if (integer->length > 1) { + return; + } + + uint32_t value = integer->values[0]; + bool negative = integer->negative && value != 0; + + pm_integer_free(integer); + *integer = (pm_integer_t) { .values = NULL, .value = value, .length = 0, .negative = negative }; +} + +/** + * Convert base of the integer. + * In practice, it converts 10**9 to 1<<32 or 1<<32 to 10**9. + */ +static void +pm_integer_convert_base(pm_integer_t *destination, const pm_integer_t *source, uint64_t base_from, uint64_t base_to) { + size_t source_length; + const uint32_t *source_values; + INTEGER_EXTRACT(source, source_length, source_values) + + size_t bigints_length = (source_length + 1) / 2; + assert(bigints_length > 0); + + pm_integer_t *bigints = (pm_integer_t *) xcalloc(bigints_length, sizeof(pm_integer_t)); + if (bigints == NULL) return; + + for (size_t index = 0; index < source_length; index += 2) { + uint64_t value = source_values[index] + base_from * (index + 1 < source_length ? 
source_values[index + 1] : 0); + pm_integer_from_uint64(&bigints[index / 2], value, base_to); + } + + pm_integer_t base = { 0 }; + pm_integer_from_uint64(&base, base_from, base_to); + + while (bigints_length > 1) { + pm_integer_t next_base; + karatsuba_multiply(&next_base, &base, &base, base_to); + + pm_integer_free(&base); + base = next_base; + + size_t next_length = (bigints_length + 1) / 2; + pm_integer_t *next_bigints = (pm_integer_t *) xcalloc(next_length, sizeof(pm_integer_t)); + + for (size_t bigints_index = 0; bigints_index < bigints_length; bigints_index += 2) { + if (bigints_index + 1 == bigints_length) { + next_bigints[bigints_index / 2] = bigints[bigints_index]; + } else { + pm_integer_t multiplied = { 0 }; + karatsuba_multiply(&multiplied, &base, &bigints[bigints_index + 1], base_to); + + big_add(&next_bigints[bigints_index / 2], &bigints[bigints_index], &multiplied, base_to); + pm_integer_free(&bigints[bigints_index]); + pm_integer_free(&bigints[bigints_index + 1]); + pm_integer_free(&multiplied); + } + } + + xfree(bigints); + bigints = next_bigints; + bigints_length = next_length; + } + + *destination = bigints[0]; + destination->negative = source->negative; + pm_integer_normalize(destination); + + xfree(bigints); + pm_integer_free(&base); +} + +#undef INTEGER_EXTRACT + +/** + * Convert digits to integer with the given power-of-two base. 
+ */ +static void +pm_integer_parse_powof2(pm_integer_t *integer, uint32_t base, const uint8_t *digits, size_t digits_length) { + size_t bit = 1; + while (base > (uint32_t) (1 << bit)) bit++; + + size_t length = (digits_length * bit + 31) / 32; + uint32_t *values = (uint32_t *) xcalloc(length, sizeof(uint32_t)); + + for (size_t digit_index = 0; digit_index < digits_length; digit_index++) { + size_t bit_position = bit * (digits_length - digit_index - 1); + uint32_t value = digits[digit_index]; + + size_t index = bit_position / 32; + size_t shift = bit_position % 32; + + values[index] |= value << shift; + if (32 - shift < bit) values[index + 1] |= value >> (32 - shift); + } + + while (length > 1 && values[length - 1] == 0) length--; + *integer = (pm_integer_t) { .length = length, .values = values, .value = 0, .negative = false }; + pm_integer_normalize(integer); +} + +/** + * Convert decimal digits to pm_integer_t. + */ +static void +pm_integer_parse_decimal(pm_integer_t *integer, const uint8_t *digits, size_t digits_length) { + const size_t batch = 9; + size_t length = (digits_length + batch - 1) / batch; + + uint32_t *values = (uint32_t *) xcalloc(length, sizeof(uint32_t)); + uint32_t value = 0; + + for (size_t digits_index = 0; digits_index < digits_length; digits_index++) { + value = value * 10 + digits[digits_index]; + + size_t reverse_index = digits_length - digits_index - 1; + if (reverse_index % batch == 0) { + values[reverse_index / batch] = value; + value = 0; + } + } + + // Convert base from 10**9 to 1<<32. + pm_integer_convert_base(integer, &((pm_integer_t) { .length = length, .values = values, .value = 0, .negative = false }), 1000000000, ((uint64_t) 1 << 32)); + xfree(values); +} + +/** + * Parse a large integer from a string that does not fit into uint32_t. + */ +static void +pm_integer_parse_big(pm_integer_t *integer, uint32_t multiplier, const uint8_t *start, const uint8_t *end) { + // Allocate an array to store digits. 
+ uint8_t *digits = xmalloc(sizeof(uint8_t) * (size_t) (end - start)); + size_t digits_length = 0; + + for (; start < end; start++) { + if (*start == '_') continue; + digits[digits_length++] = pm_integer_parse_digit(*start); + } + + // Construct pm_integer_t from the digits. + if (multiplier == 10) { + pm_integer_parse_decimal(integer, digits, digits_length); + } else { + pm_integer_parse_powof2(integer, multiplier, digits, digits_length); + } + + xfree(digits); +} + +/** + * Parse an integer from a string. This assumes that the format of the integer + * has already been validated, as internal validation checks are not performed + * here. + */ +void +pm_integer_parse(pm_integer_t *integer, pm_integer_base_t base, const uint8_t *start, const uint8_t *end) { + // Ignore unary +. Unary - is parsed differently and will not end up here. + // Instead, it will modify the parsed integer later. + if (*start == '+') start++; + + // Determine the multiplier from the base, and skip past any prefixes. 
+ uint32_t multiplier = 10; + switch (base) { + case PM_INTEGER_BASE_DEFAULT: + while (*start == '0') start++; // 01 -> 1 + break; + case PM_INTEGER_BASE_BINARY: + start += 2; // 0b + multiplier = 2; + break; + case PM_INTEGER_BASE_OCTAL: + start++; // 0 + if (*start == '_' || *start == 'o' || *start == 'O') start++; // o + multiplier = 8; + break; + case PM_INTEGER_BASE_DECIMAL: + if (*start == '0' && (end - start) > 1) start += 2; // 0d + break; + case PM_INTEGER_BASE_HEXADECIMAL: + start += 2; // 0x + multiplier = 16; + break; + case PM_INTEGER_BASE_UNKNOWN: + if (*start == '0' && (end - start) > 1) { + switch (start[1]) { + case '_': start += 2; multiplier = 8; break; + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': start++; multiplier = 8; break; + case 'b': case 'B': start += 2; multiplier = 2; break; + case 'o': case 'O': start += 2; multiplier = 8; break; + case 'd': case 'D': start += 2; break; + case 'x': case 'X': start += 2; multiplier = 16; break; + default: assert(false && "unreachable"); break; + } + } + break; + } + + // It's possible that we've consumed everything at this point if there is an + // invalid integer. If this is the case, we'll just return 0. + if (start >= end) return; + + const uint8_t *cursor = start; + uint64_t value = (uint64_t) pm_integer_parse_digit(*cursor++); + + for (; cursor < end; cursor++) { + if (*cursor == '_') continue; + value = value * multiplier + (uint64_t) pm_integer_parse_digit(*cursor); + + if (value > UINT32_MAX) { + // If the integer is too large to fit into a single uint32_t, then + // we'll parse it as a big integer. + pm_integer_parse_big(integer, multiplier, start, end); + return; + } + } + + integer->value = (uint32_t) value; +} + +/** + * Compare two integers. This function returns -1 if the left integer is less + * than the right integer, 0 if they are equal, and 1 if the left integer is + * greater than the right integer. 
+ */ +int +pm_integer_compare(const pm_integer_t *left, const pm_integer_t *right) { + if (left->negative != right->negative) return left->negative ? -1 : 1; + int negative = left->negative ? -1 : 1; + + if (left->values == NULL && right->values == NULL) { + if (left->value < right->value) return -1 * negative; + if (left->value > right->value) return 1 * negative; + return 0; + } + + if (left->values == NULL || left->length < right->length) return -1 * negative; + if (right->values == NULL || left->length > right->length) return 1 * negative; + + for (size_t index = 0; index < left->length; index++) { + size_t value_index = left->length - index - 1; + uint32_t left_value = left->values[value_index]; + uint32_t right_value = right->values[value_index]; + + if (left_value < right_value) return -1 * negative; + if (left_value > right_value) return 1 * negative; + } + + return 0; +} + +/** + * Reduce a ratio of integers to its simplest form. + */ +void pm_integers_reduce(pm_integer_t *numerator, pm_integer_t *denominator) { + // If either the numerator or denominator do not fit into a 32-bit integer, + // then this function is a no-op. In the future, we may consider reducing + // even the larger numbers, but for now we're going to keep it simple. + if ( + // If the numerator doesn't fit into a 32-bit integer, return early. + numerator->length != 0 || + // If the denominator doesn't fit into a 32-bit integer, return early. + denominator->length != 0 || + // If the numerator is 0, then return early. + numerator->value == 0 || + // If the denominator is 1, then return early. + denominator->value == 1 + ) return; + + // Find the greatest common divisor of the numerator and denominator. + uint32_t divisor = numerator->value; + uint32_t remainder = denominator->value; + + while (remainder != 0) { + uint32_t temporary = remainder; + remainder = divisor % remainder; + divisor = temporary; + } + + // Divide the numerator and denominator by the greatest common divisor. 
+ numerator->value /= divisor; + denominator->value /= divisor; +} + +/** + * Convert an integer to a decimal string. + */ +PRISM_EXPORTED_FUNCTION void +pm_integer_string(pm_buffer_t *buffer, const pm_integer_t *integer) { + if (integer->negative) { + pm_buffer_append_byte(buffer, '-'); + } + + // If the integer fits into a single uint32_t, then we can just append the + // value directly to the buffer. + if (integer->values == NULL) { + pm_buffer_append_format(buffer, "%" PRIu32, integer->value); + return; + } + + // If the integer is two uint32_t values, then we can | them together and + // append the result to the buffer. + if (integer->length == 2) { + const uint64_t value = ((uint64_t) integer->values[0]) | ((uint64_t) integer->values[1] << 32); + pm_buffer_append_format(buffer, "%" PRIu64, value); + return; + } + + // Otherwise, first we'll convert the base from 1<<32 to 10**9. + pm_integer_t converted = { 0 }; + pm_integer_convert_base(&converted, integer, (uint64_t) 1 << 32, 1000000000); + + if (converted.values == NULL) { + pm_buffer_append_format(buffer, "%" PRIu32, converted.value); + pm_integer_free(&converted); + return; + } + + // Allocate a buffer that we'll copy the decimal digits into. + size_t digits_length = converted.length * 9; + char *digits = xcalloc(digits_length, sizeof(char)); + if (digits == NULL) return; + + // Pack bigdecimal to digits. + for (size_t value_index = 0; value_index < converted.length; value_index++) { + uint32_t value = converted.values[value_index]; + + for (size_t digit_index = 0; digit_index < 9; digit_index++) { + digits[digits_length - 9 * value_index - digit_index - 1] = (char) ('0' + value % 10); + value /= 10; + } + } + + size_t start_offset = 0; + while (start_offset < digits_length - 1 && digits[start_offset] == '0') start_offset++; + + // Finally, append the string to the buffer and free the digits. 
+ pm_buffer_append_string(buffer, digits + start_offset, digits_length - start_offset); + xfree(digits); + pm_integer_free(&converted); +} + +/** + * Free the internal memory of an integer. This memory will only be allocated if + * the integer exceeds the size of a single uint32_t. + */ +PRISM_EXPORTED_FUNCTION void +pm_integer_free(pm_integer_t *integer) { + if (integer->values) { + xfree(integer->values); + } +} diff --git a/prism/util/pm_integer.h b/prism/util/pm_integer.h new file mode 100644 index 0000000000..304665e620 --- /dev/null +++ b/prism/util/pm_integer.h @@ -0,0 +1,130 @@ +/** + * @file pm_integer.h + * + * This module provides functions for working with arbitrary-sized integers. + */ +#ifndef PRISM_NUMBER_H +#define PRISM_NUMBER_H + +#include "prism/defines.h" +#include "prism/util/pm_buffer.h" + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> + +/** + * A structure represents an arbitrary-sized integer. + */ +typedef struct { + /** + * The number of allocated values. length is set to 0 if the integer fits + * into uint32_t. + */ + size_t length; + + /** + * List of 32-bit integers. Set to NULL if the integer fits into uint32_t. + */ + uint32_t *values; + + /** + * Embedded value for small integer. This value is set to 0 if the value + * does not fit into uint32_t. + */ + uint32_t value; + + /** + * Whether or not the integer is negative. It is stored this way so that a + * zeroed pm_integer_t is always positive zero. + */ + bool negative; +} pm_integer_t; + +/** + * An enum controlling the base of an integer. It is expected that the base is + * already known before parsing the integer, even though it could be derived + * from the string itself. + */ +typedef enum { + /** The default decimal base, with no prefix. Leading 0s will be ignored. */ + PM_INTEGER_BASE_DEFAULT, + + /** The binary base, indicated by a 0b or 0B prefix. */ + PM_INTEGER_BASE_BINARY, + + /** The octal base, indicated by a 0, 0o, or 0O prefix. 
*/ + PM_INTEGER_BASE_OCTAL, + + /** The decimal base, indicated by a 0d, 0D, or empty prefix. */ + PM_INTEGER_BASE_DECIMAL, + + /** The hexadecimal base, indicated by a 0x or 0X prefix. */ + PM_INTEGER_BASE_HEXADECIMAL, + + /** + * An unknown base, in which case pm_integer_parse will derive it based on + * the content of the string. This is less efficient and does more + * comparisons, so if callers know the base ahead of time, they should use + * that instead. + */ + PM_INTEGER_BASE_UNKNOWN +} pm_integer_base_t; + +/** + * Parse an integer from a string. This assumes that the format of the integer + * has already been validated, as internal validation checks are not performed + * here. + * + * @param integer The integer to parse into. + * @param base The base of the integer. + * @param start The start of the string. + * @param end The end of the string. + */ +void pm_integer_parse(pm_integer_t *integer, pm_integer_base_t base, const uint8_t *start, const uint8_t *end); + +/** + * Compare two integers. This function returns -1 if the left integer is less + * than the right integer, 0 if they are equal, and 1 if the left integer is + * greater than the right integer. + * + * @param left The left integer to compare. + * @param right The right integer to compare. + * @return The result of the comparison. + */ +int pm_integer_compare(const pm_integer_t *left, const pm_integer_t *right); + +/** + * Reduce a ratio of integers to its simplest form. + * + * If either the numerator or denominator do not fit into a 32-bit integer, then + * this function is a no-op. In the future, we may consider reducing even the + * larger numbers, but for now we're going to keep it simple. + * + * @param numerator The numerator of the ratio. + * @param denominator The denominator of the ratio. + */ +void pm_integers_reduce(pm_integer_t *numerator, pm_integer_t *denominator); + +/** + * Convert an integer to a decimal string. + * + * @param buffer The buffer to append the string to. 
+ * @param integer The integer to convert to a string. + * + * \public \memberof pm_integer_t + */ +PRISM_EXPORTED_FUNCTION void pm_integer_string(pm_buffer_t *buffer, const pm_integer_t *integer); + +/** + * Free the internal memory of an integer. This memory will only be allocated if + * the integer exceeds the size of a single node in the linked list. + * + * @param integer The integer to free. + * + * \public \memberof pm_integer_t + */ +PRISM_EXPORTED_FUNCTION void pm_integer_free(pm_integer_t *integer); + +#endif diff --git a/prism/util/pm_list.c b/prism/util/pm_list.c new file mode 100644 index 0000000000..ad2294cd60 --- /dev/null +++ b/prism/util/pm_list.c @@ -0,0 +1,49 @@ +#include "prism/util/pm_list.h" + +/** + * Returns true if the given list is empty. + */ +PRISM_EXPORTED_FUNCTION bool +pm_list_empty_p(pm_list_t *list) { + return list->head == NULL; +} + +/** + * Returns the size of the list. + */ +PRISM_EXPORTED_FUNCTION size_t +pm_list_size(pm_list_t *list) { + return list->size; +} + +/** + * Append a node to the given list. + */ +void +pm_list_append(pm_list_t *list, pm_list_node_t *node) { + if (list->head == NULL) { + list->head = node; + } else { + list->tail->next = node; + } + + list->tail = node; + list->size++; +} + +/** + * Deallocate the internal state of the given list. + */ +PRISM_EXPORTED_FUNCTION void +pm_list_free(pm_list_t *list) { + pm_list_node_t *node = list->head; + pm_list_node_t *next; + + while (node != NULL) { + next = node->next; + xfree(node); + node = next; + } + + list->size = 0; +} diff --git a/prism/util/pm_list.h b/prism/util/pm_list.h new file mode 100644 index 0000000000..f544bb2943 --- /dev/null +++ b/prism/util/pm_list.h @@ -0,0 +1,103 @@ +/** + * @file pm_list.h + * + * An abstract linked list. 
+ */ +#ifndef PRISM_LIST_H +#define PRISM_LIST_H + +#include "prism/defines.h" + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> + +/** + * This struct represents an abstract linked list that provides common + * functionality. It is meant to be used any time a linked list is necessary to + * store data. + * + * The linked list itself operates off a set of pointers. Because the pointers + * are not necessarily sequential, they can be of any size. We use this fact to + * allow the consumer of this linked list to extend the node struct to include + * any data they want. This is done by using the pm_list_node_t as the first + * member of the struct. + * + * For example, if we want to store a list of integers, we can do the following: + * + * ```c + * typedef struct { + * pm_list_node_t node; + * int value; + * } pm_int_node_t; + * + * pm_list_t list = { 0 }; + * pm_int_node_t *node = xmalloc(sizeof(pm_int_node_t)); + * node->value = 5; + * + * pm_list_append(&list, &node->node); + * ``` + * + * The pm_list_t struct is used to represent the overall linked list. It + * contains a pointer to the head and tail of the list. This allows for easy + * iteration and appending of new nodes. + */ +typedef struct pm_list_node { + /** A pointer to the next node in the list. */ + struct pm_list_node *next; +} pm_list_node_t; + +/** + * This represents the overall linked list. It keeps a pointer to the head and + * tail so that iteration is easy and pushing new nodes is easy. + */ +typedef struct { + /** The size of the list. */ + size_t size; + + /** A pointer to the head of the list. */ + pm_list_node_t *head; + + /** A pointer to the tail of the list. */ + pm_list_node_t *tail; +} pm_list_t; + +/** + * Returns true if the given list is empty. + * + * @param list The list to check. + * @return True if the given list is empty, otherwise false. 
+ * + * \public \memberof pm_list_t + */ +PRISM_EXPORTED_FUNCTION bool pm_list_empty_p(pm_list_t *list); + +/** + * Returns the size of the list. + * + * @param list The list to check. + * @return The size of the list. + * + * \public \memberof pm_list_t + */ +PRISM_EXPORTED_FUNCTION size_t pm_list_size(pm_list_t *list); + +/** + * Append a node to the given list. + * + * @param list The list to append to. + * @param node The node to append. + */ +void pm_list_append(pm_list_t *list, pm_list_node_t *node); + +/** + * Deallocate the internal state of the given list. + * + * @param list The list to free. + * + * \public \memberof pm_list_t + */ +PRISM_EXPORTED_FUNCTION void pm_list_free(pm_list_t *list); + +#endif diff --git a/prism/util/pm_memchr.c b/prism/util/pm_memchr.c new file mode 100644 index 0000000000..7ea20ace6d --- /dev/null +++ b/prism/util/pm_memchr.c @@ -0,0 +1,35 @@ +#include "prism/util/pm_memchr.h" + +#define PRISM_MEMCHR_TRAILING_BYTE_MINIMUM 0x40 + +/** + * We need to roll our own memchr to handle cases where the encoding changes and + * we need to search for a character in a buffer that could be the trailing byte + * of a multibyte character. 
+ */ +void * +pm_memchr(const void *memory, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding) { + if (encoding_changed && encoding->multibyte && character >= PRISM_MEMCHR_TRAILING_BYTE_MINIMUM) { + const uint8_t *source = (const uint8_t *) memory; + size_t index = 0; + + while (index < number) { + if (source[index] == character) { + return (void *) (source + index); + } + + size_t width = encoding->char_width(source + index, (ptrdiff_t) (number - index)); + if (width == 0) { + return NULL; + } + + index += width; + } + + return NULL; + } else { + return memchr(memory, character, number); + } +} + +#undef PRISM_MEMCHR_TRAILING_BYTE_MINIMUM diff --git a/prism/util/pm_memchr.h b/prism/util/pm_memchr.h new file mode 100644 index 0000000000..e0671eaed3 --- /dev/null +++ b/prism/util/pm_memchr.h @@ -0,0 +1,29 @@ +/** + * @file pm_memchr.h + * + * A custom memchr implementation. + */ +#ifndef PRISM_MEMCHR_H +#define PRISM_MEMCHR_H + +#include "prism/defines.h" +#include "prism/encoding.h" + +#include <stddef.h> + +/** + * We need to roll our own memchr to handle cases where the encoding changes and + * we need to search for a character in a buffer that could be the trailing byte + * of a multibyte character. + * + * @param source The source string. + * @param character The character to search for. + * @param number The maximum number of bytes to search. + * @param encoding_changed Whether the encoding changed. + * @param encoding A pointer to the encoding. + * @return A pointer to the first occurrence of the character in the source + * string, or NULL if no such character exists. 
+ */ +void * pm_memchr(const void *source, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding); + +#endif diff --git a/prism/util/pm_newline_list.c b/prism/util/pm_newline_list.c new file mode 100644 index 0000000000..8331618f54 --- /dev/null +++ b/prism/util/pm_newline_list.c @@ -0,0 +1,125 @@ +#include "prism/util/pm_newline_list.h" + +/** + * Initialize a new newline list with the given capacity. Returns true if the + * allocation of the offsets succeeds, otherwise returns false. + */ +bool +pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t capacity) { + list->offsets = (size_t *) xcalloc(capacity, sizeof(size_t)); + if (list->offsets == NULL) return false; + + list->start = start; + + // This is 1 instead of 0 because we want to include the first line of the + // file as having offset 0, which is set because of calloc. + list->size = 1; + list->capacity = capacity; + + return true; +} + +/** + * Clear out the newlines that have been appended to the list. + */ +void +pm_newline_list_clear(pm_newline_list_t *list) { + list->size = 1; +} + +/** + * Append a new offset to the newline list. Returns true if the reallocation of + * the offsets succeeds (if one was necessary), otherwise returns false. 
+ */ +bool +pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor) { + if (list->size == list->capacity) { + size_t *original_offsets = list->offsets; + + list->capacity = (list->capacity * 3) / 2; + list->offsets = (size_t *) xcalloc(list->capacity, sizeof(size_t)); + if (list->offsets == NULL) return false; + + memcpy(list->offsets, original_offsets, list->size * sizeof(size_t)); + xfree(original_offsets); + } + + assert(*cursor == '\n'); + assert(cursor >= list->start); + size_t newline_offset = (size_t) (cursor - list->start + 1); + + assert(list->size == 0 || newline_offset > list->offsets[list->size - 1]); + list->offsets[list->size++] = newline_offset; + + return true; +} + +/** + * Returns the line of the given offset. If the offset is not in the list, the + * line of the closest offset less than the given offset is returned. + */ +int32_t +pm_newline_list_line(const pm_newline_list_t *list, const uint8_t *cursor, int32_t start_line) { + assert(cursor >= list->start); + size_t offset = (size_t) (cursor - list->start); + + size_t left = 0; + size_t right = list->size - 1; + + while (left <= right) { + size_t mid = left + (right - left) / 2; + + if (list->offsets[mid] == offset) { + return ((int32_t) mid) + start_line; + } + + if (list->offsets[mid] < offset) { + left = mid + 1; + } else { + right = mid - 1; + } + } + + return ((int32_t) left) + start_line - 1; +} + +/** + * Returns the line and column of the given offset. If the offset is not in the + * list, the line and column of the closest offset less than the given offset + * are returned. 
+ */ +pm_line_column_t +pm_newline_list_line_column(const pm_newline_list_t *list, const uint8_t *cursor, int32_t start_line) { + assert(cursor >= list->start); + size_t offset = (size_t) (cursor - list->start); + + size_t left = 0; + size_t right = list->size - 1; + + while (left <= right) { + size_t mid = left + (right - left) / 2; + + if (list->offsets[mid] == offset) { + return ((pm_line_column_t) { ((int32_t) mid) + start_line, 0 }); + } + + if (list->offsets[mid] < offset) { + left = mid + 1; + } else { + right = mid - 1; + } + } + + return ((pm_line_column_t) { + .line = ((int32_t) left) + start_line - 1, + .column = (uint32_t) (offset - list->offsets[left - 1]) + }); +} + +/** + * Free the internal memory allocated for the newline list. + */ +void +pm_newline_list_free(pm_newline_list_t *list) { + xfree(list->offsets); +} diff --git a/prism/util/pm_newline_list.h b/prism/util/pm_newline_list.h new file mode 100644 index 0000000000..406abe8ba5 --- /dev/null +++ b/prism/util/pm_newline_list.h @@ -0,0 +1,113 @@ +/** + * @file pm_newline_list.h + * + * A list of byte offsets of newlines in a string. + * + * When compiling the syntax tree, it's necessary to know the line and column + * of many nodes. This is necessary to support things like error messages, + * tracepoints, etc. + * + * It's possible that we could store the start line, start column, end line, and + * end column on every node in addition to the offsets that we already store, + * but that would be quite a lot of memory overhead. + */ +#ifndef PRISM_NEWLINE_LIST_H +#define PRISM_NEWLINE_LIST_H + +#include "prism/defines.h" + +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> + +/** + * A list of offsets of newlines in a string. The offsets are assumed to be + * sorted/inserted in ascending order. + */ +typedef struct { + /** A pointer to the start of the source string. */ + const uint8_t *start; + + /** The number of offsets in the list. 
*/ + size_t size; + + /** The capacity of the list that has been allocated. */ + size_t capacity; + + /** The list of offsets. */ + size_t *offsets; +} pm_newline_list_t; + +/** + * A line and column in a string. + */ +typedef struct { + /** The line number. */ + int32_t line; + + /** The column number. */ + uint32_t column; +} pm_line_column_t; + +/** + * Initialize a new newline list with the given capacity. Returns true if the + * allocation of the offsets succeeds, otherwise returns false. + * + * @param list The list to initialize. + * @param start A pointer to the start of the source string. + * @param capacity The initial capacity of the list. + * @return True if the allocation of the offsets succeeds, otherwise false. + */ +bool pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t capacity); + +/** + * Clear out the newlines that have been appended to the list. + * + * @param list The list to clear. + */ +void +pm_newline_list_clear(pm_newline_list_t *list); + +/** + * Append a new offset to the newline list. Returns true if the reallocation of + * the offsets succeeds (if one was necessary), otherwise returns false. + * + * @param list The list to append to. + * @param cursor A pointer to the offset to append. + * @return True if the reallocation of the offsets succeeds (if one was + * necessary), otherwise false. + */ +bool pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor); + +/** + * Returns the line of the given offset. If the offset is not in the list, the + * line of the closest offset less than the given offset is returned. + * + * @param list The list to search. + * @param cursor A pointer to the offset to search for. + * @param start_line The line to start counting from. + * @return The line of the given offset. + */ +int32_t pm_newline_list_line(const pm_newline_list_t *list, const uint8_t *cursor, int32_t start_line); + +/** + * Returns the line and column of the given offset. 
If the offset is not in the + * list, the line and column of the closest offset less than the given offset + * are returned. + * + * @param list The list to search. + * @param cursor A pointer to the offset to search for. + * @param start_line The line to start counting from. + * @return The line and column of the given offset. + */ +pm_line_column_t pm_newline_list_line_column(const pm_newline_list_t *list, const uint8_t *cursor, int32_t start_line); + +/** + * Free the internal memory allocated for the newline list. + * + * @param list The list to free. + */ +void pm_newline_list_free(pm_newline_list_t *list); + +#endif diff --git a/prism/util/pm_string.c b/prism/util/pm_string.c new file mode 100644 index 0000000000..a7493c468b --- /dev/null +++ b/prism/util/pm_string.c @@ -0,0 +1,381 @@ +#include "prism/util/pm_string.h" + +static const uint8_t empty_source[] = ""; + +/** + * Returns the size of the pm_string_t struct. This is necessary to allocate the + * correct amount of memory in the FFI backend. + */ +PRISM_EXPORTED_FUNCTION size_t +pm_string_sizeof(void) { + return sizeof(pm_string_t); +} + +/** + * Initialize a shared string that is based on initial input. + */ +void +pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end) { + assert(start <= end); + + *string = (pm_string_t) { + .type = PM_STRING_SHARED, + .source = start, + .length = (size_t) (end - start) + }; +} + +/** + * Initialize an owned string that is responsible for freeing allocated memory. + */ +void +pm_string_owned_init(pm_string_t *string, uint8_t *source, size_t length) { + *string = (pm_string_t) { + .type = PM_STRING_OWNED, + .source = source, + .length = length + }; +} + +/** + * Initialize a constant string that doesn't own its memory source. 
+ */ +void +pm_string_constant_init(pm_string_t *string, const char *source, size_t length) { + *string = (pm_string_t) { + .type = PM_STRING_CONSTANT, + .source = (const uint8_t *) source, + .length = length + }; +} + +#ifdef _WIN32 +/** + * Represents a file handle on Windows, where the path will need to be freed + * when the file is closed. + */ +typedef struct { + /** The path to the file, which will become allocated memory. */ + WCHAR *path; + + /** The handle to the file, which will start as uninitialized memory. */ + HANDLE file; +} pm_string_file_handle_t; + +/** + * Open the file indicated by the filepath parameter for reading on Windows. + * Perform any kind of normalization that needs to happen on the filepath. + */ +static pm_string_init_result_t +pm_string_file_handle_open(pm_string_file_handle_t *handle, const char *filepath) { + int length = MultiByteToWideChar(CP_UTF8, 0, filepath, -1, NULL, 0); + if (length == 0) return PM_STRING_INIT_ERROR_GENERIC; + + handle->path = xmalloc(sizeof(WCHAR) * ((size_t) length)); + if ((handle->path == NULL) || (MultiByteToWideChar(CP_UTF8, 0, filepath, -1, handle->path, length) == 0)) { + xfree(handle->path); + return PM_STRING_INIT_ERROR_GENERIC; + } + + handle->file = CreateFileW(handle->path, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL); + if (handle->file == INVALID_HANDLE_VALUE) { + pm_string_init_result_t result = PM_STRING_INIT_ERROR_GENERIC; + + if (GetLastError() == ERROR_ACCESS_DENIED) { + DWORD attributes = GetFileAttributesW(handle->path); + if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) { + result = PM_STRING_INIT_ERROR_DIRECTORY; + } + } + + xfree(handle->path); + return result; + } + + return PM_STRING_INIT_SUCCESS; +} + +/** + * Close the file handle and free the path. 
+ */ +static void +pm_string_file_handle_close(pm_string_file_handle_t *handle) { + xfree(handle->path); + CloseHandle(handle->file); +} +#endif + +/** + * Read the file indicated by the filepath parameter into source and load its + * contents and size into the given `pm_string_t`. The given `pm_string_t` + * should be freed using `pm_string_free` when it is no longer used. + * + * We want to use demand paging as much as possible in order to avoid having to + * read the entire file into memory (which could be detrimental to performance + * for large files). This means that if we're on windows we'll use + * `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use + * `mmap`, and on other POSIX systems we'll use `read`. + */ +PRISM_EXPORTED_FUNCTION pm_string_init_result_t +pm_string_mapped_init(pm_string_t *string, const char *filepath) { +#ifdef _WIN32 + // Open the file for reading. + pm_string_file_handle_t handle; + pm_string_init_result_t result = pm_string_file_handle_open(&handle, filepath); + if (result != PM_STRING_INIT_SUCCESS) return result; + + // Get the file size. + DWORD file_size = GetFileSize(handle.file, NULL); + if (file_size == INVALID_FILE_SIZE) { + pm_string_file_handle_close(&handle); + return PM_STRING_INIT_ERROR_GENERIC; + } + + // If the file is empty, then we don't need to do anything else, we'll set + // the source to a constant empty string and return. + if (file_size == 0) { + pm_string_file_handle_close(&handle); + *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = empty_source, .length = 0 }; + return PM_STRING_INIT_SUCCESS; + } + + // Create a mapping of the file. + HANDLE mapping = CreateFileMapping(handle.file, NULL, PAGE_READONLY, 0, 0, NULL); + if (mapping == NULL) { + pm_string_file_handle_close(&handle); + return PM_STRING_INIT_ERROR_GENERIC; + } + + // Map the file into memory. 
+ uint8_t *source = (uint8_t *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0); + CloseHandle(mapping); + pm_string_file_handle_close(&handle); + + if (source == NULL) { + return PM_STRING_INIT_ERROR_GENERIC; + } + + *string = (pm_string_t) { .type = PM_STRING_MAPPED, .source = source, .length = (size_t) file_size }; + return PM_STRING_INIT_SUCCESS; +#elif defined(_POSIX_MAPPED_FILES) + // Open the file for reading + int fd = open(filepath, O_RDONLY); + if (fd == -1) { + return PM_STRING_INIT_ERROR_GENERIC; + } + + // Stat the file to get the file size + struct stat sb; + if (fstat(fd, &sb) == -1) { + close(fd); + return PM_STRING_INIT_ERROR_GENERIC; + } + + // Ensure it is a file and not a directory + if (S_ISDIR(sb.st_mode)) { + close(fd); + return PM_STRING_INIT_ERROR_DIRECTORY; + } + + // mmap the file descriptor to virtually get the contents + size_t size = (size_t) sb.st_size; + uint8_t *source = NULL; + + if (size == 0) { + close(fd); + *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = empty_source, .length = 0 }; + return PM_STRING_INIT_SUCCESS; + } + + source = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + if (source == MAP_FAILED) { + close(fd); + return PM_STRING_INIT_ERROR_GENERIC; + } + + close(fd); + *string = (pm_string_t) { .type = PM_STRING_MAPPED, .source = source, .length = size }; + return PM_STRING_INIT_SUCCESS; +#else + return pm_string_file_init(string, filepath); +#endif +} + +/** + * Read the file indicated by the filepath parameter into source and load its + * contents and size into the given `pm_string_t`. The given `pm_string_t` + * should be freed using `pm_string_free` when it is no longer used. + */ +PRISM_EXPORTED_FUNCTION pm_string_init_result_t +pm_string_file_init(pm_string_t *string, const char *filepath) { +#ifdef _WIN32 + // Open the file for reading. 
+ pm_string_file_handle_t handle; + pm_string_init_result_t result = pm_string_file_handle_open(&handle, filepath); + if (result != PM_STRING_INIT_SUCCESS) return result; + + // Get the file size. + DWORD file_size = GetFileSize(handle.file, NULL); + if (file_size == INVALID_FILE_SIZE) { + pm_string_file_handle_close(&handle); + return PM_STRING_INIT_ERROR_GENERIC; + } + + // If the file is empty, then we don't need to do anything else, we'll set + // the source to a constant empty string and return. + if (file_size == 0) { + pm_string_file_handle_close(&handle); + *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = empty_source, .length = 0 }; + return PM_STRING_INIT_SUCCESS; + } + + // Create a buffer to read the file into. + uint8_t *source = xmalloc(file_size); + if (source == NULL) { + pm_string_file_handle_close(&handle); + return PM_STRING_INIT_ERROR_GENERIC; + } + + // Read the contents of the file + DWORD bytes_read; + if (!ReadFile(handle.file, source, file_size, &bytes_read, NULL)) { + pm_string_file_handle_close(&handle); + return PM_STRING_INIT_ERROR_GENERIC; + } + + // Check the number of bytes read + if (bytes_read != file_size) { + xfree(source); + pm_string_file_handle_close(&handle); + return PM_STRING_INIT_ERROR_GENERIC; + } + + pm_string_file_handle_close(&handle); + *string = (pm_string_t) { .type = PM_STRING_OWNED, .source = source, .length = (size_t) file_size }; + return PM_STRING_INIT_SUCCESS; +#elif defined(PRISM_HAS_FILESYSTEM) + // Open the file for reading + int fd = open(filepath, O_RDONLY); + if (fd == -1) { + return PM_STRING_INIT_ERROR_GENERIC; + } + + // Stat the file to get the file size + struct stat sb; + if (fstat(fd, &sb) == -1) { + close(fd); + return PM_STRING_INIT_ERROR_GENERIC; + } + + // Ensure it is a file and not a directory + if (S_ISDIR(sb.st_mode)) { + close(fd); + return PM_STRING_INIT_ERROR_DIRECTORY; + } + + // Check the size to see if it's empty + size_t size = (size_t) sb.st_size; + if (size == 0) { 
+ close(fd); + *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = empty_source, .length = 0 }; + return PM_STRING_INIT_SUCCESS; + } + + size_t length = (size_t) size; + uint8_t *source = xmalloc(length); + if (source == NULL) { + close(fd); + return PM_STRING_INIT_ERROR_GENERIC; + } + + long bytes_read = (long) read(fd, source, length); + close(fd); + + if (bytes_read == -1) { + xfree(source); + return PM_STRING_INIT_ERROR_GENERIC; + } + + *string = (pm_string_t) { .type = PM_STRING_OWNED, .source = source, .length = length }; + return PM_STRING_INIT_SUCCESS; +#else + (void) string; + (void) filepath; + perror("pm_string_file_init is not implemented for this platform"); + return PM_STRING_INIT_ERROR_GENERIC; +#endif +} + +/** + * Ensure the string is owned. If it is not, then reinitialize it as owned and + * copy over the previous source. + */ +void +pm_string_ensure_owned(pm_string_t *string) { + if (string->type == PM_STRING_OWNED) return; + + size_t length = pm_string_length(string); + const uint8_t *source = pm_string_source(string); + + uint8_t *memory = xmalloc(length); + if (!memory) return; + + pm_string_owned_init(string, memory, length); + memcpy((void *) string->source, source, length); +} + +/** + * Compare the underlying lengths and bytes of two strings. Returns 0 if the + * strings are equal, a negative number if the left string is less than the + * right string, and a positive number if the left string is greater than the + * right string. + */ +int +pm_string_compare(const pm_string_t *left, const pm_string_t *right) { + size_t left_length = pm_string_length(left); + size_t right_length = pm_string_length(right); + + if (left_length < right_length) { + return -1; + } else if (left_length > right_length) { + return 1; + } + + return memcmp(pm_string_source(left), pm_string_source(right), left_length); +} + +/** + * Returns the length associated with the string. 
+ */ +PRISM_EXPORTED_FUNCTION size_t +pm_string_length(const pm_string_t *string) { + return string->length; +} + +/** + * Returns the start pointer associated with the string. + */ +PRISM_EXPORTED_FUNCTION const uint8_t * +pm_string_source(const pm_string_t *string) { + return string->source; +} + +/** + * Free the associated memory of the given string. + */ +PRISM_EXPORTED_FUNCTION void +pm_string_free(pm_string_t *string) { + void *memory = (void *) string->source; + + if (string->type == PM_STRING_OWNED) { + xfree(memory); +#ifdef PRISM_HAS_MMAP + } else if (string->type == PM_STRING_MAPPED && string->length) { +#if defined(_WIN32) + UnmapViewOfFile(memory); +#elif defined(_POSIX_MAPPED_FILES) + munmap(memory, string->length); +#endif +#endif /* PRISM_HAS_MMAP */ + } +} diff --git a/prism/util/pm_string.h b/prism/util/pm_string.h new file mode 100644 index 0000000000..d8456ff294 --- /dev/null +++ b/prism/util/pm_string.h @@ -0,0 +1,200 @@ +/** + * @file pm_string.h + * + * A generic string type that can have various ownership semantics. + */ +#ifndef PRISM_STRING_H +#define PRISM_STRING_H + +#include "prism/defines.h" + +#include <assert.h> +#include <errno.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +// The following headers are necessary to read files using demand paging. +#ifdef _WIN32 +#include <windows.h> +#elif defined(_POSIX_MAPPED_FILES) +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#elif defined(PRISM_HAS_FILESYSTEM) +#include <fcntl.h> +#include <sys/stat.h> +#endif + +/** + * A generic string type that can have various ownership semantics. + */ +typedef struct { + /** A pointer to the start of the string. */ + const uint8_t *source; + + /** The length of the string in bytes of memory. */ + size_t length; + + /** The type of the string. This field determines how the string should be freed. */ + enum { + /** This string is a constant string, and should not be freed. 
*/ + PM_STRING_CONSTANT, + + /** This is a slice of another string, and should not be freed. */ + PM_STRING_SHARED, + + /** This string owns its memory, and should be freed using `pm_string_free()`. */ + PM_STRING_OWNED, + +#ifdef PRISM_HAS_MMAP + /** This string is a memory-mapped file, and should be freed using `pm_string_free()`. */ + PM_STRING_MAPPED +#endif + } type; +} pm_string_t; + +/** + * Returns the size of the pm_string_t struct. This is necessary to allocate the + * correct amount of memory in the FFI backend. + * + * @return The size of the pm_string_t struct. + */ +PRISM_EXPORTED_FUNCTION size_t pm_string_sizeof(void); + +/** + * Defines an empty string. This is useful for initializing a string that will + * be filled in later. + */ +#define PM_STRING_EMPTY ((pm_string_t) { .type = PM_STRING_CONSTANT, .source = NULL, .length = 0 }) + +/** + * Initialize a shared string that is based on initial input. + * + * @param string The string to initialize. + * @param start The start of the string. + * @param end The end of the string. + */ +void pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end); + +/** + * Initialize an owned string that is responsible for freeing allocated memory. + * + * @param string The string to initialize. + * @param source The source of the string. + * @param length The length of the string. + */ +void pm_string_owned_init(pm_string_t *string, uint8_t *source, size_t length); + +/** + * Initialize a constant string that doesn't own its memory source. + * + * @param string The string to initialize. + * @param source The source of the string. + * @param length The length of the string. + */ +void pm_string_constant_init(pm_string_t *string, const char *source, size_t length); + +/** + * Represents the result of calling pm_string_mapped_init or + * pm_string_file_init. 
We need this additional information because there is + * not a platform-agnostic way to indicate that the file that was attempted to + * be opened was a directory. + */ +typedef enum { + /** Indicates that the string was successfully initialized. */ + PM_STRING_INIT_SUCCESS = 0, + /** + * Indicates a generic error from a string_*_init function, where the type + * of error should be read from `errno` or `GetLastError()`. + */ + PM_STRING_INIT_ERROR_GENERIC = 1, + /** + * Indicates that the file that was attempted to be opened was a directory. + */ + PM_STRING_INIT_ERROR_DIRECTORY = 2 +} pm_string_init_result_t; + +/** + * Read the file indicated by the filepath parameter into source and load its + * contents and size into the given `pm_string_t`. The given `pm_string_t` + * should be freed using `pm_string_free` when it is no longer used. + * + * We want to use demand paging as much as possible in order to avoid having to + * read the entire file into memory (which could be detrimental to performance + * for large files). This means that if we're on windows we'll use + * `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use + * `mmap`, and on other POSIX systems we'll use `read`. + * + * @param string The string to initialize. + * @param filepath The filepath to read. + * @return The success of the read, indicated by the value of the enum. + * + * \public \memberof pm_string_t + */ +PRISM_EXPORTED_FUNCTION pm_string_init_result_t pm_string_mapped_init(pm_string_t *string, const char *filepath); + +/** + * Read the file indicated by the filepath parameter into source and load its + * contents and size into the given `pm_string_t`. The given `pm_string_t` + * should be freed using `pm_string_free` when it is no longer used. + * + * @param string The string to initialize. + * @param filepath The filepath to read. + * @return The success of the read, indicated by the value of the enum. 
+ * + * \public \memberof pm_string_t + */ +PRISM_EXPORTED_FUNCTION pm_string_init_result_t pm_string_file_init(pm_string_t *string, const char *filepath); + +/** + * Ensure the string is owned. If it is not, then reinitialize it as owned and + * copy over the previous source. + * + * @param string The string to ensure is owned. + */ +void pm_string_ensure_owned(pm_string_t *string); + +/** + * Compare the underlying lengths and bytes of two strings. Returns 0 if the + * strings are equal, a negative number if the left string is less than the + * right string, and a positive number if the left string is greater than the + * right string. + * + * @param left The left string to compare. + * @param right The right string to compare. + * @return The comparison result. + */ +int pm_string_compare(const pm_string_t *left, const pm_string_t *right); + +/** + * Returns the length associated with the string. + * + * @param string The string to get the length of. + * @return The length of the string. + * + * \public \memberof pm_string_t + */ +PRISM_EXPORTED_FUNCTION size_t pm_string_length(const pm_string_t *string); + +/** + * Returns the start pointer associated with the string. + * + * @param string The string to get the start pointer of. + * @return The start pointer of the string. + * + * \public \memberof pm_string_t + */ +PRISM_EXPORTED_FUNCTION const uint8_t * pm_string_source(const pm_string_t *string); + +/** + * Free the associated memory of the given string. + * + * @param string The string to free. 
+ * + * \public \memberof pm_string_t + */ +PRISM_EXPORTED_FUNCTION void pm_string_free(pm_string_t *string); + +#endif diff --git a/prism/util/pm_strncasecmp.c b/prism/util/pm_strncasecmp.c new file mode 100644 index 0000000000..3f58421554 --- /dev/null +++ b/prism/util/pm_strncasecmp.c @@ -0,0 +1,36 @@ +#include "prism/util/pm_strncasecmp.h" + +/** + * A locale-insensitive version of `tolower(3)` + */ +static inline int +pm_tolower(int c) +{ + if ('A' <= c && c <= 'Z') { + return c | 0x20; + } + return c; +} + +/** + * Compare two strings, ignoring case, up to the given length. Returns 0 if the + * strings are equal, a negative number if string1 is less than string2, or a + * positive number if string1 is greater than string2. + * + * Note that this is effectively our own implementation of strncasecmp, but it's + * not available on all of the platforms we want to support so we're rolling it + * here. + */ +int +pm_strncasecmp(const uint8_t *string1, const uint8_t *string2, size_t length) { + size_t offset = 0; + int difference = 0; + + while (offset < length && string1[offset] != '\0') { + if (string2[offset] == '\0') return string1[offset]; + if ((difference = pm_tolower(string1[offset]) - pm_tolower(string2[offset])) != 0) return difference; + offset++; + } + + return difference; +} diff --git a/prism/util/pm_strncasecmp.h b/prism/util/pm_strncasecmp.h new file mode 100644 index 0000000000..5cb88cb5eb --- /dev/null +++ b/prism/util/pm_strncasecmp.h @@ -0,0 +1,32 @@ +/** + * @file pm_strncasecmp.h + * + * A custom strncasecmp implementation. + */ +#ifndef PRISM_STRNCASECMP_H +#define PRISM_STRNCASECMP_H + +#include "prism/defines.h" + +#include <ctype.h> +#include <stddef.h> +#include <stdint.h> + +/** + * Compare two strings, ignoring case, up to the given length. Returns 0 if the + * strings are equal, a negative number if string1 is less than string2, or a + * positive number if string1 is greater than string2. 
+ * + * Note that this is effectively our own implementation of strncasecmp, but it's + * not available on all of the platforms we want to support so we're rolling it + * here. + * + * @param string1 The first string to compare. + * @param string2 The second string to compare + * @param length The maximum number of characters to compare. + * @return 0 if the strings are equal, a negative number if string1 is less than + * string2, or a positive number if string1 is greater than string2. + */ +int pm_strncasecmp(const uint8_t *string1, const uint8_t *string2, size_t length); + +#endif diff --git a/prism/util/pm_strpbrk.c b/prism/util/pm_strpbrk.c new file mode 100644 index 0000000000..916a4cc3fd --- /dev/null +++ b/prism/util/pm_strpbrk.c @@ -0,0 +1,206 @@ +#include "prism/util/pm_strpbrk.h" + +/** + * Add an invalid multibyte character error to the parser. + */ +static inline void +pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { + pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start); +} + +/** + * Set the explicit encoding for the parser to the current encoding. + */ +static inline void +pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) { + if (parser->explicit_encoding != NULL) { + if (parser->explicit_encoding == parser->encoding) { + // Okay, we already locked to this encoding. + } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + // Not okay, we already found a Unicode escape sequence and this + // conflicts. + pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name); + } else { + // Should not be anything else. + assert(false && "unreachable"); + } + } + + parser->explicit_encoding = parser->encoding; +} + +/** + * This is the default path. 
+ */ +static inline const uint8_t * +pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { + size_t index = 0; + + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (source[index] < 0x80) { + index++; + } else { + size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)); + + if (width > 0) { + index += width; + } else if (!validate) { + index++; + } else { + // At this point we know we have an invalid multibyte character. + // We'll walk forward as far as we can until we find the next + // valid character so that we don't spam the user with a ton of + // the same kind of error. + const size_t start = index; + + do { + index++; + } while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + + pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); + } + } + } + + return NULL; +} + +/** + * This is the path when the encoding is ASCII-8BIT. + */ +static inline const uint8_t * +pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { + size_t index = 0; + + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1); + index++; + } + + return NULL; +} + +/** + * This is the slow path that does care about the encoding. 
+ */ +static inline const uint8_t * +pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { + size_t index = 0; + const pm_encoding_t *encoding = parser->encoding; + + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (source[index] < 0x80) { + index++; + } else { + size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); + if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width); + + if (width > 0) { + index += width; + } else if (!validate) { + index++; + } else { + // At this point we know we have an invalid multibyte character. + // We'll walk forward as far as we can until we find the next + // valid character so that we don't spam the user with a ton of + // the same kind of error. + const size_t start = index; + + do { + index++; + } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + + pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); + } + } + } + + return NULL; +} + +/** + * This is the fast path that does not care about the encoding because we know + * the encoding only supports single-byte characters. + */ +static inline const uint8_t * +pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { + size_t index = 0; + const pm_encoding_t *encoding = parser->encoding; + + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (source[index] < 0x80 || !validate) { + index++; + } else { + size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); + pm_strpbrk_explicit_encoding_set(parser, source, width); + + if (width > 0) { + index += width; + } else { + // At this point we know we have an invalid multibyte character. 
+ // We'll walk forward as far as we can until we find the next + // valid character so that we don't spam the user with a ton of + // the same kind of error. + const size_t start = index; + + do { + index++; + } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + + pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); + } + } + } + + return NULL; +} + +/** + * Here we have rolled our own version of strpbrk. The standard library strpbrk + * has undefined behavior when the source string is not null-terminated. We want + * to support strings that are not null-terminated because pm_parse does not + * have the contract that the string is null-terminated. (This is desirable + * because it means the extension can call pm_parse with the result of a call to + * mmap). + * + * The standard library strpbrk also does not support passing a maximum length + * to search. We want to support this for the reason mentioned above, but we + * also don't want it to stop on null bytes. Ruby actually allows null bytes + * within strings, comments, regular expressions, etc. So we need to be able to + * skip past them. + * + * Finally, we want to support encodings wherein the charset could contain + * characters that are trailing bytes of multi-byte characters. For example, in + * Shift_JIS, the backslash character can be a trailing byte. In that case we + * need to take a slower path and iterate one multi-byte character at a time. 
+ */ +const uint8_t * +pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) { + if (length <= 0) { + return NULL; + } else if (!parser->encoding_changed) { + return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate); + } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) { + return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate); + } else if (parser->encoding->multibyte) { + return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate); + } else { + return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate); + } +} diff --git a/prism/util/pm_strpbrk.h b/prism/util/pm_strpbrk.h new file mode 100644 index 0000000000..f387bd5782 --- /dev/null +++ b/prism/util/pm_strpbrk.h @@ -0,0 +1,46 @@ +/** + * @file pm_strpbrk.h + * + * A custom strpbrk implementation. + */ +#ifndef PRISM_STRPBRK_H +#define PRISM_STRPBRK_H + +#include "prism/defines.h" +#include "prism/diagnostic.h" +#include "prism/parser.h" + +#include <stddef.h> +#include <string.h> + +/** + * Here we have rolled our own version of strpbrk. The standard library strpbrk + * has undefined behavior when the source string is not null-terminated. We want + * to support strings that are not null-terminated because pm_parse does not + * have the contract that the string is null-terminated. (This is desirable + * because it means the extension can call pm_parse with the result of a call to + * mmap). + * + * The standard library strpbrk also does not support passing a maximum length + * to search. We want to support this for the reason mentioned above, but we + * also don't want it to stop on null bytes. Ruby actually allows null bytes + * within strings, comments, regular expressions, etc. So we need to be able to + * skip past them. 
+ * + * Finally, we want to support encodings wherein the charset could contain + * characters that are trailing bytes of multi-byte characters. For example, in + * Shift-JIS, the backslash character can be a trailing byte. In that case we + * need to take a slower path and iterate one multi-byte character at a time. + * + * @param parser The parser. + * @param source The source to search. + * @param charset The charset to search for. + * @param length The maximum number of bytes to search. + * @param validate Whether to validate that the source string is valid in the + * current encoding of the parser. + * @return A pointer to the first character in the source string that is in the + * charset, or NULL if no such character exists. + */ +const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate); + +#endif diff --git a/prism/version.h b/prism/version.h new file mode 100644 index 0000000000..0ef7435c17 --- /dev/null +++ b/prism/version.h @@ -0,0 +1,29 @@ +/** + * @file version.h + * + * The version of the Prism library. + */ +#ifndef PRISM_VERSION_H +#define PRISM_VERSION_H + +/** + * The major version of the Prism library as an int. + */ +#define PRISM_VERSION_MAJOR 1 + +/** + * The minor version of the Prism library as an int. + */ +#define PRISM_VERSION_MINOR 8 + +/** + * The patch version of the Prism library as an int. + */ +#define PRISM_VERSION_PATCH 0 + +/** + * The version of the Prism library as a constant string. + */ +#define PRISM_VERSION "1.8.0" + +#endif |
