diff options
author | Kevin Newton <kddnewton@gmail.com> | 2024-01-10 11:04:39 -0500 |
---|---|---|
committer | git <svn-admin@ruby-lang.org> | 2024-01-27 19:59:42 +0000 |
commit | f12ebe11888d9fdd121c98ca8a5155dc044f4cf4 (patch) | |
tree | 5049375d90965241d1809608f08ecbb7dcc4d094 /lib/prism/translation/parser/lexer.rb | |
parent | 223910b329751fbee36efe66ccd544e66dbe90f8 (diff) |
[ruby/prism] Add parser translation
https://github.com/ruby/prism/commit/8cdec8070c
Diffstat (limited to 'lib/prism/translation/parser/lexer.rb')
-rw-r--r-- | lib/prism/translation/parser/lexer.rb | 335 |
1 files changed, 335 insertions, 0 deletions
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb new file mode 100644 index 0000000000..6d3321b945 --- /dev/null +++ b/lib/prism/translation/parser/lexer.rb @@ -0,0 +1,335 @@ +# frozen_string_literal: true + +module Prism + module Translation + class Parser + # Accepts a list of prism tokens and converts them into the expected + # format for the parser gem. + class Lexer + # The direct translating of types between the two lexers. + TYPES = { + # These tokens should never appear in the output of the lexer. + EOF: nil, + MISSING: nil, + NOT_PROVIDED: nil, + IGNORED_NEWLINE: nil, + EMBDOC_END: nil, + EMBDOC_LINE: nil, + __END__: nil, + + # These tokens have more or less direct mappings. + AMPERSAND: :tAMPER2, + AMPERSAND_AMPERSAND: :tANDOP, + AMPERSAND_AMPERSAND_EQUAL: :tOP_ASGN, + AMPERSAND_DOT: :tANDDOT, + AMPERSAND_EQUAL: :tOP_ASGN, + BACK_REFERENCE: :tBACK_REF, + BACKTICK: :tXSTRING_BEG, + BANG: :tBANG, + BANG_EQUAL: :tNEQ, + BANG_TILDE: :tNMATCH, + BRACE_LEFT: :tLCURLY, + BRACE_RIGHT: :tRCURLY, + BRACKET_LEFT: :tLBRACK2, + BRACKET_LEFT_ARRAY: :tLBRACK, + BRACKET_LEFT_RIGHT: :tAREF, + BRACKET_LEFT_RIGHT_EQUAL: :tASET, + BRACKET_RIGHT: :tRBRACK, + CARET: :tCARET, + CARET_EQUAL: :tOP_ASGN, + CHARACTER_LITERAL: :tCHARACTER, + CLASS_VARIABLE: :tCVAR, + COLON: :tCOLON, + COLON_COLON: :tCOLON2, + COMMA: :tCOMMA, + COMMENT: :tCOMMENT, + CONSTANT: :tCONSTANT, + DOT: :tDOT, + DOT_DOT: :tDOT2, + DOT_DOT_DOT: :tDOT3, + EMBDOC_BEGIN: :tCOMMENT, + EMBEXPR_BEGIN: :tSTRING_DBEG, + EMBEXPR_END: :tSTRING_DEND, + EMBVAR: :tSTRING_DVAR, + EQUAL: :tEQL, + EQUAL_EQUAL: :tEQ, + EQUAL_EQUAL_EQUAL: :tEQQ, + EQUAL_GREATER: :tASSOC, + EQUAL_TILDE: :tMATCH, + FLOAT: :tFLOAT, + FLOAT_IMAGINARY: :tIMAGINARY, + FLOAT_RATIONAL: :tRATIONAL, + FLOAT_RATIONAL_IMAGINARY: :tIMAGINARY, + GLOBAL_VARIABLE: :tGVAR, + GREATER: :tGT, + GREATER_EQUAL: :tGEQ, + GREATER_GREATER: :tRSHFT, + GREATER_GREATER_EQUAL: :tOP_ASGN, + HEREDOC_START: :tSTRING_BEG, + HEREDOC_END: :tSTRING_END, + IDENTIFIER: :tIDENTIFIER, + INSTANCE_VARIABLE: :tIVAR, + INTEGER: :tINTEGER, + INTEGER_IMAGINARY: :tIMAGINARY, + INTEGER_RATIONAL: :tRATIONAL, + INTEGER_RATIONAL_IMAGINARY: :tIMAGINARY, + KEYWORD_ALIAS: :kALIAS, + KEYWORD_AND: :kAND, + KEYWORD_BEGIN: :kBEGIN, + KEYWORD_BEGIN_UPCASE: :klBEGIN, + KEYWORD_BREAK: :kBREAK, + KEYWORD_CASE: :kCASE, + KEYWORD_CLASS: :kCLASS, + KEYWORD_DEF: :kDEF, + KEYWORD_DEFINED: :kDEFINED, + KEYWORD_DO: :kDO, + KEYWORD_DO_LOOP: :kDO_COND, + KEYWORD_END: :kEND, + KEYWORD_END_UPCASE: :klEND, + KEYWORD_ENSURE: :kENSURE, + KEYWORD_ELSE: :kELSE, + KEYWORD_ELSIF: :kELSIF, + KEYWORD_FALSE: :kFALSE, + KEYWORD_FOR: :kFOR, + KEYWORD_IF: :kIF, + KEYWORD_IF_MODIFIER: :kIF_MOD, + KEYWORD_IN: :kIN, + KEYWORD_MODULE: :kMODULE, + KEYWORD_NEXT: :kNEXT, + KEYWORD_NIL: :kNIL, + KEYWORD_NOT: :kNOT, + KEYWORD_OR: :kOR, + KEYWORD_REDO: :kREDO, + KEYWORD_RESCUE: :kRESCUE, + KEYWORD_RESCUE_MODIFIER: :kRESCUE_MOD, + KEYWORD_RETRY: :kRETRY, + KEYWORD_RETURN: :kRETURN, + KEYWORD_SELF: :kSELF, + KEYWORD_SUPER: :kSUPER, + KEYWORD_THEN: :kTHEN, + KEYWORD_TRUE: :kTRUE, + KEYWORD_UNDEF: :kUNDEF, + KEYWORD_UNLESS: :kUNLESS, + KEYWORD_UNLESS_MODIFIER: :kUNLESS_MOD, + KEYWORD_UNTIL: :kUNTIL, + KEYWORD_UNTIL_MODIFIER: :kUNTIL_MOD, + KEYWORD_WHEN: :kWHEN, + KEYWORD_WHILE: :kWHILE, + KEYWORD_WHILE_MODIFIER: :kWHILE_MOD, + KEYWORD_YIELD: :kYIELD, + KEYWORD___ENCODING__: :k__ENCODING__, + KEYWORD___FILE__: :k__FILE__, + KEYWORD___LINE__: :k__LINE__, + LABEL: :tLABEL, + LABEL_END: :tLABEL_END, + LAMBDA_BEGIN: :tLAMBEG, + LESS: :tLT, + LESS_EQUAL: :tLEQ, + LESS_EQUAL_GREATER: :tCMP, + LESS_LESS: :tLSHFT, + LESS_LESS_EQUAL: :tOP_ASGN, + METHOD_NAME: :tFID, + MINUS: :tMINUS, + MINUS_EQUAL: :tOP_ASGN, + MINUS_GREATER: :tLAMBDA, + NEWLINE: :tNL, + NUMBERED_REFERENCE: :tNTH_REF, + PARENTHESIS_LEFT: :tLPAREN, + PARENTHESIS_LEFT_PARENTHESES: :tLPAREN_ARG, + PARENTHESIS_RIGHT: :tRPAREN, + PERCENT: :tPERCENT, + PERCENT_EQUAL: :tOP_ASGN, + PERCENT_LOWER_I: :tQSYMBOLS_BEG, + PERCENT_LOWER_W: :tQWORDS_BEG, + PERCENT_UPPER_I: :tSYMBOLS_BEG, + PERCENT_UPPER_W: :tWORDS_BEG, + PERCENT_LOWER_X: :tXSTRING_BEG, + PLUS: :tPLUS, + PLUS_EQUAL: :tOP_ASGN, + PIPE_EQUAL: :tOP_ASGN, + PIPE: :tPIPE, + PIPE_PIPE: :tOROP, + PIPE_PIPE_EQUAL: :tOP_ASGN, + QUESTION_MARK: :tEH, + REGEXP_BEGIN: :tREGEXP_BEG, + REGEXP_END: :tSTRING_END, + SEMICOLON: :tSEMI, + SLASH: :tDIVIDE, + SLASH_EQUAL: :tOP_ASGN, + STAR: :tSTAR2, + STAR_EQUAL: :tOP_ASGN, + STAR_STAR: :tPOW, + STAR_STAR_EQUAL: :tOP_ASGN, + STRING_BEGIN: :tSTRING_BEG, + STRING_CONTENT: :tSTRING_CONTENT, + STRING_END: :tSTRING_END, + SYMBOL_BEGIN: :tSYMBEG, + TILDE: :tTILDE, + UAMPERSAND: :tAMPER, + UCOLON_COLON: :tCOLON3, + UDOT_DOT: :tDOT2, + UDOT_DOT_DOT: :tBDOT3, + UMINUS: :tUMINUS, + UMINUS_NUM: :tUNARY_NUM, + UPLUS: :tUPLUS, + USTAR: :tSTAR, + USTAR_STAR: :tPOW, + WORDS_SEP: :tSPACE + } + + private_constant :TYPES + + # The Parser::Source::Buffer that the tokens were lexed from. + attr_reader :source_buffer + + # An array of prism tokens that we lexed. + attr_reader :lexed + + # A hash that maps offsets in bytes to offsets in characters. + attr_reader :offset_cache + + # Initialize the lexer with the given source buffer, prism tokens, and + # offset cache. + def initialize(source_buffer, lexed, offset_cache) + @source_buffer = source_buffer + @lexed = lexed + @offset_cache = offset_cache + end + + Range = ::Parser::Source::Range # :nodoc: + private_constant :Range + + # Convert the prism tokens into the expected format for the parser gem. + def to_a + tokens = [] + index = 0 + + while index < lexed.length + token, = lexed[index] + index += 1 + next if token.type == :IGNORED_NEWLINE || token.type == :EOF + + type = TYPES.fetch(token.type) + value = token.value + location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset]) + + case type + when :tCHARACTER + value.delete_prefix!("?") + when :tCOMMENT + if token.type == :EMBDOC_BEGIN + until (next_token = lexed[index]) && next_token.type == :EMBDOC_END + value += next_token.value + index += 1 + end + + value += next_token.value + location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index].location.end_offset]) + index += 1 + else + value.chomp! + location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1]) + end + when :tNL + value = nil + when :tFLOAT + value = Float(value) + when :tIMAGINARY + value = parse_complex(value) + when :tINTEGER + if value.start_with?("+") + tokens << [:tUNARY_NUM, ["+", Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]] + location = Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset]) + end + + value = Integer(value) + when :tLABEL + value.chomp!(":") + when :tLABEL_END + value.chomp!(":") + when :tNTH_REF + value = Integer(value.delete_prefix("$")) + when :tOP_ASGN + value.chomp!("=") + when :tRATIONAL + value = parse_rational(value) + when :tSPACE + value = nil + when :tSTRING_BEG + if ["\"", "'"].include?(value) && (next_token = lexed[index]) && next_token.type == :STRING_END + next_location = token.location.join(next_token.location) + type = :tSTRING + value = "" + location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset]) + index += 1 + elsif ["\"", "'"].include?(value) && (next_token = lexed[index]) && next_token.type == :STRING_CONTENT && (next_next_token = lexed[index + 1]) && next_next_token.type == :STRING_END + next_location = token.location.join(next_next_token.location) + type = :tSTRING + value = next_token.value + location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset]) + index += 2 + elsif value.start_with?("<<") + quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2] + value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}" + end + when :tSTRING_DVAR + value = nil + when :tSTRING_END + if token.type == :REGEXP_END + value = value[0] + location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1]) + end + when :tSYMBEG + if (next_token = lexed[index]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR + next_location = token.location.join(next_token.location) + type = :tSYMBOL + value = next_token.value + value = { "~@" => "~", "!@" => "!" }.fetch(value, value) + location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset]) + index += 1 + end + when :tFID + if tokens[-1][0] == :kDEF + type = :tIDENTIFIER + end + end + + tokens << [type, [value, location]] + + if token.type == :REGEXP_END + tokens << [:tREGEXP_OPT, [token.value[1..], Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]] + end + end + + tokens + end + + private + + # Parse a complex from the string representation. + def parse_complex(value) + value.chomp!("i") + + if value.end_with?("r") + Complex(0, parse_rational(value)) + elsif value.start_with?(/0[BbOoDdXx]/) + Complex(0, Integer(value)) + else + Complex(0, value) + end + end + + # Parse a rational from the string representation. + def parse_rational(value) + value.chomp!("r") + + if value.start_with?(/0[BbOoDdXx]/) + Rational(Integer(value)) + else + Rational(value) + end + end + end + end + end +end |