1 files changed, 438 insertions, 0 deletions
diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
new file mode 100644
index 0000000000..db7dbb1c87
--- /dev/null
+++ b/lib/prism/translation/parser/lexer.rb
@@ -0,0 +1,438 @@
+# frozen_string_literal: true
+
+module Prism
+  module Translation
+    class Parser
+      # Accepts a list of prism tokens and converts them into the expected
+      # format for the parser gem.
+      class Lexer
+        # The direct translating of types between the two lexers.
+        TYPES = {
+          # These tokens should never appear in the output of the lexer.
+          EOF: nil,
+          MISSING: nil,
+          NOT_PROVIDED: nil,
+          IGNORED_NEWLINE: nil,
+          EMBDOC_END: nil,
+          EMBDOC_LINE: nil,
+          __END__: nil,
+
+          # These tokens have more or less direct mappings.
+          AMPERSAND: :tAMPER2,
+          AMPERSAND_AMPERSAND: :tANDOP,
+          AMPERSAND_AMPERSAND_EQUAL: :tOP_ASGN,
+          AMPERSAND_DOT: :tANDDOT,
+          AMPERSAND_EQUAL: :tOP_ASGN,
+          BACK_REFERENCE: :tBACK_REF,
+          BACKTICK: :tXSTRING_BEG,
+          BANG: :tBANG,
+          BANG_EQUAL: :tNEQ,
+          BANG_TILDE: :tNMATCH,
+          BRACE_LEFT: :tLCURLY,
+          BRACE_RIGHT: :tRCURLY,
+          BRACKET_LEFT: :tLBRACK2,
+          BRACKET_LEFT_ARRAY: :tLBRACK,
+          BRACKET_LEFT_RIGHT: :tAREF,
+          BRACKET_LEFT_RIGHT_EQUAL: :tASET,
+          BRACKET_RIGHT: :tRBRACK,
+          CARET: :tCARET,
+          CARET_EQUAL: :tOP_ASGN,
+          CHARACTER_LITERAL: :tCHARACTER,
+          CLASS_VARIABLE: :tCVAR,
+          COLON: :tCOLON,
+          COLON_COLON: :tCOLON2,
+          COMMA: :tCOMMA,
+          COMMENT: :tCOMMENT,
+          CONSTANT: :tCONSTANT,
+          DOT: :tDOT,
+          DOT_DOT: :tDOT2,
+          DOT_DOT_DOT: :tDOT3,
+          EMBDOC_BEGIN: :tCOMMENT,
+          EMBEXPR_BEGIN: :tSTRING_DBEG,
+          EMBEXPR_END: :tSTRING_DEND,
+          EMBVAR: :tSTRING_DVAR,
+          EQUAL: :tEQL,
+          EQUAL_EQUAL: :tEQ,
+          EQUAL_EQUAL_EQUAL: :tEQQ,
+          EQUAL_GREATER: :tASSOC,
+          EQUAL_TILDE: :tMATCH,
+          FLOAT: :tFLOAT,
+          FLOAT_IMAGINARY: :tIMAGINARY,
+          FLOAT_RATIONAL: :tRATIONAL,
+          FLOAT_RATIONAL_IMAGINARY: :tIMAGINARY,
+          GLOBAL_VARIABLE: :tGVAR,
+          GREATER: :tGT,
+          GREATER_EQUAL: :tGEQ,
+          GREATER_GREATER: :tRSHFT,
+          GREATER_GREATER_EQUAL: :tOP_ASGN,
+          HEREDOC_START: :tSTRING_BEG,
+          HEREDOC_END: :tSTRING_END,
+          IDENTIFIER: :tIDENTIFIER,
+          INSTANCE_VARIABLE: :tIVAR,
+          INTEGER: :tINTEGER,
+          INTEGER_IMAGINARY: :tIMAGINARY,
+          INTEGER_RATIONAL: :tRATIONAL,
+          INTEGER_RATIONAL_IMAGINARY: :tIMAGINARY,
+          KEYWORD_ALIAS: :kALIAS,
+          KEYWORD_AND: :kAND,
+          KEYWORD_BEGIN: :kBEGIN,
+          KEYWORD_BEGIN_UPCASE: :klBEGIN,
+          KEYWORD_BREAK: :kBREAK,
+          KEYWORD_CASE: :kCASE,
+          KEYWORD_CLASS: :kCLASS,
+          KEYWORD_DEF: :kDEF,
+          KEYWORD_DEFINED: :kDEFINED,
+          KEYWORD_DO: :kDO,
+          KEYWORD_DO_LOOP: :kDO_COND,
+          KEYWORD_END: :kEND,
+          KEYWORD_END_UPCASE: :klEND,
+          KEYWORD_ENSURE: :kENSURE,
+          KEYWORD_ELSE: :kELSE,
+          KEYWORD_ELSIF: :kELSIF,
+          KEYWORD_FALSE: :kFALSE,
+          KEYWORD_FOR: :kFOR,
+          KEYWORD_IF: :kIF,
+          KEYWORD_IF_MODIFIER: :kIF_MOD,
+          KEYWORD_IN: :kIN,
+          KEYWORD_MODULE: :kMODULE,
+          KEYWORD_NEXT: :kNEXT,
+          KEYWORD_NIL: :kNIL,
+          KEYWORD_NOT: :kNOT,
+          KEYWORD_OR: :kOR,
+          KEYWORD_REDO: :kREDO,
+          KEYWORD_RESCUE: :kRESCUE,
+          KEYWORD_RESCUE_MODIFIER: :kRESCUE_MOD,
+          KEYWORD_RETRY: :kRETRY,
+          KEYWORD_RETURN: :kRETURN,
+          KEYWORD_SELF: :kSELF,
+          KEYWORD_SUPER: :kSUPER,
+          KEYWORD_THEN: :kTHEN,
+          KEYWORD_TRUE: :kTRUE,
+          KEYWORD_UNDEF: :kUNDEF,
+          KEYWORD_UNLESS: :kUNLESS,
+          KEYWORD_UNLESS_MODIFIER: :kUNLESS_MOD,
+          KEYWORD_UNTIL: :kUNTIL,
+          KEYWORD_UNTIL_MODIFIER: :kUNTIL_MOD,
+          KEYWORD_WHEN: :kWHEN,
+          KEYWORD_WHILE: :kWHILE,
+          KEYWORD_WHILE_MODIFIER: :kWHILE_MOD,
+          KEYWORD_YIELD: :kYIELD,
+          KEYWORD___ENCODING__: :k__ENCODING__,
+          KEYWORD___FILE__: :k__FILE__,
+          KEYWORD___LINE__: :k__LINE__,
+          LABEL: :tLABEL,
+          LABEL_END: :tLABEL_END,
+          LAMBDA_BEGIN: :tLAMBEG,
+          LESS: :tLT,
+          LESS_EQUAL: :tLEQ,
+          LESS_EQUAL_GREATER: :tCMP,
+          LESS_LESS: :tLSHFT,
+          LESS_LESS_EQUAL: :tOP_ASGN,
+          METHOD_NAME: :tFID,
+          MINUS: :tMINUS,
+          MINUS_EQUAL: :tOP_ASGN,
+          MINUS_GREATER: :tLAMBDA,
+          NEWLINE: :tNL,
+          NUMBERED_REFERENCE: :tNTH_REF,
+          PARENTHESIS_LEFT: :tLPAREN2,
+          PARENTHESIS_LEFT_PARENTHESES: :tLPAREN_ARG,
+          PARENTHESIS_RIGHT: :tRPAREN,
+          PERCENT: :tPERCENT,
+          PERCENT_EQUAL: :tOP_ASGN,
+          PERCENT_LOWER_I: :tQSYMBOLS_BEG,
+          PERCENT_LOWER_W: :tQWORDS_BEG,
+          PERCENT_UPPER_I: :tSYMBOLS_BEG,
+          PERCENT_UPPER_W: :tWORDS_BEG,
+          PERCENT_LOWER_X: :tXSTRING_BEG,
+          PLUS: :tPLUS,
+          PLUS_EQUAL: :tOP_ASGN,
+          PIPE_EQUAL: :tOP_ASGN,
+          PIPE: :tPIPE,
+          PIPE_PIPE: :tOROP,
+          PIPE_PIPE_EQUAL: :tOP_ASGN,
+          QUESTION_MARK: :tEH,
+          REGEXP_BEGIN: :tREGEXP_BEG,
+          REGEXP_END: :tSTRING_END,
+          SEMICOLON: :tSEMI,
+          SLASH: :tDIVIDE,
+          SLASH_EQUAL: :tOP_ASGN,
+          STAR: :tSTAR2,
+          STAR_EQUAL: :tOP_ASGN,
+          STAR_STAR: :tPOW,
+          STAR_STAR_EQUAL: :tOP_ASGN,
+          STRING_BEGIN: :tSTRING_BEG,
+          STRING_CONTENT: :tSTRING_CONTENT,
+          STRING_END: :tSTRING_END,
+          SYMBOL_BEGIN: :tSYMBEG,
+          TILDE: :tTILDE,
+          UAMPERSAND: :tAMPER,
+          UCOLON_COLON: :tCOLON3,
+          UDOT_DOT: :tBDOT2,
+          UDOT_DOT_DOT: :tBDOT3,
+          UMINUS: :tUMINUS,
+          UMINUS_NUM: :tUNARY_NUM,
+          UPLUS: :tUPLUS,
+          USTAR: :tSTAR,
+          USTAR_STAR: :tDSTAR,
+          WORDS_SEP: :tSPACE
+        }
+
+        # These constants represent flags in our lex state. We really, really
+        # don't want to be using them and we really, really don't want to be
+        # exposing them as part of our public API. Unfortunately, we don't have
+        # another way of matching the exact tokens that the parser gem expects
+        # without them. We should find another way to do this, but in the
+        # meantime we'll hide them from the documentation and mark them as
+        # private constants.
+        EXPR_BEG = 0x1 # :nodoc:
+        EXPR_LABEL = 0x400 # :nodoc:
+
+        # It is used to determine whether `do` is of the token type `kDO` or `kDO_LAMBDA`.
+        #
+        # NOTE: In edge cases like `-> (foo = -> (bar) {}) do end`, please note that `kDO` is still returned
+        # instead of `kDO_LAMBDA`, which is expected: https://github.com/ruby/prism/pull/3046
+        LAMBDA_TOKEN_TYPES = [:kDO_LAMBDA, :tLAMBDA, :tLAMBEG]
+
+        # The `PARENTHESIS_LEFT` token in Prism is classified as either `tLPAREN` or `tLPAREN2` in the Parser gem.
+        # The following token types are listed as those classified as `tLPAREN`.
+        LPAREN_CONVERSION_TOKEN_TYPES = [
+          :kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
+          :tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
+        ]
+
+        private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
+
+        # The Parser::Source::Buffer that the tokens were lexed from.
+        attr_reader :source_buffer
+
+        # An array of tuples that contain prism tokens and their associated lex
+        # state when they were lexed.
+        attr_reader :lexed
+
+        # A hash that maps offsets in bytes to offsets in characters.
+        attr_reader :offset_cache
+
+        # Initialize the lexer with the given source buffer, prism tokens, and
+        # offset cache.
+        def initialize(source_buffer, lexed, offset_cache)
+          @source_buffer = source_buffer
+          @lexed = lexed
+          @offset_cache = offset_cache
+        end
+
+        Range = ::Parser::Source::Range # :nodoc:
+        private_constant :Range
+
+        # Convert the prism tokens into the expected format for the parser gem.
+        def to_a
+          tokens = []
+
+          index = 0
+          length = lexed.length
+
+          heredoc_identifier_stack = []
+
+          while index < length
+            token, state = lexed[index]
+            index += 1
+            next if %i[IGNORED_NEWLINE __END__ EOF].include?(token.type)
+
+            type = TYPES.fetch(token.type)
+            value = token.value
+            location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset])
+
+            case type
+            when :kDO
+              types = tokens.map(&:first)
+              nearest_lambda_token_type = types.reverse.find { |type| LAMBDA_TOKEN_TYPES.include?(type) }
+
+              if nearest_lambda_token_type == :tLAMBDA
+                type = :kDO_LAMBDA
+              end
+            when :tCHARACTER
+              value.delete_prefix!("?")
+            when :tCOMMENT
+              if token.type == :EMBDOC_BEGIN
+                start_index = index
+
+                while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
+                  value += next_token.value
+                  index += 1
+                end
+
+                if start_index != index
+                  value += next_token.value
+                  location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index][0].location.end_offset])
+                  index += 1
+                end
+              else
+                value.chomp!
+                location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1])
+              end
+            when :tNL
+              value = nil
+            when :tFLOAT
+              value = parse_float(value)
+            when :tIMAGINARY
+              value = parse_complex(value)
+            when :tINTEGER
+              if value.start_with?("+")
+                tokens << [:tUNARY_NUM, ["+", Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]]
+                location = Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])
+              end
+
+              value = parse_integer(value)
+            when :tLABEL
+              value.chomp!(":")
+            when :tLABEL_END
+              value.chomp!(":")
+            when :tLCURLY
+              type = :tLBRACE if state == EXPR_BEG | EXPR_LABEL
+            when :tLPAREN2
+              type = :tLPAREN if tokens.empty? || LPAREN_CONVERSION_TOKEN_TYPES.include?(tokens.dig(-1, 0))
+            when :tNTH_REF
+              value = parse_integer(value.delete_prefix("$"))
+            when :tOP_ASGN
+              value.chomp!("=")
+            when :tRATIONAL
+              value = parse_rational(value)
+            when :tSPACE
+              value = nil
+            when :tSTRING_BEG
+              if token.type == :HEREDOC_START
+                heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier])
+              end
+              if ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_END
+                next_location = token.location.join(next_token.location)
+                type = :tSTRING
+                value = ""
+                location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
+                index += 1
+              elsif ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && (next_next_token = lexed[index + 1][0]) && next_next_token.type == :STRING_END
+                next_location = token.location.join(next_next_token.location)
+                type = :tSTRING
+                value = next_token.value.gsub("\\\\", "\\")
+                location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
+                index += 2
+              elsif value.start_with?("<<")
+                quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
+                if quote == "`"
+                  type = :tXSTRING_BEG
+                  value = "<<`"
+                else
+                  value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
+                end
+              end
+            when :tSTRING_CONTENT
+              unless (lines = token.value.lines).one?
+                start_offset = offset_cache[token.location.start_offset]
+                lines.map do |line|
+                  newline = line.end_with?("\r\n") ? "\r\n" : "\n"
+                  chomped_line = line.chomp
+                  if match = chomped_line.match(/(?<backslashes>\\+)\z/)
+                    adjustment = match[:backslashes].size / 2
+                    adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
+                    if match[:backslashes].size.odd?
+                      adjusted_line.delete_suffix!("\\")
+                      adjustment += 2
+                    else
+                      adjusted_line << newline
+                    end
+                  else
+                    adjusted_line = line
+                    adjustment = 0
+                  end
+
+                  end_offset = start_offset + adjusted_line.length + adjustment
+                  tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
+                  start_offset = end_offset
+                end
+                next
+              end
+            when :tSTRING_DVAR
+              value = nil
+            when :tSTRING_END
+              if token.type == :HEREDOC_END && value.end_with?("\n")
+                newline_length = value.end_with?("\r\n") ? 2 : 1
+                value = heredoc_identifier_stack.pop
+                location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length])
+              elsif token.type == :REGEXP_END
+                value = value[0]
+                location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
+              end
+            when :tSYMBEG
+              if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
+                next_location = token.location.join(next_token.location)
+                type = :tSYMBOL
+                value = next_token.value
+                value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
+                location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
+                index += 1
+              end
+            when :tFID
+              if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
+                type = :tIDENTIFIER
+              end
+            when :tXSTRING_BEG
+              if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END
+                type = :tBACK_REF2
+              end
+            end
+
+            tokens << [type, [value, location]]
+
+            if token.type == :REGEXP_END
+              tokens << [:tREGEXP_OPT, [token.value[1..], Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]]
+            end
+          end
+
+          tokens
+        end
+
+        private
+
+        # Parse an integer from the string representation.
+        def parse_integer(value)
+          Integer(value)
+        rescue ArgumentError
+          0
+        end
+
+        # Parse a float from the string representation.
+        def parse_float(value)
+          Float(value)
+        rescue ArgumentError
+          0.0
+        end
+
+        # Parse a complex from the string representation.
+        def parse_complex(value)
+          value.chomp!("i")
+
+          if value.end_with?("r")
+            Complex(0, parse_rational(value))
+          elsif value.start_with?(/0[BbOoDdXx]/)
+            Complex(0, parse_integer(value))
+          else
+            Complex(0, value)
+          end
+        rescue ArgumentError
+          0i
+        end
+
+        # Parse a rational from the string representation.
+        def parse_rational(value)
+          value.chomp!("r")
+
+          if value.start_with?(/0[BbOoDdXx]/)
+            Rational(parse_integer(value))
+          else
+            Rational(value)
+          end
+        rescue ArgumentError
+          0r
+        end
+      end
+    end
+  end
+end