summaryrefslogtreecommitdiff
path: root/lib/prism/lex_compat.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/prism/lex_compat.rb')
-rw-r--r--lib/prism/lex_compat.rb906
1 files changed, 906 insertions, 0 deletions
diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb
new file mode 100644
index 0000000000..7aacec037d
--- /dev/null
+++ b/lib/prism/lex_compat.rb
@@ -0,0 +1,906 @@
+# frozen_string_literal: true
+# :markup: markdown
+#--
+# rbs_inline: enabled
+
+module Prism
+ # @rbs!
+ # module Translation
+ # class Ripper
+ # EXPR_NONE: Integer
+ # EXPR_BEG: Integer
+ # EXPR_MID: Integer
+ # EXPR_END: Integer
+ # EXPR_CLASS: Integer
+ # EXPR_VALUE: Integer
+ # EXPR_ARG: Integer
+ # EXPR_CMDARG: Integer
+ # EXPR_ENDARG: Integer
+ # EXPR_ENDFN: Integer
+ #
+ # class Lexer < Ripper
+ # class State
+ # def self.[]: (Integer value) -> State
+ # end
+ # end
+ #
+ # class LineAndColumnCache
+ # def initialize: (Source source) -> void
+ #
+ # def line_and_column: (Integer byte_offset) -> [Integer, Integer]
+ # end
+ # end
+ # end
+
+ # This class is responsible for lexing the source using prism and then
+ # converting those tokens to be compatible with Ripper. In the vast majority
+ # of cases, this is a one-to-one mapping of the token type. Everything else
+ # generally lines up. However, there are a few cases that require special
+ # handling.
+ class LexCompat # :nodoc:
+ # @rbs!
+ # # A token produced by the Ripper lexer that Prism is replicating.
+ # type lex_compat_token = [[Integer, Integer], Symbol, String, untyped]
+
+ # A result class specialized for holding tokens produced by the lexer.
+ class Result < Prism::Result
+ # The list of tokens that were produced by the lexer.
+ attr_reader :value #: Array[lex_compat_token]
+
+ # Create a new lex compat result object with the given values.
+ #--
+ #: (Array[lex_compat_token] value, Array[Comment] comments, Array[MagicComment] magic_comments, Location? data_loc, Array[ParseError] errors, Array[ParseWarning] warnings, bool continuable, Source source) -> void
+ def initialize(value, comments, magic_comments, data_loc, errors, warnings, continuable, source)
+ @value = value
+ super(comments, magic_comments, data_loc, errors, warnings, continuable, source)
+ end
+
+ # Implement the hash pattern matching interface for Result.
+ #--
+ #: (Array[Symbol]? keys) -> Hash[Symbol, untyped]
+ def deconstruct_keys(keys) # :nodoc:
+ super.merge!(value: value)
+ end
+ end
+
+ # This is a mapping of prism token types to Ripper token types. This is a
+ # many-to-one mapping because we split up our token types, whereas Ripper
+ # tends to group them.
+ RIPPER = {
+ AMPERSAND: :on_op,
+ AMPERSAND_AMPERSAND: :on_op,
+ AMPERSAND_AMPERSAND_EQUAL: :on_op,
+ AMPERSAND_DOT: :on_op,
+ AMPERSAND_EQUAL: :on_op,
+ BACK_REFERENCE: :on_backref,
+ BACKTICK: :on_backtick,
+ BANG: :on_op,
+ BANG_EQUAL: :on_op,
+ BANG_TILDE: :on_op,
+ BRACE_LEFT: :on_lbrace,
+ BRACE_RIGHT: :on_rbrace,
+ BRACKET_LEFT: :on_lbracket,
+ BRACKET_LEFT_ARRAY: :on_lbracket,
+ BRACKET_LEFT_RIGHT: :on_op,
+ BRACKET_LEFT_RIGHT_EQUAL: :on_op,
+ BRACKET_RIGHT: :on_rbracket,
+ CARET: :on_op,
+ CARET_EQUAL: :on_op,
+ CHARACTER_LITERAL: :on_CHAR,
+ CLASS_VARIABLE: :on_cvar,
+ COLON: :on_op,
+ COLON_COLON: :on_op,
+ COMMA: :on_comma,
+ COMMENT: :on_comment,
+ CONSTANT: :on_const,
+ DOT: :on_period,
+ DOT_DOT: :on_op,
+ DOT_DOT_DOT: :on_op,
+ EMBDOC_BEGIN: :on_embdoc_beg,
+ EMBDOC_END: :on_embdoc_end,
+ EMBDOC_LINE: :on_embdoc,
+ EMBEXPR_BEGIN: :on_embexpr_beg,
+ EMBEXPR_END: :on_embexpr_end,
+ EMBVAR: :on_embvar,
+ EOF: :on_eof,
+ EQUAL: :on_op,
+ EQUAL_EQUAL: :on_op,
+ EQUAL_EQUAL_EQUAL: :on_op,
+ EQUAL_GREATER: :on_op,
+ EQUAL_TILDE: :on_op,
+ FLOAT: :on_float,
+ FLOAT_IMAGINARY: :on_imaginary,
+ FLOAT_RATIONAL: :on_rational,
+ FLOAT_RATIONAL_IMAGINARY: :on_imaginary,
+ GREATER: :on_op,
+ GREATER_EQUAL: :on_op,
+ GREATER_GREATER: :on_op,
+ GREATER_GREATER_EQUAL: :on_op,
+ GLOBAL_VARIABLE: :on_gvar,
+ HEREDOC_END: :on_heredoc_end,
+ HEREDOC_START: :on_heredoc_beg,
+ IDENTIFIER: :on_ident,
+ IGNORED_NEWLINE: :on_ignored_nl,
+ INTEGER: :on_int,
+ INTEGER_IMAGINARY: :on_imaginary,
+ INTEGER_RATIONAL: :on_rational,
+ INTEGER_RATIONAL_IMAGINARY: :on_imaginary,
+ INSTANCE_VARIABLE: :on_ivar,
+ INVALID: :INVALID,
+ KEYWORD___ENCODING__: :on_kw,
+ KEYWORD___LINE__: :on_kw,
+ KEYWORD___FILE__: :on_kw,
+ KEYWORD_ALIAS: :on_kw,
+ KEYWORD_AND: :on_kw,
+ KEYWORD_BEGIN: :on_kw,
+ KEYWORD_BEGIN_UPCASE: :on_kw,
+ KEYWORD_BREAK: :on_kw,
+ KEYWORD_CASE: :on_kw,
+ KEYWORD_CLASS: :on_kw,
+ KEYWORD_DEF: :on_kw,
+ KEYWORD_DEFINED: :on_kw,
+ KEYWORD_DO: :on_kw,
+ KEYWORD_DO_BLOCK: :on_kw,
+ KEYWORD_DO_LOOP: :on_kw,
+ KEYWORD_ELSE: :on_kw,
+ KEYWORD_ELSIF: :on_kw,
+ KEYWORD_END: :on_kw,
+ KEYWORD_END_UPCASE: :on_kw,
+ KEYWORD_ENSURE: :on_kw,
+ KEYWORD_FALSE: :on_kw,
+ KEYWORD_FOR: :on_kw,
+ KEYWORD_IF: :on_kw,
+ KEYWORD_IF_MODIFIER: :on_kw,
+ KEYWORD_IN: :on_kw,
+ KEYWORD_MODULE: :on_kw,
+ KEYWORD_NEXT: :on_kw,
+ KEYWORD_NIL: :on_kw,
+ KEYWORD_NOT: :on_kw,
+ KEYWORD_OR: :on_kw,
+ KEYWORD_REDO: :on_kw,
+ KEYWORD_RESCUE: :on_kw,
+ KEYWORD_RESCUE_MODIFIER: :on_kw,
+ KEYWORD_RETRY: :on_kw,
+ KEYWORD_RETURN: :on_kw,
+ KEYWORD_SELF: :on_kw,
+ KEYWORD_SUPER: :on_kw,
+ KEYWORD_THEN: :on_kw,
+ KEYWORD_TRUE: :on_kw,
+ KEYWORD_UNDEF: :on_kw,
+ KEYWORD_UNLESS: :on_kw,
+ KEYWORD_UNLESS_MODIFIER: :on_kw,
+ KEYWORD_UNTIL: :on_kw,
+ KEYWORD_UNTIL_MODIFIER: :on_kw,
+ KEYWORD_WHEN: :on_kw,
+ KEYWORD_WHILE: :on_kw,
+ KEYWORD_WHILE_MODIFIER: :on_kw,
+ KEYWORD_YIELD: :on_kw,
+ LABEL: :on_label,
+ LABEL_END: :on_label_end,
+ LAMBDA_BEGIN: :on_tlambeg,
+ LESS: :on_op,
+ LESS_EQUAL: :on_op,
+ LESS_EQUAL_GREATER: :on_op,
+ LESS_LESS: :on_op,
+ LESS_LESS_EQUAL: :on_op,
+ METHOD_NAME: :on_ident,
+ MINUS: :on_op,
+ MINUS_EQUAL: :on_op,
+ MINUS_GREATER: :on_tlambda,
+ NEWLINE: :on_nl,
+ NUMBERED_REFERENCE: :on_backref,
+ PARENTHESIS_LEFT: :on_lparen,
+ PARENTHESIS_LEFT_PARENTHESES: :on_lparen,
+ PARENTHESIS_RIGHT: :on_rparen,
+ PERCENT: :on_op,
+ PERCENT_EQUAL: :on_op,
+ PERCENT_LOWER_I: :on_qsymbols_beg,
+ PERCENT_LOWER_W: :on_qwords_beg,
+ PERCENT_LOWER_X: :on_backtick,
+ PERCENT_UPPER_I: :on_symbols_beg,
+ PERCENT_UPPER_W: :on_words_beg,
+ PIPE: :on_op,
+ PIPE_EQUAL: :on_op,
+ PIPE_PIPE: :on_op,
+ PIPE_PIPE_EQUAL: :on_op,
+ PLUS: :on_op,
+ PLUS_EQUAL: :on_op,
+ QUESTION_MARK: :on_op,
+ RATIONAL_FLOAT: :on_rational,
+ RATIONAL_INTEGER: :on_rational,
+ REGEXP_BEGIN: :on_regexp_beg,
+ REGEXP_END: :on_regexp_end,
+ SEMICOLON: :on_semicolon,
+ SLASH: :on_op,
+ SLASH_EQUAL: :on_op,
+ STAR: :on_op,
+ STAR_EQUAL: :on_op,
+ STAR_STAR: :on_op,
+ STAR_STAR_EQUAL: :on_op,
+ STRING_BEGIN: :on_tstring_beg,
+ STRING_CONTENT: :on_tstring_content,
+ STRING_END: :on_tstring_end,
+ SYMBOL_BEGIN: :on_symbeg,
+ TILDE: :on_op,
+ UAMPERSAND: :on_op,
+ UCOLON_COLON: :on_op,
+ UDOT_DOT: :on_op,
+ UDOT_DOT_DOT: :on_op,
+ UMINUS: :on_op,
+ UMINUS_NUM: :on_op,
+ UPLUS: :on_op,
+ USTAR: :on_op,
+ USTAR_STAR: :on_op,
+ WORDS_SEP: :on_words_sep,
+ "__END__": :on___end__
+ }.freeze
+
+ # A heredoc in this case is a list of tokens that belong to the body of the
+ # heredoc that should be appended onto the list of tokens when the heredoc
+ # closes.
+ module Heredoc # :nodoc:
+ # Heredocs that are no dash or tilde heredocs are just a list of tokens.
+ # We need to keep them around so that we can insert them in the correct
+ # order back into the token stream and set the state of the last token to
+ # the state that the heredoc was opened in.
+ class PlainHeredoc # :nodoc:
+ attr_reader :tokens #: Array[lex_compat_token]
+
+ #: () -> void
+ def initialize
+ @tokens = []
+ end
+
+ #: (lex_compat_token token) -> void
+ def <<(token)
+ tokens << token
+ end
+
+ #: () -> Array[lex_compat_token]
+ def to_a
+ tokens
+ end
+ end
+
+ # Dash heredocs are a little more complicated. They are a list of tokens
+ # that need to be split on "\\\n" to mimic Ripper's behavior. We also need
+ # to keep track of the state that the heredoc was opened in.
+ class DashHeredoc # :nodoc:
+ attr_reader :split #: bool
+ attr_reader :tokens #: Array[lex_compat_token]
+
+ #: (bool split) -> void
+ def initialize(split)
+ @split = split
+ @tokens = []
+ end
+
+ #: (lex_compat_token token) -> void
+ def <<(token)
+ tokens << token
+ end
+
+ #: () -> Array[lex_compat_token]
+ def to_a
+ embexpr_balance = 0
+
+ tokens.each_with_object([]) do |token, results| #$ Array[lex_compat_token]
+ case token[1]
+ when :on_embexpr_beg
+ embexpr_balance += 1
+ results << token
+ when :on_embexpr_end
+ embexpr_balance -= 1
+ results << token
+ when :on_tstring_content
+ if embexpr_balance == 0
+ lineno = token[0][0]
+ column = token[0][1]
+
+ if split
+ # Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind
+ # to keep the delimiter in the result.
+ token[2].split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
+ column = 0 if index > 0
+ results << [[lineno, column], :on_tstring_content, value, token[3]]
+ lineno += value.count("\n")
+ end
+ else
+ results << token
+ end
+ else
+ results << token
+ end
+ else
+ results << token
+ end
+ end
+ end
+ end
+
+ # Heredocs that are dedenting heredocs are a little more complicated.
+ # Ripper outputs on_ignored_sp tokens for the whitespace that is being
+ # removed from the output. prism only modifies the node itself and keeps
+ # the token the same. This simplifies prism, but makes comparing against
+ # Ripper much harder because there is a length mismatch.
+ #
+ # Fortunately, we already have to pull out the heredoc tokens in order to
+ # insert them into the stream in the correct order. As such, we can do
+ # some extra manipulation on the tokens to make them match Ripper's
+ # output by mirroring the dedent logic that Ripper uses.
+ class DedentingHeredoc # :nodoc:
+ TAB_WIDTH = 8
+
+ attr_reader :tokens #: Array[lex_compat_token]
+ attr_reader :dedent_next #: bool
+ attr_reader :dedent #: Integer?
+ attr_reader :embexpr_balance #: Integer
+ # @rbs @ended_on_newline: bool
+
+ #: () -> void
+ def initialize
+ @tokens = []
+ @dedent_next = true
+ @dedent = nil
+ @embexpr_balance = 0
+ @ended_on_newline = false
+ end
+
+ # As tokens are coming in, we track the minimum amount of common leading
+ # whitespace on plain string content tokens. This allows us to later
+ # remove that amount of whitespace from the beginning of each line.
+ #
+ #: (lex_compat_token token) -> void
+ def <<(token)
+ case token[1]
+ when :on_embexpr_beg, :on_heredoc_beg
+ @embexpr_balance += 1
+ @dedent = 0 if @dedent_next && @ended_on_newline
+ when :on_embexpr_end, :on_heredoc_end
+ @embexpr_balance -= 1
+ when :on_tstring_content
+ if embexpr_balance == 0
+ line = token[2]
+
+ if dedent_next && !(line.strip.empty? && line.end_with?("\n"))
+ leading = line[/\A(\s*)\n?/, 1] #: String
+ next_dedent = 0
+
+ leading.each_char do |char|
+ if char == "\t"
+ next_dedent = next_dedent - (next_dedent % TAB_WIDTH) + TAB_WIDTH
+ else
+ next_dedent += 1
+ end
+ end
+
+ @dedent = [dedent, next_dedent].compact.min
+ @dedent_next = true
+ @ended_on_newline = line.end_with?("\n")
+ tokens << token
+ return
+ end
+ end
+ end
+
+ @dedent_next = token[1] == :on_tstring_content && embexpr_balance == 0
+ @ended_on_newline = false
+ tokens << token
+ end
+
+ #: () -> Array[lex_compat_token]
+ def to_a
+ # If every line in the heredoc is blank, we still need to split up the
+ # string content token into multiple tokens.
+ if dedent.nil?
+ results = [] #: Array[lex_compat_token]
+ embexpr_balance = 0
+
+ tokens.each do |token|
+ case token[1]
+ when :on_embexpr_beg, :on_heredoc_beg
+ embexpr_balance += 1
+ results << token
+ when :on_embexpr_end, :on_heredoc_end
+ embexpr_balance -= 1
+ results << token
+ when :on_tstring_content
+ if embexpr_balance == 0
+ lineno = token[0][0]
+ column = token[0][1]
+
+ token[2].split(/(?<=\n)/).each_with_index do |value, index|
+ column = 0 if index > 0
+ results << [[lineno, column], :on_tstring_content, value, token[3]]
+ lineno += 1
+ end
+ else
+ results << token
+ end
+ else
+ results << token
+ end
+ end
+
+ return results
+ end
+
+ # If the minimum common whitespace is 0, then we need to concatenate
+ # string nodes together that are immediately adjacent.
+ if dedent == 0
+ results = [] #: Array[lex_compat_token]
+ embexpr_balance = 0
+
+ index = 0
+ max_index = tokens.length
+
+ while index < max_index
+ token = tokens[index]
+ results << token
+ index += 1
+
+ case token[1]
+ when :on_embexpr_beg, :on_heredoc_beg
+ embexpr_balance += 1
+ when :on_embexpr_end, :on_heredoc_end
+ embexpr_balance -= 1
+ when :on_tstring_content
+ if embexpr_balance == 0
+ while index < max_index && tokens[index][1] == :on_tstring_content && !token[2].match?(/\\\r?\n\z/)
+ token[2] << tokens[index][2]
+ index += 1
+ end
+ end
+ end
+ end
+
+ return results
+ end
+
+ # Otherwise, we're going to run through each token in the list and
+ # insert on_ignored_sp tokens for the amount of dedent that we need to
+ # perform. We also need to remove the dedent from the beginning of
+ # each line of plain string content tokens.
+ results = [] #: Array[lex_compat_token]
+ dedent_next = true
+ embexpr_balance = 0
+
+ tokens.each do |token|
+ # Notice that the structure of this conditional largely matches the
+ # whitespace calculation we performed above. This is because
+ # checking if the subsequent token needs to be dedented is common to
+ # both the dedent calculation and the ignored_sp insertion.
+ case token[1]
+ when :on_embexpr_beg
+ embexpr_balance += 1
+ results << token
+ when :on_embexpr_end
+ embexpr_balance -= 1
+ results << token
+ when :on_tstring_content
+ if embexpr_balance == 0
+ # Here we're going to split the string on newlines, but maintain
+ # the newlines in the resulting array. We'll do that with a look
+ # behind assertion.
+ splits = token[2].split(/(?<=\n)/)
+ index = 0
+
+ while index < splits.length
+ line = splits[index]
+ lineno = token[0][0] + index
+ column = token[0][1]
+
+ # Blank lines do not count toward common leading whitespace
+ # calculation and do not need to be dedented.
+ if dedent_next || index > 0
+ column = 0
+ end
+
+ # If the dedent is 0 and we're not supposed to dedent the next
+ # line or this line doesn't start with whitespace, then we
+ # should concatenate the rest of the string to match ripper.
+ if dedent == 0 && (!dedent_next || !line.start_with?(/\s/))
+ unjoined = splits[index..] #: Array[String]
+ line = unjoined.join
+ index = splits.length
+ end
+
+ # If we are supposed to dedent this line or if this is not the
+ # first line of the string and this line isn't entirely blank,
+ # then we need to insert an on_ignored_sp token and remove the
+ # dedent from the beginning of the line.
+ if (dedent > 0) && (dedent_next || index > 0)
+ deleting = 0
+ deleted_chars = [] #: Array[String]
+
+ # Gather up all of the characters that we're going to
+ # delete, stopping when you hit a character that would put
+ # you over the dedent amount.
+ line.each_char.with_index do |char, i|
+ case char
+ when "\r"
+ if line[i + 1] == "\n"
+ break
+ end
+ when "\n"
+ break
+ when "\t"
+ deleting = deleting - (deleting % TAB_WIDTH) + TAB_WIDTH
+ else
+ deleting += 1
+ end
+
+ break if deleting > dedent
+ deleted_chars << char
+ end
+
+ # If we have something to delete, then delete it from the
+ # string and insert an on_ignored_sp token.
+ if deleted_chars.any?
+ ignored = deleted_chars.join
+ line.delete_prefix!(ignored)
+
+ results << [[lineno, 0], :on_ignored_sp, ignored, token[3]]
+ column = ignored.length
+ end
+ end
+
+ results << [[lineno, column], token[1], line, token[3]] unless line.empty?
+ index += 1
+ end
+ else
+ results << token
+ end
+ else
+ results << token
+ end
+
+ dedent_next =
+ ((token[1] == :on_tstring_content) || (token[1] == :on_heredoc_end)) &&
+ embexpr_balance == 0
+ end
+
+ results
+ end
+ end
+
+ # Here we will split between the two types of heredocs and return the
+ # object that will store their tokens.
+ #--
+ #: (lex_compat_token opening) -> (PlainHeredoc | DashHeredoc | DedentingHeredoc)
+ def self.build(opening)
+ case opening[2][2]
+ when "~"
+ DedentingHeredoc.new
+ when "-"
+ DashHeredoc.new(opening[2][3] != "'")
+ else
+ PlainHeredoc.new
+ end
+ end
+ end
+
+ private_constant :Heredoc
+
+ # In previous versions of Ruby, Ripper wouldn't flush the bom before the
+ # first token, so we had to have a hack in place to account for that.
+ BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
+ private_constant :BOM_FLUSHED
+
+ attr_reader :options #: Hash[Symbol, untyped]
+ # @rbs @source: String
+
+ #: (String source, **untyped options) -> void
+ def initialize(source, **options)
+ @source = source
+ @options = options
+ end
+
+ #: () -> Result
+ def result
+ tokens = [] #: Array[lex_compat_token]
+
+ state = :default
+ heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]
+
+ result = Prism.lex(@source, **options)
+ source = result.source
+ result_value = result.value
+ previous_state = nil #: Translation::Ripper::Lexer::State?
+ last_heredoc_end = nil #: Integer?
+ eof_token = nil #: Token?
+
+ bom = source.slice(0, 3) == "\xEF\xBB\xBF"
+
+ result_value.each_with_index do |(prism_token, prism_state), index|
+ lineno = prism_token.location.start_line
+ column = prism_token.location.start_column
+
+ event = RIPPER.fetch(prism_token.type)
+ value = prism_token.value
+ lex_state = Translation::Ripper::Lexer::State[prism_state]
+
+ # If there's a UTF-8 byte-order mark as the start of the file, then for
+ # certain tokens ripper sets the first token back by 3 bytes. It also
+ # keeps the byte order mark in the first token's value. This is weird,
+ # and I don't want to mirror that in our parser. So instead, we'll match
+ # up the columns and values here.
+ if bom && lineno == 1
+ column -= 3
+
+ if index == 0 && column == 0 && !BOM_FLUSHED
+ flushed =
+ case prism_token.type
+ when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
+ :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
+ :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
+ :PERCENT_UPPER_W, :STRING_BEGIN
+ true
+ when :REGEXP_BEGIN, :SYMBOL_BEGIN
+ value.start_with?("%")
+ else
+ false
+ end
+
+ unless flushed
+ column -= 3
+ value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
+ end
+ end
+ end
+
+ lex_compat_token =
+ case event
+ when :on___end__
+ # Ripper doesn't include the rest of the token in the event, so we need to
+ # trim it down to just the content on the first line.
+ value = value[0..value.index("\n")] #: String
+ [[lineno, column], event, value, lex_state]
+ when :on_comment
+ [[lineno, column], event, value, lex_state]
+ when :on_heredoc_end
+ # Heredoc end tokens can be emitted in an odd order, so we don't
+ # want to bother comparing the state on them.
+ last_heredoc_end = prism_token.location.end_offset
+ [[lineno, column], event, value, lex_state]
+ when :on_embexpr_end
+ [[lineno, column], event, value, lex_state]
+ when :on_words_sep
+ # Ripper emits one token each per line.
+ value.each_line.with_index do |line, index|
+ if index > 0
+ lineno += 1
+ column = 0
+ end
+ tokens << [[lineno, column], event, line, lex_state]
+ end
+ tokens.pop #: lex_compat_token
+ when :on_regexp_end
+ # On regex end, Ripper scans and then sets end state, so the ripper
+ # lexed output is begin, when it should be end. prism sets lex state
+ # correctly to end state, but we want to be able to compare against
+ # Ripper's lexed state. So here, if it's a regexp end token, we
+ # output the state as the previous state, solely for the sake of
+ # comparison.
+ previous_token = result_value[index - 1][0]
+ lex_state =
+ if RIPPER.fetch(previous_token.type) == :on_embexpr_end
+ # If the previous token is embexpr_end, then we have to do even
+ # more processing. The end of an embedded expression sets the
+ # state to the state that it had at the beginning of the
+ # embedded expression. So we have to go and find that state and
+ # set it here.
+ counter = 1
+ current_index = index - 1
+
+ until counter == 0
+ current_index -= 1
+ current_event = RIPPER.fetch(result_value[current_index][0].type)
+ counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
+ end
+
+ Translation::Ripper::Lexer::State[result_value[current_index][1]]
+ else
+ previous_state
+ end
+
+ [[lineno, column], event, value, lex_state]
+ when :on_eof
+ eof_token = prism_token
+ previous_token = result_value[index - 1][0]
+
+ # If we're at the end of the file and the previous token was a
+ # comment and there is still whitespace after the comment, then
+ # Ripper will append a on_nl token (even though there isn't
+ # necessarily a newline). We mirror that here.
+ if previous_token.type == :COMMENT
+ # If the comment is at the start of a heredoc: <<HEREDOC # comment
+ # then the comment's end_offset is up near the heredoc_beg.
+ # This is not the correct offset to use for figuring out if
+ # there is trailing whitespace after the last token.
+ # Use the greater offset of the two to determine the start of
+ # the trailing whitespace.
+ start_offset = [previous_token.location.end_offset, last_heredoc_end].compact.max
+ end_offset = prism_token.location.start_offset
+
+ if start_offset < end_offset
+ if bom
+ start_offset += 3
+ end_offset += 3
+ end
+
+ tokens << [[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state]
+ end
+ end
+
+ [[lineno, column], event, value, lex_state]
+ else
+ [[lineno, column], event, value, lex_state]
+ end #: lex_compat_token
+
+ previous_state = lex_state
+
+ # The order in which tokens appear in our lexer is different from the
+ # order that they appear in Ripper. When we hit the declaration of a
+ # heredoc in prism, we skip forward and lex the rest of the content of
+ # the heredoc before going back and lexing at the end of the heredoc
+ # identifier.
+ #
+ # To match up to ripper, we keep a small state variable around here to
+ # track whether we're in the middle of a heredoc or not. In this way we
+ # can shuffle around the token to match Ripper's output.
+ case state
+ when :default
+ # The default state is when there are no heredocs at all. In this
+ # state we can append the token to the list of tokens and move on.
+ tokens << lex_compat_token
+
+ # If we get the declaration of a heredoc, then we open a new heredoc
+ # and move into the heredoc_opened state.
+ if event == :on_heredoc_beg
+ state = :heredoc_opened
+ heredoc_stack.last << Heredoc.build(lex_compat_token)
+ end
+ when :heredoc_opened
+ # The heredoc_opened state is when we've seen the declaration of a
+ # heredoc and are now lexing the body of the heredoc. In this state we
+ # push tokens onto the most recently created heredoc.
+ heredoc_stack.last.last << lex_compat_token
+
+ case event
+ when :on_heredoc_beg
+ # If we receive a heredoc declaration while lexing the body of a
+ # heredoc, this means we have nested heredocs. In this case we'll
+ # push a new heredoc onto the stack and stay in the heredoc_opened
+ # state since we're now lexing the body of the new heredoc.
+ heredoc_stack << [Heredoc.build(lex_compat_token)]
+ when :on_heredoc_end
+ # If we receive the end of a heredoc, then we're done lexing the
+ # body of the heredoc. In this case we now have a completed heredoc
+ # but need to wait for the next newline to push it into the token
+ # stream.
+ state = :heredoc_closed
+ end
+ when :heredoc_closed
+ if %i[on_nl on_ignored_nl on_comment].include?(event) || ((event == :on_tstring_content) && value.end_with?("\n"))
+ if heredoc_stack.size > 1
+ flushing = heredoc_stack.pop #: Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]
+ heredoc_stack.last.last << lex_compat_token
+
+ flushing.each do |heredoc|
+ heredoc.to_a.each do |flushed_token|
+ heredoc_stack.last.last << flushed_token
+ end
+ end
+
+ state = :heredoc_opened
+ next
+ end
+ elsif event == :on_heredoc_beg
+ tokens << lex_compat_token
+ state = :heredoc_opened
+ heredoc_stack.last << Heredoc.build(lex_compat_token)
+ next
+ elsif heredoc_stack.size > 1
+ heredoc_stack[-2].last << lex_compat_token
+ next
+ end
+
+ heredoc_stack.last.each do |heredoc|
+ tokens.concat(heredoc.to_a)
+ end
+
+ heredoc_stack.last.clear
+ state = :default
+
+ tokens << lex_compat_token
+ end
+ end
+
+ # Drop the EOF token from the list. The EOF token may not be
+ # present if the source was syntax invalid
+ if tokens.dig(-1, 1) == :on_eof
+ tokens = tokens[0...-1] #: Array[lex_compat_token]
+ end
+
+ # We sort by location because Ripper.lex sorts.
+ tokens.sort_by! do |token|
+ line, column = token[0]
+ source.byte_offset(line, column)
+ end
+
+ tokens = post_process_tokens(tokens, source, result.data_loc, bom, eof_token)
+
+ Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, result.continuable?, source)
+ end
+
+ private
+
+ #: (Array[lex_compat_token] tokens, Source source, Location? data_loc, bool bom, Token? eof_token) -> Array[lex_compat_token]
+ def post_process_tokens(tokens, source, data_loc, bom, eof_token)
+ new_tokens = [] #: Array[lex_compat_token]
+
+ prev_token_state = Translation::Ripper::Lexer::State[Translation::Ripper::EXPR_BEG]
+ prev_token_end = bom ? 3 : 0
+
+ cache = Translation::Ripper::LineAndColumnCache.new(source)
+
+ tokens.each do |token|
+ # Skip missing heredoc ends.
+ next if token[1] == :on_heredoc_end && token[2] == ""
+
+ # Add :on_sp tokens.
+ line, column = token[0]
+ start_offset = source.byte_offset(line, column)
+
+ # Ripper reports columns on line 1 without counting the BOM, so we
+ # adjust to get the real offset
+ start_offset += 3 if line == 1 && bom
+
+ if start_offset > prev_token_end
+ sp_value = source.slice(prev_token_end, start_offset - prev_token_end)
+ sp_line, sp_column = cache.line_and_column(prev_token_end)
+ # Ripper reports columns on line 1 without counting the BOM
+ sp_column -= 3 if sp_line == 1 && bom
+ continuation_index = sp_value.byteindex("\\")
+
+ # ripper emits up to three :on_sp tokens when line continuations are used
+ if continuation_index
+ next_whitespace_index = continuation_index + 1
+ next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r"
+ next_whitespace_index += 1
+ first_whitespace = sp_value[0...continuation_index] #: String
+ continuation = sp_value[continuation_index...next_whitespace_index] #: String
+ second_whitespace = sp_value[next_whitespace_index..] || ""
+
+ new_tokens << [[sp_line, sp_column], :on_sp, first_whitespace, prev_token_state] unless first_whitespace.empty?
+ new_tokens << [[sp_line, sp_column + continuation_index], :on_sp, continuation, prev_token_state]
+ new_tokens << [[sp_line + 1, 0], :on_sp, second_whitespace, prev_token_state] unless second_whitespace.empty?
+ else
+ new_tokens << [[sp_line, sp_column], :on_sp, sp_value, prev_token_state]
+ end
+ end
+
+ new_tokens << token
+ prev_token_state = token[3]
+ prev_token_end = start_offset + token[2].bytesize
+ end
+
+ if !data_loc && eof_token # no trailing :on_sp with __END__ as it is always preceded by :on_nl
+ end_offset = eof_token.location.end_offset
+ if prev_token_end < end_offset
+ new_tokens << [
+ [source.line(prev_token_end), source.column(prev_token_end)],
+ :on_sp,
+ source.slice(prev_token_end, end_offset - prev_token_end),
+ prev_token_state
+ ]
+ end
+ end
+
+ new_tokens
+ end
+ end
+
+ private_constant :LexCompat
+end