diff options
Diffstat (limited to 'lib/prism/lex_compat.rb')
-rw-r--r-- | lib/prism/lex_compat.rb | 927 |
1 files changed, 927 insertions, 0 deletions
diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb new file mode 100644 index 0000000000..4f8e443a3b --- /dev/null +++ b/lib/prism/lex_compat.rb @@ -0,0 +1,927 @@ +# frozen_string_literal: true + +require "delegate" +require "ripper" + +module Prism + # This class is responsible for lexing the source using prism and then + # converting those tokens to be compatible with Ripper. In the vast majority + # of cases, this is a one-to-one mapping of the token type. Everything else + # generally lines up. However, there are a few cases that require special + # handling. + class LexCompat # :nodoc: + # A result class specialized for holding tokens produced by the lexer. + class Result < Prism::Result + # The list of tokens that were produced by the lexer. + attr_reader :value + + # Create a new lex compat result object with the given values. + def initialize(value, comments, magic_comments, data_loc, errors, warnings, source) + @value = value + super(comments, magic_comments, data_loc, errors, warnings, source) + end + + # Implement the hash pattern matching interface for Result. + def deconstruct_keys(keys) + super.merge!(value: value) + end + end + + # This is a mapping of prism token types to Ripper token types. This is a + # many-to-one mapping because we split up our token types, whereas Ripper + # tends to group them. 
+ RIPPER = { + AMPERSAND: :on_op, + AMPERSAND_AMPERSAND: :on_op, + AMPERSAND_AMPERSAND_EQUAL: :on_op, + AMPERSAND_DOT: :on_op, + AMPERSAND_EQUAL: :on_op, + BACK_REFERENCE: :on_backref, + BACKTICK: :on_backtick, + BANG: :on_op, + BANG_EQUAL: :on_op, + BANG_TILDE: :on_op, + BRACE_LEFT: :on_lbrace, + BRACE_RIGHT: :on_rbrace, + BRACKET_LEFT: :on_lbracket, + BRACKET_LEFT_ARRAY: :on_lbracket, + BRACKET_LEFT_RIGHT: :on_op, + BRACKET_LEFT_RIGHT_EQUAL: :on_op, + BRACKET_RIGHT: :on_rbracket, + CARET: :on_op, + CARET_EQUAL: :on_op, + CHARACTER_LITERAL: :on_CHAR, + CLASS_VARIABLE: :on_cvar, + COLON: :on_op, + COLON_COLON: :on_op, + COMMA: :on_comma, + COMMENT: :on_comment, + CONSTANT: :on_const, + DOT: :on_period, + DOT_DOT: :on_op, + DOT_DOT_DOT: :on_op, + EMBDOC_BEGIN: :on_embdoc_beg, + EMBDOC_END: :on_embdoc_end, + EMBDOC_LINE: :on_embdoc, + EMBEXPR_BEGIN: :on_embexpr_beg, + EMBEXPR_END: :on_embexpr_end, + EMBVAR: :on_embvar, + EOF: :on_eof, + EQUAL: :on_op, + EQUAL_EQUAL: :on_op, + EQUAL_EQUAL_EQUAL: :on_op, + EQUAL_GREATER: :on_op, + EQUAL_TILDE: :on_op, + FLOAT: :on_float, + FLOAT_IMAGINARY: :on_imaginary, + FLOAT_RATIONAL: :on_rational, + FLOAT_RATIONAL_IMAGINARY: :on_imaginary, + GREATER: :on_op, + GREATER_EQUAL: :on_op, + GREATER_GREATER: :on_op, + GREATER_GREATER_EQUAL: :on_op, + GLOBAL_VARIABLE: :on_gvar, + HEREDOC_END: :on_heredoc_end, + HEREDOC_START: :on_heredoc_beg, + IDENTIFIER: :on_ident, + IGNORED_NEWLINE: :on_ignored_nl, + INTEGER: :on_int, + INTEGER_IMAGINARY: :on_imaginary, + INTEGER_RATIONAL: :on_rational, + INTEGER_RATIONAL_IMAGINARY: :on_imaginary, + INSTANCE_VARIABLE: :on_ivar, + INVALID: :INVALID, + KEYWORD___ENCODING__: :on_kw, + KEYWORD___LINE__: :on_kw, + KEYWORD___FILE__: :on_kw, + KEYWORD_ALIAS: :on_kw, + KEYWORD_AND: :on_kw, + KEYWORD_BEGIN: :on_kw, + KEYWORD_BEGIN_UPCASE: :on_kw, + KEYWORD_BREAK: :on_kw, + KEYWORD_CASE: :on_kw, + KEYWORD_CLASS: :on_kw, + KEYWORD_DEF: :on_kw, + KEYWORD_DEFINED: :on_kw, + KEYWORD_DO: :on_kw, + KEYWORD_DO_LOOP: 
:on_kw, + KEYWORD_ELSE: :on_kw, + KEYWORD_ELSIF: :on_kw, + KEYWORD_END: :on_kw, + KEYWORD_END_UPCASE: :on_kw, + KEYWORD_ENSURE: :on_kw, + KEYWORD_FALSE: :on_kw, + KEYWORD_FOR: :on_kw, + KEYWORD_IF: :on_kw, + KEYWORD_IF_MODIFIER: :on_kw, + KEYWORD_IN: :on_kw, + KEYWORD_MODULE: :on_kw, + KEYWORD_NEXT: :on_kw, + KEYWORD_NIL: :on_kw, + KEYWORD_NOT: :on_kw, + KEYWORD_OR: :on_kw, + KEYWORD_REDO: :on_kw, + KEYWORD_RESCUE: :on_kw, + KEYWORD_RESCUE_MODIFIER: :on_kw, + KEYWORD_RETRY: :on_kw, + KEYWORD_RETURN: :on_kw, + KEYWORD_SELF: :on_kw, + KEYWORD_SUPER: :on_kw, + KEYWORD_THEN: :on_kw, + KEYWORD_TRUE: :on_kw, + KEYWORD_UNDEF: :on_kw, + KEYWORD_UNLESS: :on_kw, + KEYWORD_UNLESS_MODIFIER: :on_kw, + KEYWORD_UNTIL: :on_kw, + KEYWORD_UNTIL_MODIFIER: :on_kw, + KEYWORD_WHEN: :on_kw, + KEYWORD_WHILE: :on_kw, + KEYWORD_WHILE_MODIFIER: :on_kw, + KEYWORD_YIELD: :on_kw, + LABEL: :on_label, + LABEL_END: :on_label_end, + LAMBDA_BEGIN: :on_tlambeg, + LESS: :on_op, + LESS_EQUAL: :on_op, + LESS_EQUAL_GREATER: :on_op, + LESS_LESS: :on_op, + LESS_LESS_EQUAL: :on_op, + METHOD_NAME: :on_ident, + MINUS: :on_op, + MINUS_EQUAL: :on_op, + MINUS_GREATER: :on_tlambda, + NEWLINE: :on_nl, + NUMBERED_REFERENCE: :on_backref, + PARENTHESIS_LEFT: :on_lparen, + PARENTHESIS_LEFT_PARENTHESES: :on_lparen, + PARENTHESIS_RIGHT: :on_rparen, + PERCENT: :on_op, + PERCENT_EQUAL: :on_op, + PERCENT_LOWER_I: :on_qsymbols_beg, + PERCENT_LOWER_W: :on_qwords_beg, + PERCENT_LOWER_X: :on_backtick, + PERCENT_UPPER_I: :on_symbols_beg, + PERCENT_UPPER_W: :on_words_beg, + PIPE: :on_op, + PIPE_EQUAL: :on_op, + PIPE_PIPE: :on_op, + PIPE_PIPE_EQUAL: :on_op, + PLUS: :on_op, + PLUS_EQUAL: :on_op, + QUESTION_MARK: :on_op, + RATIONAL_FLOAT: :on_rational, + RATIONAL_INTEGER: :on_rational, + REGEXP_BEGIN: :on_regexp_beg, + REGEXP_END: :on_regexp_end, + SEMICOLON: :on_semicolon, + SLASH: :on_op, + SLASH_EQUAL: :on_op, + STAR: :on_op, + STAR_EQUAL: :on_op, + STAR_STAR: :on_op, + STAR_STAR_EQUAL: :on_op, + STRING_BEGIN: :on_tstring_beg, + 
STRING_CONTENT: :on_tstring_content, + STRING_END: :on_tstring_end, + SYMBOL_BEGIN: :on_symbeg, + TILDE: :on_op, + UAMPERSAND: :on_op, + UCOLON_COLON: :on_op, + UDOT_DOT: :on_op, + UDOT_DOT_DOT: :on_op, + UMINUS: :on_op, + UMINUS_NUM: :on_op, + UPLUS: :on_op, + USTAR: :on_op, + USTAR_STAR: :on_op, + WORDS_SEP: :on_words_sep, + "__END__": :on___end__ + }.freeze + + # When we produce tokens, we produce the same arrays that Ripper does. + # However, we add a couple of convenience methods onto them to make them a + # little easier to work with. We delegate all other methods to the array. + class Token < SimpleDelegator + # @dynamic initialize, each, [] + + # The location of the token in the source. + def location + self[0] + end + + # The type of the token. + def event + self[1] + end + + # The slice of the source that this token represents. + def value + self[2] + end + + # The state of the lexer when this token was produced. + def state + self[3] + end + end + + # Ripper doesn't include the rest of the token in the event, so we need to + # trim it down to just the content on the first line when comparing. + class EndContentToken < Token + def ==(other) # :nodoc: + [self[0], self[1], self[2][0..self[2].index("\n")], self[3]] == other + end + end + + # Tokens where state should be ignored + # used for :on_comment, :on_heredoc_end, :on_embexpr_end + class IgnoreStateToken < Token + def ==(other) # :nodoc: + self[0...-1] == other[0...-1] + end + end + + # Ident tokens for the most part are exactly the same, except sometimes we + # know an ident is a local when ripper doesn't (when they are introduced + # through named captures in regular expressions). In that case we don't + # compare the state. 
+ class IdentToken < Token + def ==(other) # :nodoc: + (self[0...-1] == other[0...-1]) && ( + (other[3] == Ripper::EXPR_LABEL | Ripper::EXPR_END) || + (other[3] & Ripper::EXPR_ARG_ANY != 0) + ) + end + end + + # Ignored newlines can occasionally have a LABEL state attached to them, so + # we compare the state differently here. + class IgnoredNewlineToken < Token + def ==(other) # :nodoc: + return false unless self[0...-1] == other[0...-1] + + if self[3] == Ripper::EXPR_ARG | Ripper::EXPR_LABELED + other[3] & Ripper::EXPR_ARG | Ripper::EXPR_LABELED != 0 + else + self[3] == other[3] + end + end + end + + # If we have an identifier that follows a method name like: + # + # def foo bar + # + # then Ripper will mark bar as END|LABEL if there is a local in a parent + # scope named bar because it hasn't pushed the local table yet. We do this + # more accurately, so we need to allow comparing against both END and + # END|LABEL. + class ParamToken < Token + def ==(other) # :nodoc: + (self[0...-1] == other[0...-1]) && ( + (other[3] == Ripper::EXPR_END) || + (other[3] == Ripper::EXPR_END | Ripper::EXPR_LABEL) + ) + end + end + + # A heredoc in this case is a list of tokens that belong to the body of the + # heredoc that should be appended onto the list of tokens when the heredoc + # closes. + module Heredoc # :nodoc: + # Heredocs that are no dash or tilde heredocs are just a list of tokens. + # We need to keep them around so that we can insert them in the correct + # order back into the token stream and set the state of the last token to + # the state that the heredoc was opened in. + class PlainHeredoc # :nodoc: + attr_reader :tokens + + def initialize + @tokens = [] + end + + def <<(token) + tokens << token + end + + def to_a + tokens + end + end + + # Dash heredocs are a little more complicated. They are a list of tokens + # that need to be split on "\\\n" to mimic Ripper's behavior. We also need + # to keep track of the state that the heredoc was opened in. 
+ class DashHeredoc # :nodoc: + attr_reader :split, :tokens + + def initialize(split) + @split = split + @tokens = [] + end + + def <<(token) + tokens << token + end + + def to_a + embexpr_balance = 0 + + tokens.each_with_object([]) do |token, results| #$ Array[Token] + case token.event + when :on_embexpr_beg + embexpr_balance += 1 + results << token + when :on_embexpr_end + embexpr_balance -= 1 + results << token + when :on_tstring_content + if embexpr_balance == 0 + lineno = token[0][0] + column = token[0][1] + + if split + # Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind + # to keep the delimiter in the result. + token.value.split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index| + column = 0 if index > 0 + results << Token.new([[lineno, column], :on_tstring_content, value, token.state]) + lineno += value.count("\n") + end + else + results << token + end + else + results << token + end + else + results << token + end + end + end + end + + # Heredocs that are dedenting heredocs are a little more complicated. + # Ripper outputs on_ignored_sp tokens for the whitespace that is being + # removed from the output. prism only modifies the node itself and keeps + # the token the same. This simplifies prism, but makes comparing against + # Ripper much harder because there is a length mismatch. + # + # Fortunately, we already have to pull out the heredoc tokens in order to + # insert them into the stream in the correct order. As such, we can do + # some extra manipulation on the tokens to make them match Ripper's + # output by mirroring the dedent logic that Ripper uses. + class DedentingHeredoc # :nodoc: + TAB_WIDTH = 8 + + attr_reader :tokens, :dedent_next, :dedent, :embexpr_balance + + def initialize + @tokens = [] + @dedent_next = true + @dedent = nil + @embexpr_balance = 0 + @ended_on_newline = false + end + + # As tokens are coming in, we track the minimum amount of common leading + # whitespace on plain string content tokens. 
This allows us to later + # remove that amount of whitespace from the beginning of each line. + def <<(token) + case token.event + when :on_embexpr_beg, :on_heredoc_beg + @embexpr_balance += 1 + @dedent = 0 if @dedent_next && @ended_on_newline + when :on_embexpr_end, :on_heredoc_end + @embexpr_balance -= 1 + when :on_tstring_content + if embexpr_balance == 0 + line = token.value + + if dedent_next && !(line.strip.empty? && line.end_with?("\n")) + leading = line[/\A(\s*)\n?/, 1] + next_dedent = 0 + + leading.each_char do |char| + if char == "\t" + next_dedent = next_dedent - (next_dedent % TAB_WIDTH) + TAB_WIDTH + else + next_dedent += 1 + end + end + + @dedent = [dedent, next_dedent].compact.min + @dedent_next = true + @ended_on_newline = line.end_with?("\n") + tokens << token + return + end + end + end + + @dedent_next = token.event == :on_tstring_content && embexpr_balance == 0 + @ended_on_newline = false + tokens << token + end + + def to_a + # If every line in the heredoc is blank, we still need to split up the + # string content token into multiple tokens. + if dedent.nil? + results = [] #: Array[Token] + embexpr_balance = 0 + + tokens.each do |token| + case token.event + when :on_embexpr_beg, :on_heredoc_beg + embexpr_balance += 1 + results << token + when :on_embexpr_end, :on_heredoc_end + embexpr_balance -= 1 + results << token + when :on_tstring_content + if embexpr_balance == 0 + lineno = token[0][0] + column = token[0][1] + + token.value.split(/(?<=\n)/).each_with_index do |value, index| + column = 0 if index > 0 + results << Token.new([[lineno, column], :on_tstring_content, value, token.state]) + lineno += 1 + end + else + results << token + end + else + results << token + end + end + + return results + end + + # If the minimum common whitespace is 0, then we need to concatenate + # string nodes together that are immediately adjacent. 
+ if dedent == 0 + results = [] #: Array[Token] + embexpr_balance = 0 + + index = 0 + max_index = tokens.length + + while index < max_index + token = tokens[index] + results << token + index += 1 + + case token.event + when :on_embexpr_beg, :on_heredoc_beg + embexpr_balance += 1 + when :on_embexpr_end, :on_heredoc_end + embexpr_balance -= 1 + when :on_tstring_content + if embexpr_balance == 0 + while index < max_index && tokens[index].event == :on_tstring_content + token.value << tokens[index].value + index += 1 + end + end + end + end + + return results + end + + # Otherwise, we're going to run through each token in the list and + # insert on_ignored_sp tokens for the amount of dedent that we need to + # perform. We also need to remove the dedent from the beginning of + # each line of plain string content tokens. + results = [] #: Array[Token] + dedent_next = true + embexpr_balance = 0 + + tokens.each do |token| + # Notice that the structure of this conditional largely matches the + # whitespace calculation we performed above. This is because + # checking if the subsequent token needs to be dedented is common to + # both the dedent calculation and the ignored_sp insertion. + case token.event + when :on_embexpr_beg + embexpr_balance += 1 + results << token + when :on_embexpr_end + embexpr_balance -= 1 + results << token + when :on_tstring_content + if embexpr_balance == 0 + # Here we're going to split the string on newlines, but maintain + # the newlines in the resulting array. We'll do that with a look + # behind assertion. + splits = token.value.split(/(?<=\n)/) + index = 0 + + while index < splits.length + line = splits[index] + lineno = token[0][0] + index + column = token[0][1] + + # Blank lines do not count toward common leading whitespace + # calculation and do not need to be dedented. 
+ if dedent_next || index > 0 + column = 0 + end + + # If the dedent is 0 and we're not supposed to dedent the next + # line or this line doesn't start with whitespace, then we + # should concatenate the rest of the string to match ripper. + if dedent == 0 && (!dedent_next || !line.start_with?(/\s/)) + line = splits[index..].join + index = splits.length + end + + # If we are supposed to dedent this line or if this is not the + # first line of the string and this line isn't entirely blank, + # then we need to insert an on_ignored_sp token and remove the + # dedent from the beginning of the line. + if (dedent > 0) && (dedent_next || index > 0) + deleting = 0 + deleted_chars = [] #: Array[String] + + # Gather up all of the characters that we're going to + # delete, stopping when you hit a character that would put + # you over the dedent amount. + line.each_char.with_index do |char, i| + case char + when "\r" + if line[i + 1] == "\n" + break + end + when "\n" + break + when "\t" + deleting = deleting - (deleting % TAB_WIDTH) + TAB_WIDTH + else + deleting += 1 + end + + break if deleting > dedent + deleted_chars << char + end + + # If we have something to delete, then delete it from the + # string and insert an on_ignored_sp token. + if deleted_chars.any? + ignored = deleted_chars.join + line.delete_prefix!(ignored) + + results << Token.new([[lineno, 0], :on_ignored_sp, ignored, token[3]]) + column = ignored.length + end + end + + results << Token.new([[lineno, column], token[1], line, token[3]]) unless line.empty? + index += 1 + end + else + results << token + end + else + results << token + end + + dedent_next = + ((token.event == :on_tstring_content) || (token.event == :on_heredoc_end)) && + embexpr_balance == 0 + end + + results + end + end + + # Here we will split between the two types of heredocs and return the + # object that will store their tokens. 
      # +opening+ is the heredoc-begin token. The third character of its value
      # selects the collector: "~" builds a DedentingHeredoc, "-" builds a
      # DashHeredoc (splitting unless the fourth character is a single quote),
      # and anything else builds a PlainHeredoc.
      def self.build(opening)
        case opening.value[2]
        when "~"
          DedentingHeredoc.new
        when "-"
          DashHeredoc.new(opening.value[3] != "'")
        else
          PlainHeredoc.new
        end
      end
    end

    private_constant :Heredoc

    # source: the string to lex. options: keyword options that are forwarded
    # to Prism.lex unchanged.
    attr_reader :source, :options

    # Stores the source and the lex options; no lexing happens until #result
    # is called.
    def initialize(source, **options)
      @source = source
      @options = options
    end

    # Lexes the source with Prism.lex, converts every token into a
    # Ripper-shaped Token (or one of its comparison-relaxing subclasses),
    # moves heredoc body tokens to the position where Ripper reports them,
    # drops the final token, sorts by location and returns a Result.
    def result
      tokens = [] #: Array[LexCompat::Token]

      # State machine for heredoc reordering: :default, :heredoc_opened or
      # :heredoc_closed. heredoc_stack holds one list of open heredocs per
      # nesting level.
      state = :default
      heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]

      result = Prism.lex(source, **options)
      result_value = result.value
      previous_state = nil #: Ripper::Lexer::State?
      last_heredoc_end = nil #: Integer?

      # In previous versions of Ruby, Ripper wouldn't flush the bom before the
      # first token, so we had to have a hack in place to account for that. This
      # checks for that behavior.
      bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
      bom = source.byteslice(0..2) == "\xEF\xBB\xBF"

      result_value.each_with_index do |(token, lex_state), index|
        lineno = token.location.start_line
        column = token.location.start_column

        # If there's a UTF-8 byte-order mark as the start of the file, then for
        # certain tokens ripper sets the first token back by 3 bytes. It also
        # keeps the byte order mark in the first token's value. This is weird,
        # and I don't want to mirror that in our parser. So instead, we'll match
        # up the columns and values here.
        if bom && lineno == 1
          column -= 3

          if index == 0 && column == 0 && !bom_flushed
            flushed =
              case token.type
              when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
                   :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
                   :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
                   :PERCENT_UPPER_W, :STRING_BEGIN
                true
              when :REGEXP_BEGIN, :SYMBOL_BEGIN
                token.value.start_with?("%")
              else
                false
              end

            unless flushed
              column -= 3
              # Note that this mutates the prism token's value in place, so
              # the `value = token.value` read below sees the prepended BOM.
              value = token.value
              value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
            end
          end
        end

        event = RIPPER.fetch(token.type)
        value = token.value
        lex_state = Ripper::Lexer::State.new(lex_state)

        token =
          case event
          when :on___end__
            EndContentToken.new([[lineno, column], event, value, lex_state])
          when :on_comment
            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
          when :on_heredoc_end
            # Heredoc end tokens can be emitted in an odd order, so we don't
            # want to bother comparing the state on them.
            last_heredoc_end = token.location.end_offset
            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
          when :on_ident
            if lex_state == Ripper::EXPR_END
              # If we have an identifier that follows a method name like:
              #
              #     def foo bar
              #
              # then Ripper will mark bar as END|LABEL if there is a local in a
              # parent scope named bar because it hasn't pushed the local table
              # yet. We do this more accurately, so we need to allow comparing
              # against both END and END|LABEL.
              ParamToken.new([[lineno, column], event, value, lex_state])
            elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
              # In the event that we're comparing identifiers, we're going to
              # allow a little divergence. Ripper doesn't account for local
              # variables introduced through named captures in regexes, and we
              # do, which accounts for this difference.
              IdentToken.new([[lineno, column], event, value, lex_state])
            else
              Token.new([[lineno, column], event, value, lex_state])
            end
          when :on_embexpr_end
            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
          when :on_ignored_nl
            # Ignored newlines can occasionally have a LABEL state attached to
            # them which doesn't actually impact anything. We don't mirror that
            # state so we ignored it.
            IgnoredNewlineToken.new([[lineno, column], event, value, lex_state])
          when :on_regexp_end
            # On regex end, Ripper scans and then sets end state, so the ripper
            # lexed output is begin, when it should be end. prism sets lex state
            # correctly to end state, but we want to be able to compare against
            # Ripper's lexed state. So here, if it's a regexp end token, we
            # output the state as the previous state, solely for the sake of
            # comparison.
            previous_token = result_value[index - 1][0]
            lex_state =
              if RIPPER.fetch(previous_token.type) == :on_embexpr_end
                # If the previous token is embexpr_end, then we have to do even
                # more processing. The end of an embedded expression sets the
                # state to the state that it had at the beginning of the
                # embedded expression. So we have to go and find that state and
                # set it here.
                counter = 1
                current_index = index - 1

                # Walk backwards until the embexpr_beg that matches the
                # embexpr_end at index - 1 has been found.
                until counter == 0
                  current_index -= 1
                  current_event = RIPPER.fetch(result_value[current_index][0].type)
                  counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
                end

                Ripper::Lexer::State.new(result_value[current_index][1])
              else
                previous_state
              end

            Token.new([[lineno, column], event, value, lex_state])
          when :on_eof
            previous_token = result_value[index - 1][0]

            # If we're at the end of the file and the previous token was a
            # comment and there is still whitespace after the comment, then
            # Ripper will append a on_nl token (even though there isn't
            # necessarily a newline). We mirror that here.
            if previous_token.type == :COMMENT
              # If the comment is at the start of a heredoc: <<HEREDOC # comment
              # then the comment's end_offset is up near the heredoc_beg.
              # This is not the correct offset to use for figuring out if
              # there is trailing whitespace after the last token.
              # Use the greater offset of the two to determine the start of
              # the trailing whitespace.
              start_offset = [previous_token.location.end_offset, last_heredoc_end].compact.max
              end_offset = token.location.start_offset

              if start_offset < end_offset
                if bom
                  start_offset += 3
                  end_offset += 3
                end

                tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
              end
            end

            Token.new([[lineno, column], event, value, lex_state])
          else
            Token.new([[lineno, column], event, value, lex_state])
          end

        previous_state = lex_state

        # The order in which tokens appear in our lexer is different from the
        # order that they appear in Ripper. When we hit the declaration of a
        # heredoc in prism, we skip forward and lex the rest of the content of
        # the heredoc before going back and lexing at the end of the heredoc
        # identifier.
        #
        # To match up to ripper, we keep a small state variable around here to
        # track whether we're in the middle of a heredoc or not. In this way we
        # can shuffle around the token to match Ripper's output.
        case state
        when :default
          # The default state is when there are no heredocs at all. In this
          # state we can append the token to the list of tokens and move on.
          tokens << token

          # If we get the declaration of a heredoc, then we open a new heredoc
          # and move into the heredoc_opened state.
          if event == :on_heredoc_beg
            state = :heredoc_opened
            heredoc_stack.last << Heredoc.build(token)
          end
        when :heredoc_opened
          # The heredoc_opened state is when we've seen the declaration of a
          # heredoc and are now lexing the body of the heredoc. In this state we
          # push tokens onto the most recently created heredoc.
          heredoc_stack.last.last << token

          case event
          when :on_heredoc_beg
            # If we receive a heredoc declaration while lexing the body of a
            # heredoc, this means we have nested heredocs. In this case we'll
            # push a new heredoc onto the stack and stay in the heredoc_opened
            # state since we're now lexing the body of the new heredoc.
            heredoc_stack << [Heredoc.build(token)]
          when :on_heredoc_end
            # If we receive the end of a heredoc, then we're done lexing the
            # body of the heredoc. In this case we now have a completed heredoc
            # but need to wait for the next newline to push it into the token
            # stream.
            state = :heredoc_closed
          end
        when :heredoc_closed
          # A newline-like token ends the line the heredoc was declared on.
          if %i[on_nl on_ignored_nl on_comment].include?(event) || (event == :on_tstring_content && value.end_with?("\n"))
            if heredoc_stack.size > 1
              # Nested case: move this token and the finished inner heredocs
              # into the enclosing heredoc, then keep lexing its body.
              flushing = heredoc_stack.pop
              heredoc_stack.last.last << token

              flushing.each do |heredoc|
                heredoc.to_a.each do |flushed_token|
                  heredoc_stack.last.last << flushed_token
                end
              end

              state = :heredoc_opened
              next
            end
          elsif event == :on_heredoc_beg
            # Another heredoc declared on the same line as the one that just
            # closed.
            tokens << token
            state = :heredoc_opened
            heredoc_stack.last << Heredoc.build(token)
            next
          elsif heredoc_stack.size > 1
            # Still on the declaration line inside a nested heredoc: the token
            # belongs to the enclosing heredoc's body.
            heredoc_stack[-2].last << token
            next
          end

          # Top level: flush every finished heredoc into the token stream and
          # return to the default state.
          heredoc_stack.last.each do |heredoc|
            tokens.concat(heredoc.to_a)
          end

          heredoc_stack.last.clear
          state = :default

          tokens << token
        end
      end

      # Drop the EOF token from the list
      tokens = tokens[0...-1]

      # We sort by location to compare against Ripper's output
      tokens.sort_by!(&:location)

      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
    end
  end

  private_constant :LexCompat

  # This is a class that wraps the Ripper lexer to produce almost exactly the
  # same tokens.
+ class LexRipper # :nodoc: + attr_reader :source + + def initialize(source) + @source = source + end + + def result + previous = [] #: [[Integer, Integer], Symbol, String, untyped] | [] + results = [] #: Array[[[Integer, Integer], Symbol, String, untyped]] + + lex(source).each do |token| + case token[1] + when :on_sp + # skip + when :on_tstring_content + if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@")) + previous[2] << token[2] + else + results << token + previous = token + end + when :on_words_sep + if previous[1] == :on_words_sep + previous[2] << token[2] + else + results << token + previous = token + end + else + results << token + previous = token + end + end + + results + end + + private + + if Ripper.method(:lex).parameters.assoc(:keyrest) + def lex(source) + Ripper.lex(source, raise_errors: true) + end + else + def lex(source) + ripper = Ripper::Lexer.new(source) + ripper.lex.tap do |result| + raise SyntaxError, ripper.errors.map(&:message).join(' ;') if ripper.errors.any? + end + end + end + end + + private_constant :LexRipper +end |