diff options
Diffstat (limited to 'lib/prism/lex_compat.rb')
| -rw-r--r-- | lib/prism/lex_compat.rb | 906 |
1 files changed, 906 insertions, 0 deletions
# frozen_string_literal: true
# :markup: markdown
#--
# rbs_inline: enabled

module Prism
  # @rbs!
  #    module Translation
  #      class Ripper
  #        EXPR_NONE: Integer
  #        EXPR_BEG: Integer
  #        EXPR_MID: Integer
  #        EXPR_END: Integer
  #        EXPR_CLASS: Integer
  #        EXPR_VALUE: Integer
  #        EXPR_ARG: Integer
  #        EXPR_CMDARG: Integer
  #        EXPR_ENDARG: Integer
  #        EXPR_ENDFN: Integer
  #
  #        class Lexer < Ripper
  #          class State
  #            def self.[]: (Integer value) -> State
  #          end
  #        end
  #
  #        class LineAndColumnCache
  #          def initialize: (Source source) -> void
  #
  #          def line_and_column: (Integer byte_offset) -> [Integer, Integer]
  #        end
  #      end
  #    end

  # This class is responsible for lexing the source using prism and then
  # converting those tokens to be compatible with Ripper. In the vast majority
  # of cases, this is a one-to-one mapping of the token type. Everything else
  # generally lines up. However, there are a few cases that require special
  # handling.
  #
  # Every token handled here is a four element array of the form
  # `[[lineno, column], event, value, lex_state]`.
  class LexCompat # :nodoc:
    # @rbs!
    #    # A token produced by the Ripper lexer that Prism is replicating.
    #    type lex_compat_token = [[Integer, Integer], Symbol, String, untyped]

    # A result class specialized for holding tokens produced by the lexer.
    class Result < Prism::Result
      # The list of tokens that were produced by the lexer.
      attr_reader :value #: Array[lex_compat_token]

      # Create a new lex compat result object with the given values. The token
      # list is stored on this object, every other argument is forwarded
      # unchanged to the superclass.
      #--
      #: (Array[lex_compat_token] value, Array[Comment] comments, Array[MagicComment] magic_comments, Location? data_loc, Array[ParseError] errors, Array[ParseWarning] warnings, bool continuable, Source source) -> void
      def initialize(value, comments, magic_comments, data_loc, errors, warnings, continuable, source)
        @value = value
        super(comments, magic_comments, data_loc, errors, warnings, continuable, source)
      end

      # Implement the hash pattern matching interface for Result. The hash
      # returned by the superclass is extended with the `:value` key.
      #--
      #: (Array[Symbol]? keys) -> Hash[Symbol, untyped]
      def deconstruct_keys(keys) # :nodoc:
        super.merge!(value: value)
      end
    end

    # This is a mapping of prism token types to Ripper token types. This is a
    # many-to-one mapping because we split up our token types, whereas Ripper
    # tends to group them.
    RIPPER = {
      AMPERSAND: :on_op,
      AMPERSAND_AMPERSAND: :on_op,
      AMPERSAND_AMPERSAND_EQUAL: :on_op,
      AMPERSAND_DOT: :on_op,
      AMPERSAND_EQUAL: :on_op,
      BACK_REFERENCE: :on_backref,
      BACKTICK: :on_backtick,
      BANG: :on_op,
      BANG_EQUAL: :on_op,
      BANG_TILDE: :on_op,
      BRACE_LEFT: :on_lbrace,
      BRACE_RIGHT: :on_rbrace,
      BRACKET_LEFT: :on_lbracket,
      BRACKET_LEFT_ARRAY: :on_lbracket,
      BRACKET_LEFT_RIGHT: :on_op,
      BRACKET_LEFT_RIGHT_EQUAL: :on_op,
      BRACKET_RIGHT: :on_rbracket,
      CARET: :on_op,
      CARET_EQUAL: :on_op,
      CHARACTER_LITERAL: :on_CHAR,
      CLASS_VARIABLE: :on_cvar,
      COLON: :on_op,
      COLON_COLON: :on_op,
      COMMA: :on_comma,
      COMMENT: :on_comment,
      CONSTANT: :on_const,
      DOT: :on_period,
      DOT_DOT: :on_op,
      DOT_DOT_DOT: :on_op,
      EMBDOC_BEGIN: :on_embdoc_beg,
      EMBDOC_END: :on_embdoc_end,
      EMBDOC_LINE: :on_embdoc,
      EMBEXPR_BEGIN: :on_embexpr_beg,
      EMBEXPR_END: :on_embexpr_end,
      EMBVAR: :on_embvar,
      EOF: :on_eof,
      EQUAL: :on_op,
      EQUAL_EQUAL: :on_op,
      EQUAL_EQUAL_EQUAL: :on_op,
      EQUAL_GREATER: :on_op,
      EQUAL_TILDE: :on_op,
      FLOAT: :on_float,
      FLOAT_IMAGINARY: :on_imaginary,
      FLOAT_RATIONAL: :on_rational,
      FLOAT_RATIONAL_IMAGINARY: :on_imaginary,
      GREATER: :on_op,
      GREATER_EQUAL: :on_op,
      GREATER_GREATER: :on_op,
      GREATER_GREATER_EQUAL: :on_op,
      GLOBAL_VARIABLE: :on_gvar,
      HEREDOC_END: :on_heredoc_end,
      HEREDOC_START: :on_heredoc_beg,
      IDENTIFIER: :on_ident,
      IGNORED_NEWLINE: :on_ignored_nl,
      INTEGER: :on_int,
      INTEGER_IMAGINARY: :on_imaginary,
      INTEGER_RATIONAL: :on_rational,
      INTEGER_RATIONAL_IMAGINARY: :on_imaginary,
      INSTANCE_VARIABLE: :on_ivar,
      INVALID: :INVALID,
      KEYWORD___ENCODING__: :on_kw,
      KEYWORD___LINE__: :on_kw,
      KEYWORD___FILE__: :on_kw,
      KEYWORD_ALIAS: :on_kw,
      KEYWORD_AND: :on_kw,
      KEYWORD_BEGIN: :on_kw,
      KEYWORD_BEGIN_UPCASE: :on_kw,
      KEYWORD_BREAK: :on_kw,
      KEYWORD_CASE: :on_kw,
      KEYWORD_CLASS: :on_kw,
      KEYWORD_DEF: :on_kw,
      KEYWORD_DEFINED: :on_kw,
      KEYWORD_DO: :on_kw,
      KEYWORD_DO_BLOCK: :on_kw,
      KEYWORD_DO_LOOP: :on_kw,
      KEYWORD_ELSE: :on_kw,
      KEYWORD_ELSIF: :on_kw,
      KEYWORD_END: :on_kw,
      KEYWORD_END_UPCASE: :on_kw,
      KEYWORD_ENSURE: :on_kw,
      KEYWORD_FALSE: :on_kw,
      KEYWORD_FOR: :on_kw,
      KEYWORD_IF: :on_kw,
      KEYWORD_IF_MODIFIER: :on_kw,
      KEYWORD_IN: :on_kw,
      KEYWORD_MODULE: :on_kw,
      KEYWORD_NEXT: :on_kw,
      KEYWORD_NIL: :on_kw,
      KEYWORD_NOT: :on_kw,
      KEYWORD_OR: :on_kw,
      KEYWORD_REDO: :on_kw,
      KEYWORD_RESCUE: :on_kw,
      KEYWORD_RESCUE_MODIFIER: :on_kw,
      KEYWORD_RETRY: :on_kw,
      KEYWORD_RETURN: :on_kw,
      KEYWORD_SELF: :on_kw,
      KEYWORD_SUPER: :on_kw,
      KEYWORD_THEN: :on_kw,
      KEYWORD_TRUE: :on_kw,
      KEYWORD_UNDEF: :on_kw,
      KEYWORD_UNLESS: :on_kw,
      KEYWORD_UNLESS_MODIFIER: :on_kw,
      KEYWORD_UNTIL: :on_kw,
      KEYWORD_UNTIL_MODIFIER: :on_kw,
      KEYWORD_WHEN: :on_kw,
      KEYWORD_WHILE: :on_kw,
      KEYWORD_WHILE_MODIFIER: :on_kw,
      KEYWORD_YIELD: :on_kw,
      LABEL: :on_label,
      LABEL_END: :on_label_end,
      LAMBDA_BEGIN: :on_tlambeg,
      LESS: :on_op,
      LESS_EQUAL: :on_op,
      LESS_EQUAL_GREATER: :on_op,
      LESS_LESS: :on_op,
      LESS_LESS_EQUAL: :on_op,
      METHOD_NAME: :on_ident,
      MINUS: :on_op,
      MINUS_EQUAL: :on_op,
      MINUS_GREATER: :on_tlambda,
      NEWLINE: :on_nl,
      NUMBERED_REFERENCE: :on_backref,
      PARENTHESIS_LEFT: :on_lparen,
      PARENTHESIS_LEFT_PARENTHESES: :on_lparen,
      PARENTHESIS_RIGHT: :on_rparen,
      PERCENT: :on_op,
      PERCENT_EQUAL: :on_op,
      PERCENT_LOWER_I: :on_qsymbols_beg,
      PERCENT_LOWER_W: :on_qwords_beg,
      PERCENT_LOWER_X: :on_backtick,
      PERCENT_UPPER_I: :on_symbols_beg,
      PERCENT_UPPER_W: :on_words_beg,
      PIPE: :on_op,
      PIPE_EQUAL: :on_op,
      PIPE_PIPE: :on_op,
      PIPE_PIPE_EQUAL: :on_op,
      PLUS: :on_op,
      PLUS_EQUAL: :on_op,
      QUESTION_MARK: :on_op,
      RATIONAL_FLOAT: :on_rational,
      RATIONAL_INTEGER: :on_rational,
      REGEXP_BEGIN: :on_regexp_beg,
      REGEXP_END: :on_regexp_end,
      SEMICOLON: :on_semicolon,
      SLASH: :on_op,
      SLASH_EQUAL: :on_op,
      STAR: :on_op,
      STAR_EQUAL: :on_op,
      STAR_STAR: :on_op,
      STAR_STAR_EQUAL: :on_op,
      STRING_BEGIN: :on_tstring_beg,
      STRING_CONTENT: :on_tstring_content,
      STRING_END: :on_tstring_end,
      SYMBOL_BEGIN: :on_symbeg,
      TILDE: :on_op,
      UAMPERSAND: :on_op,
      UCOLON_COLON: :on_op,
      UDOT_DOT: :on_op,
      UDOT_DOT_DOT: :on_op,
      UMINUS: :on_op,
      UMINUS_NUM: :on_op,
      UPLUS: :on_op,
      USTAR: :on_op,
      USTAR_STAR: :on_op,
      WORDS_SEP: :on_words_sep,
      "__END__": :on___end__
    }.freeze

    # A heredoc in this case is a list of tokens that belong to the body of the
    # heredoc that should be appended onto the list of tokens when the heredoc
    # closes.
    module Heredoc # :nodoc:
      # Heredocs that are no dash or tilde heredocs are just a list of tokens.
      # We need to keep them around so that we can insert them in the correct
      # order back into the token stream and set the state of the last token to
      # the state that the heredoc was opened in.
      class PlainHeredoc # :nodoc:
        attr_reader :tokens #: Array[lex_compat_token]

        # Start with an empty token list.
        #--
        #: () -> void
        def initialize
          @tokens = []
        end

        # Append a token to the body of this heredoc.
        #--
        #: (lex_compat_token token) -> void
        def <<(token)
          tokens << token
        end

        # Return the collected tokens as-is (the same array, not a copy).
        #--
        #: () -> Array[lex_compat_token]
        def to_a
          tokens
        end
      end

      # Dash heredocs are a little more complicated. They are a list of tokens
      # that need to be split on "\\\n" to mimic Ripper's behavior. We also need
      # to keep track of the state that the heredoc was opened in.
      class DashHeredoc # :nodoc:
        attr_reader :split #: bool
        attr_reader :tokens #: Array[lex_compat_token]

        # +split+ controls whether plain string content is split on line
        # continuations when the tokens are read back through #to_a.
        #--
        #: (bool split) -> void
        def initialize(split)
          @split = split
          @tokens = []
        end

        # Append a token to the body of this heredoc.
        #--
        #: (lex_compat_token token) -> void
        def <<(token)
          tokens << token
        end

        # Return the tokens of this heredoc. When +split+ is set, every
        # on_tstring_content token that is not nested inside an embedded
        # expression is broken into one token per line-continuation segment,
        # with line and column recomputed for each segment.
        #--
        #: () -> Array[lex_compat_token]
        def to_a
          embexpr_balance = 0

          tokens.each_with_object([]) do |token, results| #$ Array[lex_compat_token]
            case token[1]
            when :on_embexpr_beg
              embexpr_balance += 1
              results << token
            when :on_embexpr_end
              embexpr_balance -= 1
              results << token
            when :on_tstring_content
              if embexpr_balance == 0
                lineno = token[0][0]
                column = token[0][1]

                if split
                  # Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind
                  # to keep the delimiter in the result.
                  token[2].split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
                    # Every segment after the first starts at the beginning of
                    # a line.
                    column = 0 if index > 0
                    results << [[lineno, column], :on_tstring_content, value, token[3]]
                    lineno += value.count("\n")
                  end
                else
                  results << token
                end
              else
                results << token
              end
            else
              results << token
            end
          end
        end
      end

      # Heredocs that are dedenting heredocs are a little more complicated.
      # Ripper outputs on_ignored_sp tokens for the whitespace that is being
      # removed from the output. prism only modifies the node itself and keeps
      # the token the same. This simplifies prism, but makes comparing against
      # Ripper much harder because there is a length mismatch.
      #
      # Fortunately, we already have to pull out the heredoc tokens in order to
      # insert them into the stream in the correct order. As such, we can do
      # some extra manipulation on the tokens to make them match Ripper's
      # output by mirroring the dedent logic that Ripper uses.
      class DedentingHeredoc # :nodoc:
        # Width used to round a tab up to the next tab stop when measuring
        # leading whitespace.
        TAB_WIDTH = 8

        attr_reader :tokens #: Array[lex_compat_token]
        attr_reader :dedent_next #: bool
        attr_reader :dedent #: Integer?
        attr_reader :embexpr_balance #: Integer
        # @rbs @ended_on_newline: bool

        # Start with no tokens and no known dedent. +dedent+ stays nil until a
        # non-blank line of plain string content has been measured.
        #--
        #: () -> void
        def initialize
          @tokens = []
          @dedent_next = true
          @dedent = nil
          @embexpr_balance = 0
          @ended_on_newline = false
        end

        # As tokens are coming in, we track the minimum amount of common leading
        # whitespace on plain string content tokens. This allows us to later
        # remove that amount of whitespace from the beginning of each line.
        #
        #: (lex_compat_token token) -> void
        def <<(token)
          case token[1]
          when :on_embexpr_beg, :on_heredoc_beg
            @embexpr_balance += 1
            # An interpolation that starts right at the beginning of a line
            # means that line has no leading whitespace at all.
            @dedent = 0 if @dedent_next && @ended_on_newline
          when :on_embexpr_end, :on_heredoc_end
            @embexpr_balance -= 1
          when :on_tstring_content
            if embexpr_balance == 0
              line = token[2]

              # Lines that are entirely whitespace and end in a newline do not
              # contribute to the common leading whitespace.
              if dedent_next && !(line.strip.empty? && line.end_with?("\n"))
                leading = line[/\A(\s*)\n?/, 1] #: String
                next_dedent = 0

                leading.each_char do |char|
                  if char == "\t"
                    # Round up to the next multiple of TAB_WIDTH.
                    next_dedent = next_dedent - (next_dedent % TAB_WIDTH) + TAB_WIDTH
                  else
                    next_dedent += 1
                  end
                end

                @dedent = [dedent, next_dedent].compact.min
                @dedent_next = true
                @ended_on_newline = line.end_with?("\n")
                tokens << token
                return
              end
            end
          end

          @dedent_next = token[1] == :on_tstring_content && embexpr_balance == 0
          @ended_on_newline = false
          tokens << token
        end

        # Return the tokens of this heredoc, rewritten according to the dedent
        # that was measured while the tokens were appended:
        #
        # * no dedent measured (nil): plain string content is split into one
        #   token per line
        # * dedent of 0: adjacent plain string content tokens are concatenated
        # * otherwise: on_ignored_sp tokens are inserted for the removed
        #   leading whitespace, which is also stripped from the content
        #
        # Note that the dedent of 0 case appends to the value strings of the
        # stored tokens in place.
        #--
        #: () -> Array[lex_compat_token]
        def to_a
          # If every line in the heredoc is blank, we still need to split up the
          # string content token into multiple tokens.
          if dedent.nil?
            results = [] #: Array[lex_compat_token]
            embexpr_balance = 0

            tokens.each do |token|
              case token[1]
              when :on_embexpr_beg, :on_heredoc_beg
                embexpr_balance += 1
                results << token
              when :on_embexpr_end, :on_heredoc_end
                embexpr_balance -= 1
                results << token
              when :on_tstring_content
                if embexpr_balance == 0
                  lineno = token[0][0]
                  column = token[0][1]

                  token[2].split(/(?<=\n)/).each_with_index do |value, index|
                    column = 0 if index > 0
                    results << [[lineno, column], :on_tstring_content, value, token[3]]
                    lineno += 1
                  end
                else
                  results << token
                end
              else
                results << token
              end
            end

            return results
          end

          # If the minimum common whitespace is 0, then we need to concatenate
          # string nodes together that are immediately adjacent.
          if dedent == 0
            results = [] #: Array[lex_compat_token]
            embexpr_balance = 0

            index = 0
            max_index = tokens.length

            while index < max_index
              token = tokens[index]
              results << token
              index += 1

              case token[1]
              when :on_embexpr_beg, :on_heredoc_beg
                embexpr_balance += 1
              when :on_embexpr_end, :on_heredoc_end
                embexpr_balance -= 1
              when :on_tstring_content
                if embexpr_balance == 0
                  # Absorb the following string content tokens, stopping once
                  # the accumulated value ends in a line continuation.
                  while index < max_index && tokens[index][1] == :on_tstring_content && !token[2].match?(/\\\r?\n\z/)
                    token[2] << tokens[index][2]
                    index += 1
                  end
                end
              end
            end

            return results
          end

          # Otherwise, we're going to run through each token in the list and
          # insert on_ignored_sp tokens for the amount of dedent that we need to
          # perform. We also need to remove the dedent from the beginning of
          # each line of plain string content tokens.
          #
          # Because of the two early returns above, dedent is always a positive
          # integer from here on.
          results = [] #: Array[lex_compat_token]
          dedent_next = true
          embexpr_balance = 0

          tokens.each do |token|
            # Notice that the structure of this conditional largely matches the
            # whitespace calculation we performed above. This is because
            # checking if the subsequent token needs to be dedented is common to
            # both the dedent calculation and the ignored_sp insertion.
            case token[1]
            when :on_embexpr_beg
              embexpr_balance += 1
              results << token
            when :on_embexpr_end
              embexpr_balance -= 1
              results << token
            when :on_tstring_content
              if embexpr_balance == 0
                # Here we're going to split the string on newlines, but maintain
                # the newlines in the resulting array. We'll do that with a look
                # behind assertion.
                splits = token[2].split(/(?<=\n)/)
                index = 0

                while index < splits.length
                  line = splits[index]
                  lineno = token[0][0] + index
                  column = token[0][1]

                  # If we are supposed to dedent this line or if this is not the
                  # first line of the string, then the line starts at column 0
                  # and we need to insert an on_ignored_sp token and remove the
                  # dedent from the beginning of the line.
                  if dedent_next || index > 0
                    column = 0
                    deleting = 0
                    deleted_chars = [] #: Array[String]

                    # Gather up all of the characters that we're going to
                    # delete, stopping when you hit a character that would put
                    # you over the dedent amount.
                    line.each_char.with_index do |char, i|
                      case char
                      when "\r"
                        if line[i + 1] == "\n"
                          break
                        end
                      when "\n"
                        break
                      when "\t"
                        deleting = deleting - (deleting % TAB_WIDTH) + TAB_WIDTH
                      else
                        deleting += 1
                      end

                      break if deleting > dedent
                      deleted_chars << char
                    end

                    # If we have something to delete, then delete it from the
                    # string and insert an on_ignored_sp token.
                    if deleted_chars.any?
                      ignored = deleted_chars.join
                      line.delete_prefix!(ignored)

                      results << [[lineno, 0], :on_ignored_sp, ignored, token[3]]
                      column = ignored.length
                    end
                  end

                  results << [[lineno, column], token[1], line, token[3]] unless line.empty?
                  index += 1
                end
              else
                results << token
              end
            else
              results << token
            end

            dedent_next =
              ((token[1] == :on_tstring_content) || (token[1] == :on_heredoc_end)) &&
              embexpr_balance == 0
          end

          results
        end
      end

      # Here we will split between the two types of heredocs and return the
      # object that will store their tokens. The decision is made from the
      # value of the opening token: its third character selects the kind of
      # heredoc, and for dash heredocs the fourth character being a single
      # quote disables splitting.
      #--
      #: (lex_compat_token opening) -> (PlainHeredoc | DashHeredoc | DedentingHeredoc)
      def self.build(opening)
        case opening[2][2]
        when "~"
          DedentingHeredoc.new
        when "-"
          DashHeredoc.new(opening[2][3] != "'")
        else
          PlainHeredoc.new
        end
      end
    end

    private_constant :Heredoc

    # In previous versions of Ruby, Ripper wouldn't flush the bom before the
    # first token, so we had to have a hack in place to account for that.
    BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
    private_constant :BOM_FLUSHED

    attr_reader :options #: Hash[Symbol, untyped]
    # @rbs @source: String

    # Store the source to lex and the options that are forwarded to Prism.lex
    # when #result is called.
    #--
    #: (String source, **untyped options) -> void
    def initialize(source, **options)
      @source = source
      @options = options
    end

    # Lex the source with prism and convert the resulting tokens into the
    # shape and order that Ripper produces. Returns a Result whose value is
    # the converted token list and whose remaining fields are taken from the
    # prism lex result.
    #--
    #: () -> Result
    def result
      tokens = [] #: Array[lex_compat_token]

      state = :default
      heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]

      result = Prism.lex(@source, **options)
      source = result.source
      result_value = result.value
      previous_state = nil #: Translation::Ripper::Lexer::State?
      last_heredoc_end = nil #: Integer?
      eof_token = nil #: Token?

      bom = source.slice(0, 3) == "\xEF\xBB\xBF"

      result_value.each_with_index do |(prism_token, prism_state), index|
        lineno = prism_token.location.start_line
        column = prism_token.location.start_column

        event = RIPPER.fetch(prism_token.type)
        value = prism_token.value
        lex_state = Translation::Ripper::Lexer::State[prism_state]

        # If there's a UTF-8 byte-order mark as the start of the file, then for
        # certain tokens ripper sets the first token back by 3 bytes. It also
        # keeps the byte order mark in the first token's value. This is weird,
        # and I don't want to mirror that in our parser. So instead, we'll match
        # up the columns and values here.
        if bom && lineno == 1
          column -= 3

          if index == 0 && column == 0 && !BOM_FLUSHED
            flushed =
              case prism_token.type
              when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
                :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
                :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
                :PERCENT_UPPER_W, :STRING_BEGIN
                true
              when :REGEXP_BEGIN, :SYMBOL_BEGIN
                value.start_with?("%")
              else
                false
              end

            unless flushed
              column -= 3
              value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
            end
          end
        end

        lex_compat_token =
          case event
          when :on___end__
            # Ripper doesn't include the rest of the token in the event, so we need to
            # trim it down to just the content on the first line.
            value = value[0..value.index("\n")] #: String
            [[lineno, column], event, value, lex_state]
          when :on_comment
            [[lineno, column], event, value, lex_state]
          when :on_heredoc_end
            # Heredoc end tokens can be emitted in an odd order, so we don't
            # want to bother comparing the state on them.
            last_heredoc_end = prism_token.location.end_offset
            [[lineno, column], event, value, lex_state]
          when :on_embexpr_end
            [[lineno, column], event, value, lex_state]
          when :on_words_sep
            # Ripper emits one token each per line. Every line is pushed onto
            # the token list and the last one is popped back off so that it
            # flows through the regular handling below.
            value.each_line.with_index do |line, line_index|
              if line_index > 0
                lineno += 1
                column = 0
              end
              tokens << [[lineno, column], event, line, lex_state]
            end
            tokens.pop #: lex_compat_token
          when :on_regexp_end
            # On regex end, Ripper scans and then sets end state, so the ripper
            # lexed output is begin, when it should be end. prism sets lex state
            # correctly to end state, but we want to be able to compare against
            # Ripper's lexed state. So here, if it's a regexp end token, we
            # output the state as the previous state, solely for the sake of
            # comparison.
            previous_token = result_value[index - 1][0]
            lex_state =
              if RIPPER.fetch(previous_token.type) == :on_embexpr_end
                # If the previous token is embexpr_end, then we have to do even
                # more processing. The end of an embedded expression sets the
                # state to the state that it had at the beginning of the
                # embedded expression. So we have to go and find that state and
                # set it here.
                counter = 1
                current_index = index - 1

                until counter == 0
                  current_index -= 1
                  current_event = RIPPER.fetch(result_value[current_index][0].type)
                  counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
                end

                Translation::Ripper::Lexer::State[result_value[current_index][1]]
              else
                previous_state
              end

            [[lineno, column], event, value, lex_state]
          when :on_eof
            eof_token = prism_token
            previous_token = result_value[index - 1][0]

            # If we're at the end of the file and the previous token was a
            # comment and there is still whitespace after the comment, then
            # Ripper will append an on_nl token (even though there isn't
            # necessarily a newline). We mirror that here.
            if previous_token.type == :COMMENT
              # If the comment is at the start of a heredoc: <<HEREDOC # comment
              # then the comment's end_offset is up near the heredoc_beg.
              # This is not the correct offset to use for figuring out if
              # there is trailing whitespace after the last token.
              # Use the greater offset of the two to determine the start of
              # the trailing whitespace.
              start_offset = [previous_token.location.end_offset, last_heredoc_end].compact.max
              end_offset = prism_token.location.start_offset

              if start_offset < end_offset
                if bom
                  start_offset += 3
                  end_offset += 3
                end

                tokens << [[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state]
              end
            end

            [[lineno, column], event, value, lex_state]
          else
            [[lineno, column], event, value, lex_state]
          end #: lex_compat_token

        previous_state = lex_state

        # The order in which tokens appear in our lexer is different from the
        # order that they appear in Ripper. When we hit the declaration of a
        # heredoc in prism, we skip forward and lex the rest of the content of
        # the heredoc before going back and lexing at the end of the heredoc
        # identifier.
        #
        # To match up to ripper, we keep a small state variable around here to
        # track whether we're in the middle of a heredoc or not. In this way we
        # can shuffle around the token to match Ripper's output.
        case state
        when :default
          # The default state is when there are no heredocs at all. In this
          # state we can append the token to the list of tokens and move on.
          tokens << lex_compat_token

          # If we get the declaration of a heredoc, then we open a new heredoc
          # and move into the heredoc_opened state.
          if event == :on_heredoc_beg
            state = :heredoc_opened
            heredoc_stack.last << Heredoc.build(lex_compat_token)
          end
        when :heredoc_opened
          # The heredoc_opened state is when we've seen the declaration of a
          # heredoc and are now lexing the body of the heredoc. In this state we
          # push tokens onto the most recently created heredoc.
          heredoc_stack.last.last << lex_compat_token

          case event
          when :on_heredoc_beg
            # If we receive a heredoc declaration while lexing the body of a
            # heredoc, this means we have nested heredocs. In this case we'll
            # push a new heredoc onto the stack and stay in the heredoc_opened
            # state since we're now lexing the body of the new heredoc.
            heredoc_stack << [Heredoc.build(lex_compat_token)]
          when :on_heredoc_end
            # If we receive the end of a heredoc, then we're done lexing the
            # body of the heredoc. In this case we now have a completed heredoc
            # but need to wait for the next newline to push it into the token
            # stream.
            state = :heredoc_closed
          end
        when :heredoc_closed
          # The heredoc_closed state is when a heredoc body has been fully
          # collected and we are waiting for the end of the line on which it
          # was declared before flushing it.
          if %i[on_nl on_ignored_nl on_comment].include?(event) || ((event == :on_tstring_content) && value.end_with?("\n"))
            if heredoc_stack.size > 1
              # A nested level is complete: move its tokens into the heredoc
              # that encloses it and keep lexing that enclosing body.
              flushing = heredoc_stack.pop #: Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]
              heredoc_stack.last.last << lex_compat_token

              flushing.each do |heredoc|
                heredoc.to_a.each do |flushed_token|
                  heredoc_stack.last.last << flushed_token
                end
              end

              state = :heredoc_opened
              next
            end
          elsif event == :on_heredoc_beg
            # Another heredoc declared on the same line.
            tokens << lex_compat_token
            state = :heredoc_opened
            heredoc_stack.last << Heredoc.build(lex_compat_token)
            next
          elsif heredoc_stack.size > 1
            heredoc_stack[-2].last << lex_compat_token
            next
          end

          heredoc_stack.last.each do |heredoc|
            tokens.concat(heredoc.to_a)
          end

          heredoc_stack.last.clear
          state = :default

          tokens << lex_compat_token
        end
      end

      # Drop the EOF token from the list. The EOF token may not be
      # present if the source was syntax invalid
      if tokens.dig(-1, 1) == :on_eof
        tokens = tokens[0...-1] #: Array[lex_compat_token]
      end

      # We sort by location because Ripper.lex sorts.
      tokens.sort_by! do |token|
        line, column = token[0]
        source.byte_offset(line, column)
      end

      tokens = post_process_tokens(tokens, source, result.data_loc, bom, eof_token)

      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, result.continuable?, source)
    end

    private

    # Walk the sorted token list and build the final list: heredoc end tokens
    # with an empty value are dropped, and an :on_sp token is inserted for
    # every gap between the end of one token and the start of the next. When
    # there is no data_loc and an EOF token was seen, a final :on_sp token
    # covers whatever remains up to the end of the EOF token.
    #--
    #: (Array[lex_compat_token] tokens, Source source, Location? data_loc, bool bom, Token? eof_token) -> Array[lex_compat_token]
    def post_process_tokens(tokens, source, data_loc, bom, eof_token)
      new_tokens = [] #: Array[lex_compat_token]

      prev_token_state = Translation::Ripper::Lexer::State[Translation::Ripper::EXPR_BEG]
      prev_token_end = bom ? 3 : 0

      cache = Translation::Ripper::LineAndColumnCache.new(source)

      tokens.each do |token|
        # Skip missing heredoc ends.
        next if token[1] == :on_heredoc_end && token[2] == ""

        # Add :on_sp tokens.
        line, column = token[0]
        start_offset = source.byte_offset(line, column)

        # Ripper reports columns on line 1 without counting the BOM, so we
        # adjust to get the real offset
        start_offset += 3 if line == 1 && bom

        if start_offset > prev_token_end
          sp_value = source.slice(prev_token_end, start_offset - prev_token_end)
          sp_line, sp_column = cache.line_and_column(prev_token_end)
          # Ripper reports columns on line 1 without counting the BOM
          sp_column -= 3 if sp_line == 1 && bom
          continuation_index = sp_value.byteindex("\\")

          # ripper emits up to three :on_sp tokens when line continuations are used
          if continuation_index
            # The continuation token covers the backslash, an optional "\r"
            # and the character that follows.
            next_whitespace_index = continuation_index + 1
            next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r"
            next_whitespace_index += 1
            first_whitespace = sp_value[0...continuation_index] #: String
            continuation = sp_value[continuation_index...next_whitespace_index] #: String
            second_whitespace = sp_value[next_whitespace_index..] || ""

            new_tokens << [[sp_line, sp_column], :on_sp, first_whitespace, prev_token_state] unless first_whitespace.empty?
            new_tokens << [[sp_line, sp_column + continuation_index], :on_sp, continuation, prev_token_state]
            new_tokens << [[sp_line + 1, 0], :on_sp, second_whitespace, prev_token_state] unless second_whitespace.empty?
          else
            new_tokens << [[sp_line, sp_column], :on_sp, sp_value, prev_token_state]
          end
        end

        new_tokens << token
        prev_token_state = token[3]
        prev_token_end = start_offset + token[2].bytesize
      end

      if !data_loc && eof_token # no trailing :on_sp with __END__ as it is always preceded by :on_nl
        end_offset = eof_token.location.end_offset
        if prev_token_end < end_offset
          # NOTE(review): unlike the :on_sp tokens emitted in the loop above,
          # the position reported here is not adjusted for a BOM on line 1.
          # Confirm against Ripper for BOM-prefixed sources.
          new_tokens << [
            [source.line(prev_token_end), source.column(prev_token_end)],
            :on_sp,
            source.slice(prev_token_end, end_offset - prev_token_end),
            prev_token_state
          ]
        end
      end

      new_tokens
    end
  end

  private_constant :LexCompat
end
