-rw-r--r--  lib/prism/lex_compat.rb          | 150
-rw-r--r--  lib/prism/translation/ripper.rb  |   2
-rw-r--r--  test/prism/ruby/ripper_test.rb   |  14
3 files changed, 46 insertions, 120 deletions
diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb
index c23adda241..4c516a9de0 100644
--- a/lib/prism/lex_compat.rb
+++ b/lib/prism/lex_compat.rb
@@ -196,57 +196,6 @@ module Prism
       "__END__": :on___end__
     }.freeze
 
-    # When we produce tokens, we produce the same arrays that Ripper does.
-    # However, we add a couple of convenience methods onto them to make them a
-    # little easier to work with. We delegate all other methods to the array.
-    class Token < BasicObject
-      # Create a new token object with the given ripper-compatible array.
-      def initialize(array)
-        @array = array
-      end
-
-      # The location of the token in the source.
-      def location
-        @array[0]
-      end
-
-      # The type of the token.
-      def event
-        @array[1]
-      end
-
-      # The slice of the source that this token represents.
-      def value
-        @array[2]
-      end
-
-      # The state of the lexer when this token was produced.
-      def state
-        @array[3]
-      end
-
-      # We want to pretend that this is just an Array.
-      def ==(other) # :nodoc:
-        @array == other
-      end
-
-      def respond_to_missing?(name, include_private = false) # :nodoc:
-        @array.respond_to?(name, include_private)
-      end
-
-      def method_missing(name, ...) # :nodoc:
-        @array.send(name, ...)
-      end
-    end
-
-    # Tokens where state should be ignored
-    # used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end
-    class IgnoreStateToken < Token
-      def ==(other) # :nodoc:
-        self[0...-1] == other[0...-1]
-      end
-    end
-
     # A heredoc in this case is a list of tokens that belong to the body of the
     # heredoc that should be appended onto the list of tokens when the heredoc
     # closes.
@@ -290,7 +239,7 @@ module Prism
           embexpr_balance = 0
 
           tokens.each_with_object([]) do |token, results| #$ Array[Token]
-            case token.event
+            case token[1]
             when :on_embexpr_beg
               embexpr_balance += 1
               results << token
@@ -305,9 +254,9 @@ module Prism
                 if split
                   # Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind
                   # to keep the delimiter in the result.
-                  token.value.split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
+                  token[2].split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
                     column = 0 if index > 0
-                    results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
+                    results << [[lineno, column], :on_tstring_content, value, token[3]]
                     lineno += value.count("\n")
                   end
                 else
@@ -350,7 +299,7 @@ module Prism
         # whitespace on plain string content tokens. This allows us to later
         # remove that amount of whitespace from the beginning of each line.
         def <<(token)
-          case token.event
+          case token[1]
           when :on_embexpr_beg, :on_heredoc_beg
             @embexpr_balance += 1
             @dedent = 0 if @dedent_next && @ended_on_newline
@@ -358,7 +307,7 @@ module Prism
             @embexpr_balance -= 1
           when :on_tstring_content
             if embexpr_balance == 0
-              line = token.value
+              line = token[2]
 
               if dedent_next && !(line.strip.empty? && line.end_with?("\n"))
                 leading = line[/\A(\s*)\n?/, 1]
@@ -381,7 +330,7 @@ module Prism
             end
           end
 
-          @dedent_next = token.event == :on_tstring_content && embexpr_balance == 0
+          @dedent_next = token[1] == :on_tstring_content && embexpr_balance == 0
           @ended_on_newline = false
           tokens << token
         end
@@ -394,7 +343,7 @@ module Prism
           embexpr_balance = 0
 
           tokens.each do |token|
-            case token.event
+            case token[1]
             when :on_embexpr_beg, :on_heredoc_beg
               embexpr_balance += 1
               results << token
@@ -406,9 +355,9 @@ module Prism
                 lineno = token[0][0]
                 column = token[0][1]
 
-                token.value.split(/(?<=\n)/).each_with_index do |value, index|
+                token[2].split(/(?<=\n)/).each_with_index do |value, index|
                   column = 0 if index > 0
-                  results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
+                  results << [[lineno, column], :on_tstring_content, value, token[3]]
                   lineno += 1
                 end
               else
@@ -436,15 +385,15 @@ module Prism
             results << token
             index += 1
 
-            case token.event
+            case token[1]
             when :on_embexpr_beg, :on_heredoc_beg
               embexpr_balance += 1
             when :on_embexpr_end, :on_heredoc_end
               embexpr_balance -= 1
             when :on_tstring_content
               if embexpr_balance == 0
-                while index < max_index && tokens[index].event == :on_tstring_content && !token.value.match?(/\\\r?\n\z/)
-                  token.value << tokens[index].value
+                while index < max_index && tokens[index][1] == :on_tstring_content && !token[2].match?(/\\\r?\n\z/)
+                  token[2] << tokens[index][2]
                   index += 1
                 end
               end
@@ -467,7 +416,7 @@ module Prism
             # whitespace calculation we performed above. This is because
             # checking if the subsequent token needs to be dedented is common to
             # both the dedent calculation and the ignored_sp insertion.
-            case token.event
+            case token[1]
             when :on_embexpr_beg
               embexpr_balance += 1
               results << token
@@ -479,7 +428,7 @@ module Prism
                 # Here we're going to split the string on newlines, but maintain
                 # the newlines in the resulting array. We'll do that with a look
                 # behind assertion.
-                splits = token.value.split(/(?<=\n)/)
+                splits = token[2].split(/(?<=\n)/)
                 index = 0
 
                 while index < splits.length
@@ -536,12 +485,12 @@ module Prism
                       ignored = deleted_chars.join
                       line.delete_prefix!(ignored)
 
-                      results << Token.new([[lineno, 0], :on_ignored_sp, ignored, token[3]])
+                      results << [[lineno, 0], :on_ignored_sp, ignored, token[3]]
                       column = ignored.length
                     end
                   end
 
-                  results << Token.new([[lineno, column], token[1], line, token[3]]) unless line.empty?
+                  results << [[lineno, column], token[1], line, token[3]] unless line.empty?
                   index += 1
                 end
               else
@@ -552,7 +501,7 @@ module Prism
             end
 
             dedent_next =
-              ((token.event == :on_tstring_content) || (token.event == :on_heredoc_end)) &&
+              ((token[1] == :on_tstring_content) || (token[1] == :on_heredoc_end)) &&
                 embexpr_balance == 0
           end
 
@@ -563,11 +512,11 @@ module Prism
       # Here we will split between the two types of heredocs and return the
      # object that will store their tokens.
      def self.build(opening)
-        case opening.value[2]
+        case opening[2][2]
        when "~"
          DedentingHeredoc.new
        when "-"
-          DashHeredoc.new(opening.value[3] != "'")
+          DashHeredoc.new(opening[2][3] != "'")
        else
          PlainHeredoc.new
        end
@@ -647,16 +596,16 @@ module Prism
             # Ripper doesn't include the rest of the token in the event, so we need to
             # trim it down to just the content on the first line.
             value = value[0..value.index("\n")]
-            Token.new([[lineno, column], event, value, lex_state])
+            [[lineno, column], event, value, lex_state]
           when :on_comment
-            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
+            [[lineno, column], event, value, lex_state]
           when :on_heredoc_end
             # Heredoc end tokens can be emitted in an odd order, so we don't
             # want to bother comparing the state on them.
             last_heredoc_end = token.location.end_offset
-            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
+            [[lineno, column], event, value, lex_state]
           when :on_embexpr_end
-            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
+            [[lineno, column], event, value, lex_state]
           when :on_words_sep
             # Ripper emits one token each per line.
             value.each_line.with_index do |line, index|
@@ -664,7 +613,7 @@ module Prism
                 lineno += 1
                 column = 0
               end
-              tokens << Token.new([[lineno, column], event, line, lex_state])
+              tokens << [[lineno, column], event, line, lex_state]
             end
             tokens.pop
           when :on_regexp_end
@@ -696,7 +645,7 @@ module Prism
                 previous_state
               end
 
-            Token.new([[lineno, column], event, value, lex_state])
+            [[lineno, column], event, value, lex_state]
           when :on_eof
             eof_token = token
             previous_token = result_value[index - 1][0]
@@ -721,13 +670,13 @@ module Prism
                   end_offset += 3
                 end
 
-                tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state])
+                tokens << [[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state]
               end
             end
 
-            Token.new([[lineno, column], event, value, lex_state])
+            [[lineno, column], event, value, lex_state]
           else
-            Token.new([[lineno, column], event, value, lex_state])
+            [[lineno, column], event, value, lex_state]
           end
 
         previous_state = lex_state
@@ -813,9 +762,8 @@ module Prism
       tokens = tokens[0...-1]
 
       # We sort by location because Ripper.lex sorts.
-      # Manually implemented instead of `sort_by!(&:location)` for performance.
       tokens.sort_by! do |token|
-        line, column = token.location
+        line, column = token[0]
         source.byte_offset(line, column)
       end
 
@@ -834,7 +782,7 @@ module Prism
       prev_token_end = bom ? 3 : 0
 
       tokens.each do |token|
-        line, column = token.location
+        line, column = token[0]
         start_offset = source.byte_offset(line, column)
 
         # Ripper reports columns on line 1 without counting the BOM, so we
@@ -858,50 +806,28 @@ module Prism
             continuation = sp_value[continuation_index...next_whitespace_index]
             second_whitespace = sp_value[next_whitespace_index..]
 
-            new_tokens << IgnoreStateToken.new([
-              [sp_line, sp_column],
-              :on_sp,
-              first_whitespace,
-              prev_token_state
-            ]) unless first_whitespace.empty?
-
-            new_tokens << IgnoreStateToken.new([
-              [sp_line, sp_column + continuation_index],
-              :on_sp,
-              continuation,
-              prev_token_state
-            ])
-
-            new_tokens << IgnoreStateToken.new([
-              [sp_line + 1, 0],
-              :on_sp,
-              second_whitespace,
-              prev_token_state
-            ]) unless second_whitespace.empty?
+            new_tokens << [[sp_line, sp_column], :on_sp, first_whitespace, prev_token_state] unless first_whitespace.empty?
+            new_tokens << [[sp_line, sp_column + continuation_index], :on_sp, continuation, prev_token_state]
+            new_tokens << [[sp_line + 1, 0], :on_sp, second_whitespace, prev_token_state] unless second_whitespace.empty?
          else
-            new_tokens << IgnoreStateToken.new([
-              [sp_line, sp_column],
-              :on_sp,
-              sp_value,
-              prev_token_state
-            ])
+            new_tokens << [[sp_line, sp_column], :on_sp, sp_value, prev_token_state]
          end
        end
 
        new_tokens << token
-        prev_token_state = token.state
-        prev_token_end = start_offset + token.value.bytesize
+        prev_token_state = token[3]
+        prev_token_end = start_offset + token[2].bytesize
      end
 
      unless data_loc
        # no trailing :on_sp with __END__ as it is always preceded by :on_nl
        end_offset = eof_token.location.end_offset
        if prev_token_end < end_offset
-          new_tokens << IgnoreStateToken.new([
+          new_tokens << [
            [source.line(prev_token_end), source.column(prev_token_end)],
            :on_sp,
            source.slice(prev_token_end, end_offset - prev_token_end),
            prev_token_state
-          ])
+          ]
        end
      end
diff --git a/lib/prism/translation/ripper.rb b/lib/prism/translation/ripper.rb
index ccce226d7d..054ad88ce3 100644
--- a/lib/prism/translation/ripper.rb
+++ b/lib/prism/translation/ripper.rb
@@ -88,7 +88,7 @@ module Prism
       #   # => ["def", " ", "m", "(", "a", ")", " ", "nil", " ", "end"]
       #
       def self.tokenize(...)
-        lex(...).map(&:value)
+        lex(...).map { |token| token[2] }
       end
 
       # This contains a table of all of the parser events and their
diff --git a/test/prism/ruby/ripper_test.rb b/test/prism/ruby/ripper_test.rb
index c8d259135f..a89a9503b9 100644
--- a/test/prism/ruby/ripper_test.rb
+++ b/test/prism/ruby/ripper_test.rb
@@ -136,7 +136,7 @@ module Prism
       assert_equal(expected, lexer.parse[0].to_a)
       assert_equal(lexer.parse[0].to_a, lexer.scan[0].to_a)
 
-      assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
+      assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map { |token| token[1] })
 
      assert_raise(SyntaxError) { Translation::Ripper::Lexer.new("1 +").lex(raise_errors: true) }
    end
@@ -169,13 +169,13 @@ module Prism
       # Prism emits tokens by their order in the code, not in parse order
       ripper.sort_by! { |elem| elem[0] }
 
-      [prism.size, ripper.size].max.times do |i|
-        expected = ripper[i]
-        actual = prism[i]
+      [prism.size, ripper.size].max.times do |index|
+        expected = ripper[index]
+        actual = prism[index]
 
-        # Since tokens related to heredocs are not emitted in the same order,
-        # the state also doesn't line up.
-        if expected && actual && expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
+        # There are some tokens that have slightly different state that do not
+        # effect the parse tree, so they may not match.
+        if expected && actual && expected[1] == actual[1] && %i[on_comment on_heredoc_end on_embexpr_end on_sp].include?(expected[1])
           expected[3] = actual[3] = nil
         end
 
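For illustration only: with the Token and IgnoreStateToken wrappers removed, every lexed token is the plain Ripper-style array [[lineno, column], event, value, state], and callers index into it (token[1] for the event, token[2] for the value) just as the updated code above does. The sketch below is a hypothetical consumer, assuming Prism::Translation::Ripper.lex as the entry point (the method the patched tokenize delegates to); the print_tokens helper and the "1 + 2" input are made up for the example.

    require "prism"

    # Each lexed token is a plain Ripper-compatible array:
    #   [[lineno, column], event, value, state]
    def print_tokens(source)
      Prism::Translation::Ripper.lex(source).each do |token|
        location = token[0] # => [lineno, column]
        puts "#{location[0]}:#{location[1]} #{token[1]} #{token[2].inspect}"
      end
    end

    print_tokens("1 + 2")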
