summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Kevin Newton <kddnewton@gmail.com> 2026-01-28 09:12:51 -0500
committer git <svn-admin@ruby-lang.org> 2026-01-28 14:58:55 +0000
commit 01ace0655ed84708f0afdcc74fb779e680bfc4e0 (patch)
tree a63b6f791e2b56213a81c16ea282dfcd71e1f1f3
parent 3b22e32fa50c2c18663be87dad4d11a266954773 (diff)
[ruby/prism] Remove tokens from lex compat
Instead of having custom token classes, use plain arrays, and track in the test which token types should have their state ignored when comparing.
https://github.com/ruby/prism/commit/a333b56ada
-rw-r--r-- lib/prism/lex_compat.rb 150
-rw-r--r-- lib/prism/translation/ripper.rb 2
-rw-r--r-- test/prism/ruby/ripper_test.rb 14
3 files changed, 46 insertions, 120 deletions
diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb
index c23adda241..4c516a9de0 100644
--- a/lib/prism/lex_compat.rb
+++ b/lib/prism/lex_compat.rb
@@ -196,57 +196,6 @@ module Prism
"__END__": :on___end__
}.freeze
- # When we produce tokens, we produce the same arrays that Ripper does.
- # However, we add a couple of convenience methods onto them to make them a
- # little easier to work with. We delegate all other methods to the array.
- class Token < BasicObject
- # Create a new token object with the given ripper-compatible array.
- def initialize(array)
- @array = array
- end
-
- # The location of the token in the source.
- def location
- @array[0]
- end
-
- # The type of the token.
- def event
- @array[1]
- end
-
- # The slice of the source that this token represents.
- def value
- @array[2]
- end
-
- # The state of the lexer when this token was produced.
- def state
- @array[3]
- end
-
- # We want to pretend that this is just an Array.
- def ==(other) # :nodoc:
- @array == other
- end
-
- def respond_to_missing?(name, include_private = false) # :nodoc:
- @array.respond_to?(name, include_private)
- end
-
- def method_missing(name, ...) # :nodoc:
- @array.send(name, ...)
- end
- end
-
- # Tokens where state should be ignored
- # used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end
- class IgnoreStateToken < Token
- def ==(other) # :nodoc:
- self[0...-1] == other[0...-1]
- end
- end
-
# A heredoc in this case is a list of tokens that belong to the body of the
# heredoc that should be appended onto the list of tokens when the heredoc
# closes.
@@ -290,7 +239,7 @@ module Prism
embexpr_balance = 0
tokens.each_with_object([]) do |token, results| #$ Array[Token]
- case token.event
+ case token[1]
when :on_embexpr_beg
embexpr_balance += 1
results << token
@@ -305,9 +254,9 @@ module Prism
if split
# Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind
# to keep the delimiter in the result.
- token.value.split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
+ token[2].split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
column = 0 if index > 0
- results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
+ results << [[lineno, column], :on_tstring_content, value, token[3]]
lineno += value.count("\n")
end
else
@@ -350,7 +299,7 @@ module Prism
# whitespace on plain string content tokens. This allows us to later
# remove that amount of whitespace from the beginning of each line.
def <<(token)
- case token.event
+ case token[1]
when :on_embexpr_beg, :on_heredoc_beg
@embexpr_balance += 1
@dedent = 0 if @dedent_next && @ended_on_newline
@@ -358,7 +307,7 @@ module Prism
@embexpr_balance -= 1
when :on_tstring_content
if embexpr_balance == 0
- line = token.value
+ line = token[2]
if dedent_next && !(line.strip.empty? && line.end_with?("\n"))
leading = line[/\A(\s*)\n?/, 1]
@@ -381,7 +330,7 @@ module Prism
end
end
- @dedent_next = token.event == :on_tstring_content && embexpr_balance == 0
+ @dedent_next = token[1] == :on_tstring_content && embexpr_balance == 0
@ended_on_newline = false
tokens << token
end
@@ -394,7 +343,7 @@ module Prism
embexpr_balance = 0
tokens.each do |token|
- case token.event
+ case token[1]
when :on_embexpr_beg, :on_heredoc_beg
embexpr_balance += 1
results << token
@@ -406,9 +355,9 @@ module Prism
lineno = token[0][0]
column = token[0][1]
- token.value.split(/(?<=\n)/).each_with_index do |value, index|
+ token[2].split(/(?<=\n)/).each_with_index do |value, index|
column = 0 if index > 0
- results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
+ results << [[lineno, column], :on_tstring_content, value, token[3]]
lineno += 1
end
else
@@ -436,15 +385,15 @@ module Prism
results << token
index += 1
- case token.event
+ case token[1]
when :on_embexpr_beg, :on_heredoc_beg
embexpr_balance += 1
when :on_embexpr_end, :on_heredoc_end
embexpr_balance -= 1
when :on_tstring_content
if embexpr_balance == 0
- while index < max_index && tokens[index].event == :on_tstring_content && !token.value.match?(/\\\r?\n\z/)
- token.value << tokens[index].value
+ while index < max_index && tokens[index][1] == :on_tstring_content && !token[2].match?(/\\\r?\n\z/)
+ token[2] << tokens[index][2]
index += 1
end
end
@@ -467,7 +416,7 @@ module Prism
# whitespace calculation we performed above. This is because
# checking if the subsequent token needs to be dedented is common to
# both the dedent calculation and the ignored_sp insertion.
- case token.event
+ case token[1]
when :on_embexpr_beg
embexpr_balance += 1
results << token
@@ -479,7 +428,7 @@ module Prism
# Here we're going to split the string on newlines, but maintain
# the newlines in the resulting array. We'll do that with a look
# behind assertion.
- splits = token.value.split(/(?<=\n)/)
+ splits = token[2].split(/(?<=\n)/)
index = 0
while index < splits.length
@@ -536,12 +485,12 @@ module Prism
ignored = deleted_chars.join
line.delete_prefix!(ignored)
- results << Token.new([[lineno, 0], :on_ignored_sp, ignored, token[3]])
+ results << [[lineno, 0], :on_ignored_sp, ignored, token[3]]
column = ignored.length
end
end
- results << Token.new([[lineno, column], token[1], line, token[3]]) unless line.empty?
+ results << [[lineno, column], token[1], line, token[3]] unless line.empty?
index += 1
end
else
@@ -552,7 +501,7 @@ module Prism
end
dedent_next =
- ((token.event == :on_tstring_content) || (token.event == :on_heredoc_end)) &&
+ ((token[1] == :on_tstring_content) || (token[1] == :on_heredoc_end)) &&
embexpr_balance == 0
end
@@ -563,11 +512,11 @@ module Prism
# Here we will split between the two types of heredocs and return the
# object that will store their tokens.
def self.build(opening)
- case opening.value[2]
+ case opening[2][2]
when "~"
DedentingHeredoc.new
when "-"
- DashHeredoc.new(opening.value[3] != "'")
+ DashHeredoc.new(opening[2][3] != "'")
else
PlainHeredoc.new
end
@@ -647,16 +596,16 @@ module Prism
# Ripper doesn't include the rest of the token in the event, so we need to
# trim it down to just the content on the first line.
value = value[0..value.index("\n")]
- Token.new([[lineno, column], event, value, lex_state])
+ [[lineno, column], event, value, lex_state]
when :on_comment
- IgnoreStateToken.new([[lineno, column], event, value, lex_state])
+ [[lineno, column], event, value, lex_state]
when :on_heredoc_end
# Heredoc end tokens can be emitted in an odd order, so we don't
# want to bother comparing the state on them.
last_heredoc_end = token.location.end_offset
- IgnoreStateToken.new([[lineno, column], event, value, lex_state])
+ [[lineno, column], event, value, lex_state]
when :on_embexpr_end
- IgnoreStateToken.new([[lineno, column], event, value, lex_state])
+ [[lineno, column], event, value, lex_state]
when :on_words_sep
# Ripper emits one token each per line.
value.each_line.with_index do |line, index|
@@ -664,7 +613,7 @@ module Prism
lineno += 1
column = 0
end
- tokens << Token.new([[lineno, column], event, line, lex_state])
+ tokens << [[lineno, column], event, line, lex_state]
end
tokens.pop
when :on_regexp_end
@@ -696,7 +645,7 @@ module Prism
previous_state
end
- Token.new([[lineno, column], event, value, lex_state])
+ [[lineno, column], event, value, lex_state]
when :on_eof
eof_token = token
previous_token = result_value[index - 1][0]
@@ -721,13 +670,13 @@ module Prism
end_offset += 3
end
- tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state])
+ tokens << [[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state]
end
end
- Token.new([[lineno, column], event, value, lex_state])
+ [[lineno, column], event, value, lex_state]
else
- Token.new([[lineno, column], event, value, lex_state])
+ [[lineno, column], event, value, lex_state]
end
previous_state = lex_state
@@ -813,9 +762,8 @@ module Prism
tokens = tokens[0...-1]
# We sort by location because Ripper.lex sorts.
- # Manually implemented instead of `sort_by!(&:location)` for performance.
tokens.sort_by! do |token|
- line, column = token.location
+ line, column = token[0]
source.byte_offset(line, column)
end
@@ -834,7 +782,7 @@ module Prism
prev_token_end = bom ? 3 : 0
tokens.each do |token|
- line, column = token.location
+ line, column = token[0]
start_offset = source.byte_offset(line, column)
# Ripper reports columns on line 1 without counting the BOM, so we
@@ -858,50 +806,28 @@ module Prism
continuation = sp_value[continuation_index...next_whitespace_index]
second_whitespace = sp_value[next_whitespace_index..]
- new_tokens << IgnoreStateToken.new([
- [sp_line, sp_column],
- :on_sp,
- first_whitespace,
- prev_token_state
- ]) unless first_whitespace.empty?
-
- new_tokens << IgnoreStateToken.new([
- [sp_line, sp_column + continuation_index],
- :on_sp,
- continuation,
- prev_token_state
- ])
-
- new_tokens << IgnoreStateToken.new([
- [sp_line + 1, 0],
- :on_sp,
- second_whitespace,
- prev_token_state
- ]) unless second_whitespace.empty?
+ new_tokens << [[sp_line, sp_column], :on_sp, first_whitespace, prev_token_state] unless first_whitespace.empty?
+ new_tokens << [[sp_line, sp_column + continuation_index], :on_sp, continuation, prev_token_state]
+ new_tokens << [[sp_line + 1, 0], :on_sp, second_whitespace, prev_token_state] unless second_whitespace.empty?
else
- new_tokens << IgnoreStateToken.new([
- [sp_line, sp_column],
- :on_sp,
- sp_value,
- prev_token_state
- ])
+ new_tokens << [[sp_line, sp_column], :on_sp, sp_value, prev_token_state]
end
end
new_tokens << token
- prev_token_state = token.state
- prev_token_end = start_offset + token.value.bytesize
+ prev_token_state = token[3]
+ prev_token_end = start_offset + token[2].bytesize
end
unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl
end_offset = eof_token.location.end_offset
if prev_token_end < end_offset
- new_tokens << IgnoreStateToken.new([
+ new_tokens << [
[source.line(prev_token_end), source.column(prev_token_end)],
:on_sp,
source.slice(prev_token_end, end_offset - prev_token_end),
prev_token_state
- ])
+ ]
end
end
diff --git a/lib/prism/translation/ripper.rb b/lib/prism/translation/ripper.rb
index ccce226d7d..054ad88ce3 100644
--- a/lib/prism/translation/ripper.rb
+++ b/lib/prism/translation/ripper.rb
@@ -88,7 +88,7 @@ module Prism
# # => ["def", " ", "m", "(", "a", ")", " ", "nil", " ", "end"]
#
def self.tokenize(...)
- lex(...).map(&:value)
+ lex(...).map { |token| token[2] }
end
# This contains a table of all of the parser events and their
diff --git a/test/prism/ruby/ripper_test.rb b/test/prism/ruby/ripper_test.rb
index c8d259135f..a89a9503b9 100644
--- a/test/prism/ruby/ripper_test.rb
+++ b/test/prism/ruby/ripper_test.rb
@@ -136,7 +136,7 @@ module Prism
assert_equal(expected, lexer.parse[0].to_a)
assert_equal(lexer.parse[0].to_a, lexer.scan[0].to_a)
- assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
+ assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map { |token| token[1] })
assert_raise(SyntaxError) { Translation::Ripper::Lexer.new("1 +").lex(raise_errors: true) }
end
@@ -169,13 +169,13 @@ module Prism
# Prism emits tokens by their order in the code, not in parse order
ripper.sort_by! { |elem| elem[0] }
- [prism.size, ripper.size].max.times do |i|
- expected = ripper[i]
- actual = prism[i]
+ [prism.size, ripper.size].max.times do |index|
+ expected = ripper[index]
+ actual = prism[index]
- # Since tokens related to heredocs are not emitted in the same order,
- # the state also doesn't line up.
- if expected && actual && expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
+ # There are some tokens that have slightly different state that do not
+ # affect the parse tree, so they may not match.
+ if expected && actual && expected[1] == actual[1] && %i[on_comment on_heredoc_end on_embexpr_end on_sp].include?(expected[1])
expected[3] = actual[3] = nil
end