author     Benoit Daloze <eregontp@gmail.com>  2026-01-20 08:53:39 +0100
committer  git <svn-admin@ruby-lang.org>       2026-01-20 09:53:08 +0000
commit     35a7b5159f39de2cac848c072674e5350cc41aa4 (patch)
tree       9b8fd22580bc78d5579de76fba6421978b0452f1
parent     53fe9933fd6c62f3a7f1ed2908a99510c2f27adc (diff)
[ruby/prism] Add Ripper :on_sp events for Prism.lex_compat and Prism::Translation::Ripper
* Handle line continuations.
* Handle space at the end of file in LexCompat.

https://github.com/ruby/prism/commit/32bd13eb7d

Co-authored-by: Earlopain <14981592+Earlopain@users.noreply.github.com>
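For illustration only (not part of the patch), a minimal sketch of what this enables, assuming the patch is applied: Prism.lex_compat should now report whitespace as :on_sp tokens just like Ripper.lex, so the two event streams line up for a plain source.

    require "prism"
    require "ripper"

    source = "1 + 2\n"

    # With :on_sp emitted by lex_compat, the event streams can be compared directly.
    prism_events  = Prism.lex_compat(source).value.map { |tok| tok[1] }
    ripper_events = Ripper.lex(source).map { |tok| tok[1] }

    p prism_events                   # roughly [:on_int, :on_sp, :on_op, :on_sp, :on_int, :on_nl]
    p prism_events == ripper_events  # expected to be true once :on_sp is emitted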
-rw-r--r--  lib/prism.rb                               |   8
-rw-r--r--  lib/prism/lex_compat.rb                    | 101
-rw-r--r--  lib/prism/lex_ripper.rb                    |   2
-rw-r--r--  test/prism/fixtures/bom_leading_space.txt  |   1
-rw-r--r--  test/prism/fixtures/bom_spaces.txt         |   1
-rw-r--r--  test/prism/ruby/ripper_test.rb             |  12
6 files changed, 106 insertions, 19 deletions
diff --git a/lib/prism.rb b/lib/prism.rb
index d809557fce..dab3420377 100644
--- a/lib/prism.rb
+++ b/lib/prism.rb
@@ -61,8 +61,7 @@ module Prism
# Prism::lex_compat(source, **options) -> LexCompat::Result
#
# Returns a parse result whose value is an array of tokens that closely
- # resembles the return value of Ripper::lex. The main difference is that the
- # `:on_sp` token is not emitted.
+ # resembles the return value of Ripper::lex.
#
# For supported options, see Prism::parse.
def self.lex_compat(source, **options)
@@ -72,9 +71,8 @@ module Prism
# :call-seq:
# Prism::lex_ripper(source) -> Array
#
- # This lexes with the Ripper lex. It drops any space events but otherwise
- # returns the same tokens. Raises SyntaxError if the syntax in source is
- # invalid.
+ # This wraps the result of Ripper.lex. It produces almost exactly the
+ # same tokens. Raises SyntaxError if the syntax in source is invalid.
def self.lex_ripper(source)
LexRipper.new(source).result # steep:ignore
end
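A rough usage sketch for the updated docs above (indicative output only): Prism.lex_ripper forwards to Ripper, so space tokens now appear in its result as well instead of being dropped.

    require "prism"

    # Each element has the Ripper shape [[lineno, column], event, value, state];
    # the whitespace runs now come through as :on_sp tokens.
    Prism.lex_ripper("a  =  1\n").each { |tok| p tok }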
diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb
index f7b9a0effc..597e63c73e 100644
--- a/lib/prism/lex_compat.rb
+++ b/lib/prism/lex_compat.rb
@@ -226,7 +226,7 @@ module Prism
end
# Tokens where state should be ignored
- # used for :on_comment, :on_heredoc_end, :on_embexpr_end
+ # used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end
class IgnoreStateToken < Token
def ==(other) # :nodoc:
self[0...-1] == other[0...-1]
@@ -611,10 +611,10 @@ module Prism
BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
private_constant :BOM_FLUSHED
- attr_reader :source, :options
+ attr_reader :options
- def initialize(source, **options)
- @source = source
+ def initialize(code, **options)
+ @code = code
@options = options
end
@@ -624,12 +624,14 @@ module Prism
state = :default
heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]
- result = Prism.lex(source, **options)
+ result = Prism.lex(@code, **options)
+ source = result.source
result_value = result.value
previous_state = nil #: State?
last_heredoc_end = nil #: Integer?
+ eof_token = nil
- bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
+ bom = source.slice(0, 3) == "\xEF\xBB\xBF"
result_value.each_with_index do |(token, lex_state), index|
lineno = token.location.start_line
@@ -741,6 +743,7 @@ module Prism
Token.new([[lineno, column], event, value, lex_state])
when :on_eof
+ eof_token = token
previous_token = result_value[index - 1][0]
# If we're at the end of the file and the previous token was a
@@ -763,7 +766,7 @@ module Prism
end_offset += 3
end
- tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
+ tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state])
end
end
@@ -857,7 +860,89 @@ module Prism
# We sort by location to compare against Ripper's output
tokens.sort_by!(&:location)
- Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
+ # Add :on_sp tokens
+ tokens = add_on_sp_tokens(tokens, source, result.data_loc, bom, eof_token)
+
+ Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, source)
+ end
+
+ def add_on_sp_tokens(tokens, source, data_loc, bom, eof_token)
+ new_tokens = []
+
+ prev_token_state = Translation::Ripper::Lexer::State.cached(Translation::Ripper::EXPR_BEG)
+ prev_token_end = bom ? 3 : 0
+
+ tokens.each do |token|
+ line, column = token.location
+ start_offset = source.line_to_byte_offset(line) + column
+ # Ripper reports columns on line 1 without counting the BOM, so we adjust to get the real offset
+ start_offset += 3 if line == 1 && bom
+
+ if start_offset > prev_token_end
+ sp_value = source.slice(prev_token_end, start_offset - prev_token_end)
+ sp_line = source.line(prev_token_end)
+ sp_column = source.column(prev_token_end)
+ # Ripper reports columns on line 1 without counting the BOM
+ sp_column -= 3 if sp_line == 1 && bom
+ continuation_index = sp_value.byteindex("\\")
+
+ # ripper emits up to three :on_sp tokens when line continuations are used
+ if continuation_index
+ next_whitespace_index = continuation_index + 1
+ next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r"
+ next_whitespace_index += 1
+ first_whitespace = sp_value[0...continuation_index]
+ continuation = sp_value[continuation_index...next_whitespace_index]
+ second_whitespace = sp_value[next_whitespace_index..]
+
+ new_tokens << IgnoreStateToken.new([
+ [sp_line, sp_column],
+ :on_sp,
+ first_whitespace,
+ prev_token_state
+ ]) unless first_whitespace.empty?
+
+ new_tokens << IgnoreStateToken.new([
+ [sp_line, sp_column + continuation_index],
+ :on_sp,
+ continuation,
+ prev_token_state
+ ])
+
+ new_tokens << IgnoreStateToken.new([
+ [sp_line + 1, 0],
+ :on_sp,
+ second_whitespace,
+ prev_token_state
+ ]) unless second_whitespace.empty?
+ else
+ new_tokens << IgnoreStateToken.new([
+ [sp_line, sp_column],
+ :on_sp,
+ sp_value,
+ prev_token_state
+ ])
+ end
+ end
+
+ new_tokens << token
+ prev_token_state = token.state
+ prev_token_end = start_offset + token.value.bytesize
+ end
+
+ unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl
+ end_offset = eof_token.location.end_offset
+ if prev_token_end < end_offset
+ new_tokens << IgnoreStateToken.new([
+ [source.line(prev_token_end), source.column(prev_token_end)],
+ :on_sp,
+ source.slice(prev_token_end, end_offset - prev_token_end),
+ prev_token_state
+ ])
+ end
+ end
+
+ new_tokens
end
end
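The line-continuation handling above mirrors what Ripper itself does: the whitespace run is split around the backslash-newline, producing up to three separate :on_sp tokens. A quick way to observe this (a sketch; exact states omitted):

    require "ripper"

    # Whitespace before the backslash, the "\\\n" continuation itself, and the
    # indentation on the next line are each reported as their own :on_sp token.
    Ripper.lex("1 \\\n  + 2\n").select { |tok| tok[1] == :on_sp }.each { |tok| p tok }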
diff --git a/lib/prism/lex_ripper.rb b/lib/prism/lex_ripper.rb
index 4b5c3b77fd..2054cf55ac 100644
--- a/lib/prism/lex_ripper.rb
+++ b/lib/prism/lex_ripper.rb
@@ -19,8 +19,6 @@ module Prism
lex(source).each do |token|
case token[1]
- when :on_sp
- # skip
when :on_tstring_content
if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
previous[2] << token[2]
diff --git a/test/prism/fixtures/bom_leading_space.txt b/test/prism/fixtures/bom_leading_space.txt
new file mode 100644
index 0000000000..48d3ee50ea
--- /dev/null
+++ b/test/prism/fixtures/bom_leading_space.txt
@@ -0,0 +1 @@
+ p (42)
diff --git a/test/prism/fixtures/bom_spaces.txt b/test/prism/fixtures/bom_spaces.txt
new file mode 100644
index 0000000000..c18ad4c21a
--- /dev/null
+++ b/test/prism/fixtures/bom_spaces.txt
@@ -0,0 +1 @@
+p ( 42 )
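The two BOM fixtures exercise the column adjustment seen in add_on_sp_tokens: Ripper does not count the UTF-8 BOM bytes in line-1 columns. A hedged illustration, assuming Ripper.lex strips the BOM from string input the same way it does for files:

    require "ripper"

    bom_source = "\xEF\xBB\xBF p (42)\n"

    # The leading :on_sp after the BOM is expected at column 0, not column 3,
    # which is why LexCompat adds 3 to line-1 offsets when a BOM is present.
    Ripper.lex(bom_source).first(3).each { |tok| p tok }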
diff --git a/test/prism/ruby/ripper_test.rb b/test/prism/ruby/ripper_test.rb
index 2a0504c19f..280abd94ea 100644
--- a/test/prism/ruby/ripper_test.rb
+++ b/test/prism/ruby/ripper_test.rb
@@ -39,6 +39,8 @@ module Prism
# Skip these tests that we haven't implemented yet.
omitted_sexp_raw = [
+ "bom_leading_space.txt",
+ "bom_spaces.txt",
"dos_endings.txt",
"heredocs_with_fake_newlines.txt",
"heredocs_with_ignored_newlines.txt",
@@ -92,7 +94,7 @@ module Prism
assert_equal(expected, lexer.parse[0].to_a)
assert_equal(lexer.parse[0].to_a, lexer.scan[0].to_a)
- assert_equal(%i[on_int on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
+ assert_equal(%i[on_int on_sp on_op], Translation::Ripper::Lexer.new("1 +").lex.map(&:event))
assert_raise(SyntaxError) { Translation::Ripper::Lexer.new("1 +").lex(raise_errors: true) }
end
@@ -121,15 +123,17 @@ module Prism
def assert_ripper_lex(source)
prism = Translation::Ripper.lex(source)
ripper = Ripper.lex(source)
- ripper.reject! { |elem| elem[1] == :on_sp } # Prism doesn't emit on_sp
- ripper.sort_by! { |elem| elem[0] } # Prism emits tokens by their order in the code, not in parse order
+
+ # Prism emits tokens by their order in the code, not in parse order
+ ripper.sort_by! { |elem| elem[0] }
[prism.size, ripper.size].max.times do |i|
expected = ripper[i]
actual = prism[i]
+
# Since tokens related to heredocs are not emitted in the same order,
# the state also doesn't line up.
- if expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
+ if expected && actual && expected[1] == :on_heredoc_end && actual[1] == :on_heredoc_end
expected[3] = actual[3] = nil
end