summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- lib/prism/parse_result.rb 29
-rw-r--r-- prism/templates/lib/prism/serialize.rb.erb 11
-rw-r--r-- test/prism/ruby/location_test.rb 33
3 files changed, 61 insertions, 12 deletions
diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb
index aea5dee9fa..e3ba7e7c8e 100644
--- a/lib/prism/parse_result.rb
+++ b/lib/prism/parse_result.rb
@@ -12,6 +12,21 @@ module Prism
def self.for(source, start_line = 1, offsets = [])
if source.ascii_only?
ASCIISource.new(source, start_line, offsets)
+ elsif source.encoding == Encoding::BINARY
+ source.force_encoding(Encoding::UTF_8)
+
+ if source.valid_encoding?
+ new(source, start_line, offsets)
+ else
+ # This is an extremely niche use case where the file is marked as
+ # binary, contains multi-byte characters, and those characters are not
+ # valid UTF-8. In this case we'll mark it as binary and fall back to
+ # treating everything as a single-byte character. This _may_ cause
+ # problems when asking for code units, but it appears to be the
+ # cleanest solution at the moment.
+ source.force_encoding(Encoding::BINARY)
+ ASCIISource.new(source, start_line, offsets)
+ end
else
new(source, start_line, offsets)
end
@@ -89,6 +104,12 @@ module Prism
# This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
# concept of code units that differs from the number of characters in other
# encodings, it is not captured here.
+ #
+ # We purposefully replace invalid and undefined characters with replacement
+ # characters in this conversion. This happens for two reasons. First, it's
+ # possible that the given byte offset will not occur on a character
+ # boundary. Second, it's possible that the source code will contain a
+ # character that has no equivalent in the given encoding.
def code_units_offset(byte_offset, encoding)
byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)
@@ -130,8 +151,12 @@ module Prism
# Specialized version of Prism::Source for source code that includes ASCII
# characters only. This class is used to apply performance optimizations that
- # cannot be applied to sources that include multibyte characters. Sources that
- # include multibyte characters are represented by the Prism::Source class.
+ # cannot be applied to sources that include multibyte characters.
+ #
+ # In the extremely rare case that a source includes multi-byte characters but
+ # is marked as binary because of a magic encoding comment and it cannot be
+ # eagerly converted to UTF-8, this class will be used as well. This is because
+ # at that point we will treat everything as single-byte characters.
class ASCIISource < Source
# Return the character offset for the given byte offset.
def character_offset(byte_offset)
diff --git a/prism/templates/lib/prism/serialize.rb.erb b/prism/templates/lib/prism/serialize.rb.erb
index 62108ec28a..1c1cf6e22d 100644
--- a/prism/templates/lib/prism/serialize.rb.erb
+++ b/prism/templates/lib/prism/serialize.rb.erb
@@ -20,10 +20,21 @@ module Prism
def self.load(input, serialized)
input = input.dup
source = Source.for(input)
+
loader = Loader.new(source, serialized)
result = loader.load_result
input.force_encoding(loader.encoding)
+
+ # This is an extremely niche use-case where the file was marked as binary
+ # but it contained UTF-8-encoded characters. In that case we will actually
+ # put it back to UTF-8 to give the location APIs the best chance of being
+ # correct.
+ if !input.ascii_only? && input.encoding == Encoding::BINARY
+ input.force_encoding(Encoding::UTF_8)
+ input.force_encoding(Encoding::BINARY) unless input.valid_encoding?
+ end
+
result
end
diff --git a/test/prism/ruby/location_test.rb b/test/prism/ruby/location_test.rb
index e360a0db72..3d3e7dd562 100644
--- a/test/prism/ruby/location_test.rb
+++ b/test/prism/ruby/location_test.rb
@@ -140,23 +140,36 @@ module Prism
assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE)
end
- def test_code_units_handles_binary_encoding_with_multibyte_characters
- # If the encoding is set to binary and the source contains multibyte
- # characters, we avoid breaking the code unit offsets, but they will
- # still be incorrect.
-
+ def test_code_units_binary_valid_utf8
program = Prism.parse(<<~RUBY).value
# -*- encoding: binary -*-
😀 + 😀
RUBY
- # first 😀
- location = program.statements.body.first.receiver.location
+ receiver = program.statements.body.first.receiver
+ assert_equal "😀".b.to_sym, receiver.name
+
+ location = receiver.location
+ assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
+ assert_equal 2, location.end_code_units_column(Encoding::UTF_16LE)
+ assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)
+ end
- assert_equal 4, location.end_code_units_column(Encoding::UTF_8)
- assert_equal 4, location.end_code_units_column(Encoding::UTF_16LE)
- assert_equal 4, location.end_code_units_column(Encoding::UTF_32LE)
+ def test_code_units_binary_invalid_utf8
+ program = Prism.parse(<<~RUBY).value
+ # -*- encoding: binary -*-
+
+ \x90 + \x90
+ RUBY
+
+ receiver = program.statements.body.first.receiver
+ assert_equal "\x90".b.to_sym, receiver.name
+
+ location = receiver.location
+ assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
+ assert_equal 1, location.end_code_units_column(Encoding::UTF_16LE)
+ assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)
end
def test_chop