summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/prism/parse_result.rb112
-rw-r--r--test/prism/ruby/location_test.rb46
2 files changed, 158 insertions, 0 deletions
diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb
index e3ba7e7c8e..46bd33d1db 100644
--- a/lib/prism/parse_result.rb
+++ b/lib/prism/parse_result.rb
@@ -120,6 +120,12 @@ module Prism
end
end
+ # Generate a cache that targets a specific encoding for calculating code
+ # unit offsets.
+ def code_units_cache(encoding)
+ CodeUnitsCache.new(source, encoding)
+ end
+
# Returns the column number in code units for the given encoding for the
# given byte offset.
def code_units_column(byte_offset, encoding)
@@ -149,6 +155,76 @@ module Prism
end
end
+ # A cache that can be used to quickly compute code unit offsets from byte
+ # offsets. It purposefully provides only a single #[] method to access the
+ # cache in order to minimize surface area.
+ #
+ # Note that there are some known issues here that may or may not be addressed
+ # in the future:
+ #
+ # * The first is that there are issues when the cache computes values that are
+ # not on character boundaries. This can result in subsequent computations
+ # being off by one or more code units.
+ # * The second is that this cache is currently unbounded. In theory we could
+ # introduce some kind of LRU cache to limit the number of entries, but this
+ # has not yet been implemented.
+ #
+ class CodeUnitsCache
+ class UTF16Counter # :nodoc:
+ def initialize(source, encoding)
+ @source = source
+ @encoding = encoding
+ end
+
+ def count(byte_offset, byte_length)
+ @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2
+ end
+ end
+
+ class LengthCounter # :nodoc:
+ def initialize(source, encoding)
+ @source = source
+ @encoding = encoding
+ end
+
+ def count(byte_offset, byte_length)
+ @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length
+ end
+ end
+
+ private_constant :UTF16Counter, :LengthCounter
+
+ # Initialize a new cache with the given source and encoding.
+ def initialize(source, encoding)
+ @source = source
+ @counter =
+ if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
+ UTF16Counter.new(source, encoding)
+ else
+ LengthCounter.new(source, encoding)
+ end
+
+ @cache = {}
+ @offsets = []
+ end
+
+ # Retrieve the code units offset from the given byte offset.
+ def [](byte_offset)
+ @cache[byte_offset] ||=
+ if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil?
+ @offsets << byte_offset
+ @counter.count(0, byte_offset)
+ elsif index == 0
+ @offsets.unshift(byte_offset)
+ @counter.count(0, byte_offset)
+ else
+ @offsets.insert(index, byte_offset)
+ offset = @offsets[index - 1]
+ @cache[offset] + @counter.count(offset, byte_offset - offset)
+ end
+ end
+ end
+
# Specialized version of Prism::Source for source code that includes ASCII
# characters only. This class is used to apply performance optimizations that
# cannot be applied to sources that include multibyte characters.
@@ -178,6 +254,13 @@ module Prism
byte_offset
end
+ # Returns a cache that is the identity function in order to maintain the
+ # same interface. We can do this because code units are always equivalent to
+ # byte offsets for ASCII-only sources.
+ def code_units_cache(encoding)
+ ->(byte_offset) { byte_offset }
+ end
+
# Specialized version of `code_units_column` that does not depend on
# `code_units_offset`, which is a more expensive operation. This is
# essentially the same as `Prism::Source#column`.
@@ -287,6 +370,12 @@ module Prism
source.code_units_offset(start_offset, encoding)
end
+ # The start offset from the start of the file in code units using the given
+ # cache to fetch or calculate the value.
+ def cached_start_code_units_offset(cache)
+ cache[start_offset]
+ end
+
# The byte offset from the beginning of the source where this location ends.
def end_offset
start_offset + length
@@ -303,6 +392,12 @@ module Prism
source.code_units_offset(end_offset, encoding)
end
+ # The end offset from the start of the file in code units using the given
+ # cache to fetch or calculate the value.
+ def cached_end_code_units_offset(cache)
+ cache[end_offset]
+ end
+
# The line number where this location starts.
def start_line
source.line(start_offset)
@@ -337,6 +432,12 @@ module Prism
source.code_units_column(start_offset, encoding)
end
+ # The start column in code units using the given cache to fetch or calculate
+ # the value.
+ def cached_start_code_units_column(cache)
+ cache[start_offset] - cache[source.line_start(start_offset)]
+ end
+
# The column number in bytes where this location ends from the start of the
# line.
def end_column
@@ -355,6 +456,12 @@ module Prism
source.code_units_column(end_offset, encoding)
end
+ # The end column in code units using the given cache to fetch or calculate
+ # the value.
+ def cached_end_code_units_column(cache)
+ cache[end_offset] - cache[source.line_start(end_offset)]
+ end
+
# Implement the hash pattern matching interface for Location.
def deconstruct_keys(keys)
{ start_offset: start_offset, end_offset: end_offset }
@@ -604,6 +711,11 @@ module Prism
def failure?
!success?
end
+
+ # Create a code units cache for the given encoding.
+ def code_units_cache(encoding)
+ source.code_units_cache(encoding)
+ end
end
# This is a result specific to the `parse` and `parse_file` methods.
diff --git a/test/prism/ruby/location_test.rb b/test/prism/ruby/location_test.rb
index 3d3e7dd562..33f844243c 100644
--- a/test/prism/ruby/location_test.rb
+++ b/test/prism/ruby/location_test.rb
@@ -140,6 +140,52 @@ module Prism
assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE)
end
+ def test_cached_code_units
+ result = Prism.parse("šŸ˜€ + šŸ˜€\nšŸ˜ ||= šŸ˜")
+
+ utf8_cache = result.code_units_cache(Encoding::UTF_8)
+ utf16_cache = result.code_units_cache(Encoding::UTF_16LE)
+ utf32_cache = result.code_units_cache(Encoding::UTF_32LE)
+
+ # first šŸ˜€
+ location = result.value.statements.body.first.receiver.location
+
+ assert_equal 0, location.cached_start_code_units_offset(utf8_cache)
+ assert_equal 0, location.cached_start_code_units_offset(utf16_cache)
+ assert_equal 0, location.cached_start_code_units_offset(utf32_cache)
+
+ assert_equal 1, location.cached_end_code_units_offset(utf8_cache)
+ assert_equal 2, location.cached_end_code_units_offset(utf16_cache)
+ assert_equal 1, location.cached_end_code_units_offset(utf32_cache)
+
+ assert_equal 0, location.cached_start_code_units_column(utf8_cache)
+ assert_equal 0, location.cached_start_code_units_column(utf16_cache)
+ assert_equal 0, location.cached_start_code_units_column(utf32_cache)
+
+ assert_equal 1, location.cached_end_code_units_column(utf8_cache)
+ assert_equal 2, location.cached_end_code_units_column(utf16_cache)
+ assert_equal 1, location.cached_end_code_units_column(utf32_cache)
+
+ # second šŸ˜€
+ location = result.value.statements.body.first.arguments.arguments.first.location
+
+ assert_equal 4, location.cached_start_code_units_offset(utf8_cache)
+ assert_equal 5, location.cached_start_code_units_offset(utf16_cache)
+ assert_equal 4, location.cached_start_code_units_offset(utf32_cache)
+
+ assert_equal 5, location.cached_end_code_units_offset(utf8_cache)
+ assert_equal 7, location.cached_end_code_units_offset(utf16_cache)
+ assert_equal 5, location.cached_end_code_units_offset(utf32_cache)
+
+ assert_equal 4, location.cached_start_code_units_column(utf8_cache)
+ assert_equal 5, location.cached_start_code_units_column(utf16_cache)
+ assert_equal 4, location.cached_start_code_units_column(utf32_cache)
+
+ assert_equal 5, location.cached_end_code_units_column(utf8_cache)
+ assert_equal 7, location.cached_end_code_units_column(utf16_cache)
+ assert_equal 5, location.cached_end_code_units_column(utf32_cache)
+ end
+
def test_code_units_binary_valid_utf8
program = Prism.parse(<<~RUBY).value
# -*- encoding: binary -*-