2 files changed, 158 insertions, 0 deletions
diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb
index e3ba7e7c8e..46bd33d1db 100644
--- a/lib/prism/parse_result.rb
+++ b/lib/prism/parse_result.rb
@@ -120,6 +120,12 @@ module Prism
       end
     end
 
+    # Generate a cache that targets a specific encoding for calculating code
+    # unit offsets.
+    def code_units_cache(encoding)
+      CodeUnitsCache.new(source, encoding)
+    end
+
     # Returns the column number in code units for the given encoding for the
     # given byte offset.
     def code_units_column(byte_offset, encoding)
@@ -149,6 +155,76 @@ module Prism
     end
   end
 
+  # A cache that can be used to quickly compute code unit offsets from byte
+  # offsets. It purposefully provides only a single #[] method to access the
+  # cache in order to minimize surface area.
+  #
+  # Note that there are some known issues here that may or may not be addressed
+  # in the future:
+  #
+  # * The first is that there are issues when the cache computes values that are
+  #   not on character boundaries. This can result in subsequent computations
+  #   being off by one or more code units.
+  # * The second is that this cache is currently unbounded. In theory we could
+  #   introduce some kind of LRU cache to limit the number of entries, but this
+  #   has not yet been implemented.
+  #
+  class CodeUnitsCache
+    class UTF16Counter # :nodoc:
+      def initialize(source, encoding)
+        @source = source
+        @encoding = encoding
+      end
+
+      def count(byte_offset, byte_length)
+        @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2
+      end
+    end
+
+    class LengthCounter # :nodoc:
+      def initialize(source, encoding)
+        @source = source
+        @encoding = encoding
+      end
+
+      def count(byte_offset, byte_length)
+        @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length
+      end
+    end
+
+    private_constant :UTF16Counter, :LengthCounter
+
+    # Initialize a new cache with the given source and encoding.
+    def initialize(source, encoding)
+      @source = source
+      @counter =
+        if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
+          UTF16Counter.new(source, encoding)
+        else
+          LengthCounter.new(source, encoding)
+        end
+
+      @cache = {}
+      @offsets = []
+    end
+
+    # Retrieve the code units offset from the given byte offset.
+    def [](byte_offset)
+      @cache[byte_offset] ||=
+        if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil?
+          @offsets << byte_offset
+          @counter.count(0, byte_offset)
+        elsif index == 0
+          @offsets.unshift(byte_offset)
+          @counter.count(0, byte_offset)
+        else
+          @offsets.insert(index, byte_offset)
+          offset = @offsets[index - 1]
+          @cache[offset] + @counter.count(offset, byte_offset - offset)
+        end
+    end
+  end
+
   # Specialized version of Prism::Source for source code that includes ASCII
   # characters only. This class is used to apply performance optimizations that
   # cannot be applied to sources that include multibyte characters.
@@ -178,6 +254,13 @@ module Prism
       byte_offset
     end
 
+    # Returns a cache that is the identity function in order to maintain the
+    # same interface. We can do this because code units are always equivalent to
+    # byte offsets for ASCII-only sources.
+    def code_units_cache(encoding)
+      ->(byte_offset) { byte_offset }
+    end
+
     # Specialized version of `code_units_column` that does not depend on
     # `code_units_offset`, which is a more expensive operation. This is
     # essentially the same as `Prism::Source#column`.
@@ -287,6 +370,12 @@ module Prism
       source.code_units_offset(start_offset, encoding)
     end
 
+    # The start offset from the start of the file in code units using the given
+    # cache to fetch or calculate the value.
+    def cached_start_code_units_offset(cache)
+      cache[start_offset]
+    end
+
     # The byte offset from the beginning of the source where this location ends.
     def end_offset
       start_offset + length
@@ -303,6 +392,12 @@ module Prism
       source.code_units_offset(end_offset, encoding)
     end
 
+    # The end offset from the start of the file in code units using the given
+    # cache to fetch or calculate the value.
+    def cached_end_code_units_offset(cache)
+      cache[end_offset]
+    end
+
     # The line number where this location starts.
     def start_line
       source.line(start_offset)
@@ -337,6 +432,12 @@ module Prism
       source.code_units_column(start_offset, encoding)
     end
 
+    # The start column in code units using the given cache to fetch or calculate
+    # the value.
+    def cached_start_code_units_column(cache)
+      cache[start_offset] - cache[source.line_start(start_offset)]
+    end
+
     # The column number in bytes where this location ends from the start of the
     # line.
     def end_column
@@ -355,6 +456,12 @@ module Prism
       source.code_units_column(end_offset, encoding)
     end
 
+    # The end column in code units using the given cache to fetch or calculate
+    # the value.
+    def cached_end_code_units_column(cache)
+      cache[end_offset] - cache[source.line_start(end_offset)]
+    end
+
     # Implement the hash pattern matching interface for Location.
     def deconstruct_keys(keys)
       { start_offset: start_offset, end_offset: end_offset }
@@ -604,6 +711,11 @@ module Prism
     def failure?
       !success?
     end
+
+    # Create a code units cache for the given encoding.
+    def code_units_cache(encoding)
+      source.code_units_cache(encoding)
+    end
   end
 
   # This is a result specific to the `parse` and `parse_file` methods.
diff --git a/test/prism/ruby/location_test.rb b/test/prism/ruby/location_test.rb
index 3d3e7dd562..33f844243c 100644
--- a/test/prism/ruby/location_test.rb
+++ b/test/prism/ruby/location_test.rb
@@ -140,6 +140,52 @@ module Prism
       assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE)
     end
 
+    def test_cached_code_units
+      result = Prism.parse("😀 + 😀\n😍 ||= 😍")
+
+      utf8_cache = result.code_units_cache(Encoding::UTF_8)
+      utf16_cache = result.code_units_cache(Encoding::UTF_16LE)
+      utf32_cache = result.code_units_cache(Encoding::UTF_32LE)
+
+      # first 😀
+      location = result.value.statements.body.first.receiver.location
+
+      assert_equal 0, location.cached_start_code_units_offset(utf8_cache)
+      assert_equal 0, location.cached_start_code_units_offset(utf16_cache)
+      assert_equal 0, location.cached_start_code_units_offset(utf32_cache)
+
+      assert_equal 1, location.cached_end_code_units_offset(utf8_cache)
+      assert_equal 2, location.cached_end_code_units_offset(utf16_cache)
+      assert_equal 1, location.cached_end_code_units_offset(utf32_cache)
+
+      assert_equal 0, location.cached_start_code_units_column(utf8_cache)
+      assert_equal 0, location.cached_start_code_units_column(utf16_cache)
+      assert_equal 0, location.cached_start_code_units_column(utf32_cache)
+
+      assert_equal 1, location.cached_end_code_units_column(utf8_cache)
+      assert_equal 2, location.cached_end_code_units_column(utf16_cache)
+      assert_equal 1, location.cached_end_code_units_column(utf32_cache)
+
+      # second 😀
+      location = result.value.statements.body.first.arguments.arguments.first.location
+
+      assert_equal 4, location.cached_start_code_units_offset(utf8_cache)
+      assert_equal 5, location.cached_start_code_units_offset(utf16_cache)
+      assert_equal 4, location.cached_start_code_units_offset(utf32_cache)
+
+      assert_equal 5, location.cached_end_code_units_offset(utf8_cache)
+      assert_equal 7, location.cached_end_code_units_offset(utf16_cache)
+      assert_equal 5, location.cached_end_code_units_offset(utf32_cache)
+
+      assert_equal 4, location.cached_start_code_units_column(utf8_cache)
+      assert_equal 5, location.cached_start_code_units_column(utf16_cache)
+      assert_equal 4, location.cached_start_code_units_column(utf32_cache)
+
+      assert_equal 5, location.cached_end_code_units_column(utf8_cache)
+      assert_equal 7, location.cached_end_code_units_column(utf16_cache)
+      assert_equal 5, location.cached_end_code_units_column(utf32_cache)
+    end
+
     def test_code_units_binary_valid_utf8
       program = Prism.parse(<<~RUBY).value
         # -*- encoding: binary -*-