[ruby/prism] Prism::CodeUnitsCache

Calculating code unit offsets for a source can be very expensive, especially when the source is large. This commit introduces a new class that wraps the source and desired encoding into a cache that reuses pre-computed offsets. It performs quite a bit better. There are still some problems with this approach, namely character boundaries and the fact that the cache is unbounded, but both of these may be addressed in subsequent commits. https://github.com/ruby/prism/commit/2e3e1a4d4d
author: Kevin Newton <kddnewton@gmail.com> 2024-10-09 14:40:35 -0400
committer: git <svn-admin@ruby-lang.org> 2024-10-10 18:02:27 +0000
commit: 7a198af7cdb437c5245ac3ab70cb66cef2002d06 (patch)
tree: 1c7270805ff508b297aca4d1e5f89c6464d03838 /test/prism/ruby
parent: b77ff342ccb1c57a4b6c618e4ddf6bf1fec23a1d (diff)
1 files changed, 46 insertions, 0 deletions
diff --git a/test/prism/ruby/location_test.rb b/test/prism/ruby/location_test.rb
index 3d3e7dd562..33f844243c 100644
--- a/test/prism/ruby/location_test.rb
+++ b/test/prism/ruby/location_test.rb
@@ -140,6 +140,52 @@ module Prism
       assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE)
     end
 
+    def test_cached_code_units
+      result = Prism.parse("😀 + 😀\n😍 ||= 😍")
+      # Exercises the cached_* location helpers using one code units cache per encoding.
+      utf8_cache = result.code_units_cache(Encoding::UTF_8)
+      utf16_cache = result.code_units_cache(Encoding::UTF_16LE)
+      utf32_cache = result.code_units_cache(Encoding::UTF_32LE)
+
+      # first 😀: the receiver of the + call, sitting at the very start of the source
+      location = result.value.statements.body.first.receiver.location
+
+      assert_equal 0, location.cached_start_code_units_offset(utf8_cache)
+      assert_equal 0, location.cached_start_code_units_offset(utf16_cache)
+      assert_equal 0, location.cached_start_code_units_offset(utf32_cache)
+
+      assert_equal 1, location.cached_end_code_units_offset(utf8_cache)
+      assert_equal 2, location.cached_end_code_units_offset(utf16_cache)
+      assert_equal 1, location.cached_end_code_units_offset(utf32_cache)
+
+      assert_equal 0, location.cached_start_code_units_column(utf8_cache)
+      assert_equal 0, location.cached_start_code_units_column(utf16_cache)
+      assert_equal 0, location.cached_start_code_units_column(utf32_cache)
+
+      assert_equal 1, location.cached_end_code_units_column(utf8_cache)
+      assert_equal 2, location.cached_end_code_units_column(utf16_cache)
+      assert_equal 1, location.cached_end_code_units_column(utf32_cache)
+
+      # second 😀: the first argument of the + call; the UTF-16 values run one ahead because the first 😀 is counted as two units
+      location = result.value.statements.body.first.arguments.arguments.first.location
+
+      assert_equal 4, location.cached_start_code_units_offset(utf8_cache)
+      assert_equal 5, location.cached_start_code_units_offset(utf16_cache)
+      assert_equal 4, location.cached_start_code_units_offset(utf32_cache)
+
+      assert_equal 5, location.cached_end_code_units_offset(utf8_cache)
+      assert_equal 7, location.cached_end_code_units_offset(utf16_cache)
+      assert_equal 5, location.cached_end_code_units_offset(utf32_cache)
+
+      assert_equal 4, location.cached_start_code_units_column(utf8_cache)
+      assert_equal 5, location.cached_start_code_units_column(utf16_cache)
+      assert_equal 4, location.cached_start_code_units_column(utf32_cache)
+
+      assert_equal 5, location.cached_end_code_units_column(utf8_cache)
+      assert_equal 7, location.cached_end_code_units_column(utf16_cache)
+      assert_equal 5, location.cached_end_code_units_column(utf32_cache)
+    end
+
     def test_code_units_binary_valid_utf8
       program = Prism.parse(<<~RUBY).value
         # -*- encoding: binary -*-
author	Kevin Newton <kddnewton@gmail.com>	2024-10-09 14:40:35 -0400
committer	git <svn-admin@ruby-lang.org>	2024-10-10 18:02:27 +0000
commit	7a198af7cdb437c5245ac3ab70cb66cef2002d06 (patch)
tree	1c7270805ff508b297aca4d1e5f89c6464d03838 /test/prism/ruby
parent	b77ff342ccb1c57a4b6c618e4ddf6bf1fec23a1d (diff)