[ruby/prism] Add character APIs for locations

(https://github.com/ruby/prism/pull/1809) https://github.com/ruby/prism/commit/d493ccd093
author: Kevin Newton <kddnewton@gmail.com> 2023-11-20 11:07:02 -0500
committer: git <svn-admin@ruby-lang.org> 2023-11-20 16:07:06 +0000
commit: f2ed7eaba0275099842b5b8407250e2d410f2f25 (patch)
tree: f4033a6b792245c785dae837043b632c0c888878
parent: adee7dab3edc3d58cc3d7245398b75ab1de8d077 (diff)
4 files changed, 83 insertions, 23 deletions
diff --git a/lib/prism/ffi.rb b/lib/prism/ffi.rb
index e1d3e0dca7..c910fd3aae 100644
--- a/lib/prism/ffi.rb
+++ b/lib/prism/ffi.rb
@@ -230,7 +230,7 @@ module Prism
         loader = Serialize::Loader.new(source, buffer.read)
 
         loader.load_header
-        loader.load_force_encoding
+        loader.load_encoding
         loader.load_start_line
         loader.load_comments
       end
diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb
index 170a529bea..50c23bce65 100644
--- a/lib/prism/parse_result.rb
+++ b/lib/prism/parse_result.rb
@@ -25,40 +25,50 @@ module Prism
 
     # Perform a byteslice on the source code using the given byte offset and
     # byte length.
-    def slice(offset, length)
-      source.byteslice(offset, length)
+    def slice(byte_offset, length)
+      source.byteslice(byte_offset, length)
     end
 
     # Binary search through the offsets to find the line number for the given
     # byte offset.
-    def line(value)
-      start_line + find_line(value)
+    def line(byte_offset)
+      start_line + find_line(byte_offset)
     end
 
     # Return the byte offset of the start of the line corresponding to the given
     # byte offset.
-    def line_offset(value)
-      offsets[find_line(value)]
+    def line_start(byte_offset)
+      offsets[find_line(byte_offset)]
     end
 
     # Return the column number for the given byte offset.
-    def column(value)
-      value - offsets[find_line(value)]
+    def column(byte_offset)
+      byte_offset - line_start(byte_offset)
+    end
+
+    # Return the character offset for the given byte offset.
+    def character_offset(byte_offset)
+      source.byteslice(0, byte_offset).length
+    end
+
+    # Return the column number in characters for the given byte offset.
+    def character_column(byte_offset)
+      character_offset(byte_offset) - character_offset(line_start(byte_offset))
     end
 
     private
 
     # Binary search through the offsets to find the line number for the given
     # byte offset.
-    def find_line(value)
+    def find_line(byte_offset)
       left = 0
       right = offsets.length - 1
 
       while left <= right
         mid = left + (right - left) / 2
-        return mid if offsets[mid] == value
+        return mid if offsets[mid] == byte_offset
 
-        if offsets[mid] < value
+        if offsets[mid] < byte_offset
           left = mid + 1
         else
           right = mid - 1
@@ -121,11 +131,23 @@ module Prism
       source.slice(start_offset, length)
     end
 
+    # The character offset from the beginning of the source where this location
+    # starts.
+    def start_character_offset
+      source.character_offset(start_offset)
+    end
+
     # The byte offset from the beginning of the source where this location ends.
     def end_offset
       start_offset + length
     end
 
+    # The character offset from the beginning of the source where this location
+    # ends.
+    def end_character_offset
+      source.character_offset(end_offset)
+    end
+
     # The line number where this location starts.
     def start_line
       source.line(start_offset)
@@ -133,7 +155,7 @@ module Prism
 
     # The content of the line where this location starts before this location.
     def start_line_slice
-      offset = source.line_offset(start_offset)
+      offset = source.line_start(start_offset)
       source.slice(offset, start_offset - offset)
     end
 
@@ -148,12 +170,24 @@ module Prism
       source.column(start_offset)
     end
 
+    # The column number in characters where this location starts from the start
+    # of the line.
+    def start_character_column
+      source.character_column(start_offset)
+    end
+
     # The column number in bytes where this location ends from the start of the
     # line.
     def end_column
       source.column(end_offset)
     end
 
+    # The column number in characters where this location ends from the start of
+    # the line.
+    def end_character_column
+      source.character_column(end_offset)
+    end
+
     # Implement the hash pattern matching interface for Location.
     def deconstruct_keys(keys)
       { start_offset: start_offset, end_offset: end_offset }
diff --git a/prism/templates/lib/prism/serialize.rb.erb b/prism/templates/lib/prism/serialize.rb.erb
index 2837504543..e5a88ae99a 100644
--- a/prism/templates/lib/prism/serialize.rb.erb
+++ b/prism/templates/lib/prism/serialize.rb.erb
@@ -73,12 +73,9 @@ module Prism
       end
 
       def load_encoding
-        Encoding.find(io.read(load_varint))
-      end
-
-      def load_force_encoding
-        @encoding = load_encoding
+        @encoding = Encoding.find(io.read(load_varint))
         @input = input.force_encoding(@encoding).freeze
+        @encoding
       end
 
       def load_start_line
@@ -121,10 +118,7 @@ module Prism
         encoding = load_encoding
         load_start_line
         comments, magic_comments, errors, warnings = load_metadata
-
-        if encoding != @encoding
-          tokens.each { |token,| token.value.force_encoding(encoding) }
-        end
+        tokens.each { |token,| token.value.force_encoding(encoding) }
 
         raise "Expected to consume all bytes while deserializing" unless @io.eof?
         Prism::ParseResult.new(tokens, comments, magic_comments, errors, warnings, @source)
@@ -132,7 +126,7 @@ module Prism
 
       def load_nodes
         load_header
-        load_force_encoding
+        load_encoding
         load_start_line
 
         comments, magic_comments, errors, warnings = load_metadata
diff --git a/test/prism/ruby_api_test.rb b/test/prism/ruby_api_test.rb
index a61282cca1..cd87a81395 100644
--- a/test/prism/ruby_api_test.rb
+++ b/test/prism/ruby_api_test.rb
@@ -71,6 +71,38 @@ module Prism
       end
     end
 
+    def test_location_character_offsets
+      program = Prism.parse("😀 + 😀\n😍 ||= 😍").value
+
+      # first 😀
+      location = program.statements.body.first.receiver.location
+      assert_equal 0, location.start_character_offset
+      assert_equal 1, location.end_character_offset
+      assert_equal 0, location.start_character_column
+      assert_equal 1, location.end_character_column
+
+      # second 😀
+      location = program.statements.body.first.arguments.arguments.first.location
+      assert_equal 4, location.start_character_offset
+      assert_equal 5, location.end_character_offset
+      assert_equal 4, location.start_character_column
+      assert_equal 5, location.end_character_column
+
+      # first 😍
+      location = program.statements.body.last.name_loc
+      assert_equal 6, location.start_character_offset
+      assert_equal 7, location.end_character_offset
+      assert_equal 0, location.start_character_column
+      assert_equal 1, location.end_character_column
+
+      # second 😍
+      location = program.statements.body.last.value.location
+      assert_equal 12, location.start_character_offset
+      assert_equal 13, location.end_character_offset
+      assert_equal 6, location.start_character_column
+      assert_equal 7, location.end_character_column
+    end
+
     private
 
     def parse_expression(source)
author	Kevin Newton <kddnewton@gmail.com>	2023-11-20 11:07:02 -0500
committer	git <svn-admin@ruby-lang.org>	2023-11-20 16:07:06 +0000
commit	f2ed7eaba0275099842b5b8407250e2d410f2f25 (patch)
tree	f4033a6b792245c785dae837043b632c0c888878
parent	adee7dab3edc3d58cc3d7245398b75ab1de8d077 (diff)