1 files changed, 119 insertions, 16 deletions
diff --git a/spec/ruby/optional/capi/encoding_spec.rb b/spec/ruby/optional/capi/encoding_spec.rb
index 66c2dd40de..1529e012b0 100644
--- a/spec/ruby/optional/capi/encoding_spec.rb
+++ b/spec/ruby/optional/capi/encoding_spec.rb
@@ -1,8 +1,9 @@
 # -*- encoding: utf-8 -*-
+# frozen_string_literal: false
 require_relative 'spec_helper'
 require_relative 'fixtures/encoding'
 
-load_extension('encoding')
+extension_path = load_extension('encoding') # kept so the "too many encodings" example can require the extension from its generated script
 
 describe :rb_enc_get_index, shared: true do
   it "returns the index of the encoding of a String" do
@@ -63,6 +64,48 @@ describe "C-API Encoding function" do
     end
   end
 
+  describe "rb_enc_strlen" do
+    before :each do
+      @str = 'こにちわ' # 4 codepoints, each 3 bytes in UTF-8 (12 bytes in total)
+    end
+
+    it "returns the correct string length for the encoding" do
+      @s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_8).should == 4 # 12 bytes / 3 bytes per character
+      @s.rb_enc_strlen(@str, @str.bytesize, Encoding::BINARY).should == 12 # one character per byte
+    end
+
+    it "returns the string length based on a fixed-width encoding's character length, even if the encoding is incompatible" do
+      @s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_16BE).should == 6 # 12 bytes / 2 bytes per unit
+      @s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_16LE).should == 6
+      @s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_32BE).should == 3 # 12 bytes / 4 bytes per character
+      @s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_32LE).should == 3
+    end
+
+    it "does not consider strings to be NUL-terminated" do
+      s = "abc\0def"
+      @s.rb_enc_strlen(s, s.bytesize, Encoding::US_ASCII).should == 7 # the NUL and the bytes after it are counted
+      @s.rb_enc_strlen(s, s.bytesize, Encoding::UTF_8).should == 7
+    end
+
+    describe "handles broken strings" do
+      it "combines valid character and invalid character counts in UTF-8" do
+        # Of the 5 bytes passed, the first 3 form one valid character and the remaining 2 are an
+        # incomplete character; `rb_enc_strlen` counts each of those bytes individually: 1 + 2 = 3.
+        @s.rb_enc_strlen(@str, 5, Encoding::UTF_8).should == 3
+      end
+
+      it "combines valid character and invalid character counts in UTF-16" do
+        @s.rb_enc_strlen(@str, 5, Encoding::UTF_16BE).should == 3 # NOTE(review): presumably 2 complete 2-byte units + 1 leftover byte — confirm
+      end
+
+      it "rounds up for fixed-width encodings" do
+        @s.rb_enc_strlen(@str, 7, Encoding::UTF_32BE).should == 2 # 7 bytes / 4 bytes per character, rounded up
+        @s.rb_enc_strlen(@str, 7, Encoding::UTF_32LE).should == 2
+        @s.rb_enc_strlen(@str, 5, Encoding::BINARY).should == 5 # one character per byte, so nothing to round
+      end
+    end
+  end
+
   describe "rb_enc_find" do
     it "returns the encoding of an Encoding" do
       @s.rb_enc_find("UTF-8").should == "UTF-8"
@@ -128,10 +171,16 @@ describe "C-API Encoding function" do
 
   describe "rb_enc_mbc_to_codepoint" do
     it "returns the correct codepoint for the given character and size" do
-       @s.rb_enc_mbc_to_codepoint("é", 2).should == 0x00E9
-       @s.rb_enc_mbc_to_codepoint("éa", 2).should == 0x00E9
-       @s.rb_enc_mbc_to_codepoint("éa", 1).should == 0xC3
-       @s.rb_enc_mbc_to_codepoint("éa", 3).should == 0x00E9
+       @s.rb_enc_mbc_to_codepoint("é").should == 0xE9 # "é" is the bytes C3 A9 in UTF-8 and decodes to U+00E9
+    end
+
+    it "returns 0 if p == e" do
+      @s.rb_enc_mbc_to_codepoint("").should == 0 # empty input; NOTE(review): presumably p/e are the C-level start/end pointers — confirm
+    end
+
+    it "returns the raw byte if incomplete character in UTF-8" do
+      @s.rb_enc_mbc_to_codepoint("\xC3").should == 0xC3 # lead byte whose continuation byte is missing
+      @s.rb_enc_mbc_to_codepoint("\x80").should == 0x80 # continuation byte with no lead byte
     end
   end
 
@@ -511,19 +560,19 @@ describe "C-API Encoding function" do
 
   describe "rb_ascii8bit_encindex" do
     it "returns an index for the ASCII-8BIT encoding" do
-      @s.rb_ascii8bit_encindex().should >= 0
+      @s.rb_ascii8bit_encindex().should == 0 # pins the exact index instead of accepting any non-negative value
     end
   end
 
   describe "rb_utf8_encindex" do
     it "returns an index for the UTF-8 encoding" do
-      @s.rb_utf8_encindex().should >= 0
+      @s.rb_utf8_encindex().should == 1 # pins the exact index instead of accepting any non-negative value
     end
   end
 
   describe "rb_usascii_encindex" do
     it "returns an index for the US-ASCII encoding" do
-      @s.rb_usascii_encindex().should >= 0
+      @s.rb_usascii_encindex().should == 2 # pins the exact index instead of accepting any non-negative value
     end
   end
 
@@ -609,27 +658,58 @@ describe "C-API Encoding function" do
     end
   end
 
+  describe "rb_enc_raise" do
+    it "forces exception message encoding to the specified one" do
+      utf_8_incompatible_string = "\x81".b # binary string holding one byte that is not valid UTF-8 on its own
+
+      -> {
+        @s.rb_enc_raise(Encoding::UTF_8, RuntimeError, utf_8_incompatible_string)
+      }.should raise_error { |e|
+        e.message.encoding.should == Encoding::UTF_8 # message is tagged with the requested encoding
+        e.message.valid_encoding?.should == false # the tag is applied even though the content is invalid for it
+        e.message.bytes.should == utf_8_incompatible_string.bytes # the bytes themselves are left untouched
+      }
+    end
+  end
+
   describe "rb_uv_to_utf8" do
     it 'converts a Unicode codepoint to a UTF-8 C string' do
       str = ' ' * 6
       {
-        0  => "\x01",
-        0x7f => "\xC2\x80",
-        0x7ff => "\xE0\xA0\x80",
-        0xffff => "\xF0\x90\x80\x80",
-        0x1fffff => "\xF8\x88\x80\x80\x80",
-        0x3ffffff => "\xFC\x84\x80\x80\x80\x80",
+        1  => "\x01", # 1-byte form
+        0x80 => "\xC2\x80", # smallest codepoint that needs 2 bytes
+        0x800 => "\xE0\xA0\x80", # smallest codepoint that needs 3 bytes
+        0x10000 => "\xF0\x90\x80\x80", # smallest codepoint that needs 4 bytes
+        0x200000 => "\xF8\x88\x80\x80\x80", # 5-byte form
+        0x4000000 => "\xFC\x84\x80\x80\x80\x80", # 6-byte form, which fills the 6-byte buffer
       }.each do |num, result|
-        len = @s.rb_uv_to_utf8(str, num + 1)
-        str[0..len-1].should == result
+        len = @s.rb_uv_to_utf8(str, num) # table keys are the codepoints themselves, no + 1 adjustment
+        str.byteslice(0, len).should == result # compare the first len bytes, not the first len characters
       end
     end
   end
 
+  describe "rb_enc_left_char_head" do
+    it 'returns the head position of a character' do
+      @s.rb_enc_left_char_head("é", 1).should == 0 # byte 1 is the second byte of "é", whose head is byte 0
+      @s.rb_enc_left_char_head("éééé", 7).should == 6 # byte 7 is inside the fourth "é", which starts at byte 6
+
+      @s.rb_enc_left_char_head("a", 0).should == 0
+
+      # position 1 is one past the end of the 1-byte string; unclear if this is intended to work
+      @s.rb_enc_left_char_head("a", 1).should == 1
+
+      # Works because for single-byte encodings rb_enc_left_char_head() just returns the pointer
+      @s.rb_enc_left_char_head("a".force_encoding(Encoding::US_ASCII), 88).should == 88 # 88 is far past the end of "a"
+      @s.rb_enc_left_char_head("a".b, 88).should == 88
+    end
+  end
+
   describe "ONIGENC_MBC_CASE_FOLD" do
     it "returns the correct case fold for the given string" do
       @s.ONIGENC_MBC_CASE_FOLD("lower").should == ["l", 1]
       @s.ONIGENC_MBC_CASE_FOLD("Upper").should == ["u", 1]
+      @s.ONIGENC_MBC_CASE_FOLD("ABC"[1..-1]).should == ["b", 1] # input is "BC"; NOTE(review): presumably covers a string obtained by slicing another — confirm
     end
 
     it "works with other encodings" do
@@ -642,4 +722,27 @@ describe "C-API Encoding function" do
       str.bytes.should == [0, 0x24]
     end
   end
+
+  describe "rb_define_dummy_encoding" do
+    it "defines the dummy encoding" do
+      @s.rb_define_dummy_encoding("FOO")
+      enc = Encoding.find("FOO") # the new encoding must be findable by name from Ruby
+      enc.should.dummy?
+    end
+
+    it "returns the index of the dummy encoding" do
+      index = @s.rb_define_dummy_encoding("BAR")
+      index.should == Encoding.list.size - 1 # expects the returned index to be the last position of Encoding.list
+    end
+
+    ruby_version_is "3.2" do # NOTE(review): presumably restricts the example to Ruby 3.2 and later — confirm against the mspec guard
+      it "raises EncodingError if too many encodings" do # defines 1_000 encodings in a script run through ruby_exe
+        code = <<-RUBY
+          require #{extension_path.dump}
+          1_000.times {|i| CApiEncodingSpecs.new.rb_define_dummy_encoding("R_\#{i}") }
+        RUBY
+        ruby_exe(code, args: "2>&1", exit_status: 1).should.include?('too many encoding (> 256) (EncodingError)') # exit status 1 expected; 2>&1 puts the error text in the output
+      end
+    end
+  end
 end