diff options
Diffstat (limited to 'spec/ruby/optional/capi/encoding_spec.rb')
| -rw-r--r-- | spec/ruby/optional/capi/encoding_spec.rb | 284 |
1 files changed, 225 insertions, 59 deletions
diff --git a/spec/ruby/optional/capi/encoding_spec.rb b/spec/ruby/optional/capi/encoding_spec.rb index 93bde54069..b77a967b1e 100644 --- a/spec/ruby/optional/capi/encoding_spec.rb +++ b/spec/ruby/optional/capi/encoding_spec.rb @@ -1,8 +1,9 @@ # -*- encoding: utf-8 -*- +# frozen_string_literal: false require_relative 'spec_helper' require_relative 'fixtures/encoding' -load_extension('encoding') +extension_path = load_extension('encoding') describe :rb_enc_get_index, shared: true do it "returns the index of the encoding of a String" do @@ -31,13 +32,11 @@ describe :rb_enc_set_index, shared: true do result.first.should == result.last end - ruby_version_is "2.6" do - it "raises an ArgumentError for a non-encoding capable object" do - obj = Object.new - -> { - result = @s.send(@method, obj, 1) - }.should raise_error(ArgumentError, "cannot set encoding on non-encoding capable object") - end + it "raises an ArgumentError for a non-encoding capable object" do + obj = Object.new + -> { + result = @s.send(@method, obj, 1) + }.should.raise(ArgumentError, "cannot set encoding on non-encoding capable object") end end @@ -48,13 +47,11 @@ describe "C-API Encoding function" do @s = CApiEncodingSpecs.new end - ruby_version_is "2.6" do - describe "rb_enc_alias" do - it "creates an alias for an existing Encoding" do - name = "ZOMGWTFBBQ#{@n += 1}" - @s.rb_enc_alias(name, "UTF-8").should >= 0 - Encoding.find(name).name.should == "UTF-8" - end + describe "rb_enc_alias" do + it "creates an alias for an existing Encoding" do + name = "ZOMGWTFBBQ#{@n += 1}" + @s.rb_enc_alias(name, "UTF-8").should >= 0 + Encoding.find(name).name.should == "UTF-8" end end @@ -67,6 +64,48 @@ describe "C-API Encoding function" do end end + describe "rb_enc_strlen" do + before :each do + @str = 'こにちわ' # Each codepoint in this string is 3 bytes in UTF-8 + end + + it "returns the correct string length for the encoding" do + @s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_8).should == 4 + @s.rb_enc_strlen(@str, @str.bytesize, Encoding::BINARY).should == 12 + end + + it "returns the string length based on a fixed-width encoding's character length, even if the encoding is incompatible" do + @s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_16BE).should == 6 + @s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_16LE).should == 6 + @s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_32BE).should == 3 + @s.rb_enc_strlen(@str, @str.bytesize, Encoding::UTF_32LE).should == 3 + end + + it "does not consider strings to be NUL-terminated" do + s = "abc\0def" + @s.rb_enc_strlen(s, s.bytesize, Encoding::US_ASCII).should == 7 + @s.rb_enc_strlen(s, s.bytesize, Encoding::UTF_8).should == 7 + end + + describe "handles broken strings" do + it "combines valid character and invalid character counts in UTF-8" do + # The result is 3 because `rb_enc_strlen` counts the first valid character and then adds + # the byte count for the invalid character that follows for 1 + 2. + @s.rb_enc_strlen(@str, 5, Encoding::UTF_8).should == 3 + end + + it "combines valid character and invalid character counts in UTF-16" do + @s.rb_enc_strlen(@str, 5, Encoding::UTF_16BE).should == 3 + end + + it "rounds up for fixed-width encodings" do + @s.rb_enc_strlen(@str, 7, Encoding::UTF_32BE).should == 2 + @s.rb_enc_strlen(@str, 7, Encoding::UTF_32LE).should == 2 + @s.rb_enc_strlen(@str, 5, Encoding::BINARY).should == 5 + end + end + end + describe "rb_enc_find" do it "returns the encoding of an Encoding" do @s.rb_enc_find("UTF-8").should == "UTF-8" @@ -126,16 +165,22 @@ describe "C-API Encoding function" do describe "rb_enc_from_index" do it "returns an Encoding" do - @s.rb_enc_from_index(0).should be_an_instance_of(String) + @s.rb_enc_from_index(0).should.instance_of?(String) end end describe "rb_enc_mbc_to_codepoint" do it "returns the correct codepoint for the given character and size" do - @s.rb_enc_mbc_to_codepoint("é", 2).should == 0x00E9 - @s.rb_enc_mbc_to_codepoint("éa", 2).should == 0x00E9 - @s.rb_enc_mbc_to_codepoint("éa", 1).should == 0xC3 - @s.rb_enc_mbc_to_codepoint("éa", 3).should == 0x00E9 + @s.rb_enc_mbc_to_codepoint("é").should == 0xE9 + end + + it "returns 0 if p == e" do + @s.rb_enc_mbc_to_codepoint("").should == 0 + end + + it "returns the raw byte if incomplete character in UTF-8" do + @s.rb_enc_mbc_to_codepoint("\xC3").should == 0xC3 + @s.rb_enc_mbc_to_codepoint("\x80").should == 0x80 end end @@ -239,11 +284,9 @@ describe "C-API Encoding function" do @s.rb_enc_get_index(1).should == -1 end - ruby_version_is "2.6" do - it "returns -1 for an object without an encoding" do - obj = Object.new - @s.rb_enc_get_index(obj).should == -1 - end + it "returns -1 for an object without an encoding" do + obj = Object.new + @s.rb_enc_get_index(obj).should == -1 end end @@ -255,7 +298,7 @@ describe "C-API Encoding function" do it "returns a String in US-ASCII encoding when high bits are set" do xEE = [0xEE].pack('C').force_encoding('utf-8') result = @s.rb_enc_str_new(xEE, 1, Encoding::US_ASCII) - result.encoding.should equal(Encoding::US_ASCII) + result.encoding.should.equal?(Encoding::US_ASCII) end end @@ -346,12 +389,12 @@ describe "C-API Encoding function" do it "returns true if the object encoding is only ASCII" do str = "abc".force_encoding("us-ascii") str.valid_encoding? # make sure to set the coderange - @s.ENC_CODERANGE_ASCIIONLY(str).should be_true + @s.ENC_CODERANGE_ASCIIONLY(str).should == true end it "returns false if the object encoding is not ASCII only" do str = "ありがとう".force_encoding("utf-8") - @s.ENC_CODERANGE_ASCIIONLY(str).should be_false + @s.ENC_CODERANGE_ASCIIONLY(str).should == false end end @@ -378,7 +421,7 @@ describe "C-API Encoding function" do describe "when the rb_encoding struct is stored in native memory" do it "can still read the name of the encoding" do address = @s.rb_to_encoding_native_store(Encoding::UTF_8) - address.should be_kind_of(Integer) + address.should.is_a?(Integer) @s.rb_to_encoding_native_name(address).should == "UTF-8" end end @@ -409,7 +452,7 @@ describe "C-API Encoding function" do describe "rb_enc_compatible" do it "returns 0 if the encodings of the Strings are not compatible" do a = [0xff].pack('C').force_encoding "binary" - b = "\u3042".encode("utf-8") + b = "あ" @s.rb_enc_compatible(a, b).should == 0 end @@ -418,11 +461,25 @@ describe "C-API Encoding function" do # Encoding.compatible? it "returns the same value as Encoding.compatible? if the Strings have a compatible encoding" do a = "abc".force_encoding("us-ascii") - b = "\u3042".encode("utf-8") + b = "あ" @s.rb_enc_compatible(a, b).should == Encoding.compatible?(a, b) end end + describe "rb_enc_check" do + it "returns the compatible encoding of the two Strings" do + a = "abc".force_encoding("us-ascii") + b = "あ" + @s.rb_enc_check(a, b).should == Encoding::UTF_8 + end + + it "raises Encoding::CompatibilityError if the encodings are not compatible" do + a = [0xff].pack('C').b + b = "あ" + -> { @s.rb_enc_check(a, b) }.should.raise(Encoding::CompatibilityError) + end + end + describe "rb_enc_copy" do before :each do @obj = "rb_enc_copy".encode(Encoding::US_ASCII) @@ -432,12 +489,20 @@ describe "C-API Encoding function" do @s.rb_enc_copy("string", @obj).encoding.should == Encoding::US_ASCII end - it "raises a RuntimeError if the second argument is a Symbol" do - -> { @s.rb_enc_copy(:symbol, @obj) }.should raise_error(RuntimeError) + it "raises a RuntimeError if the first argument is a Symbol" do + -> { @s.rb_enc_copy(:symbol, @obj) }.should.raise(RuntimeError) + end + + ruby_version_is "4.1" do + it "raises a FrozenError if the first argument is a Regexp" do + -> { @s.rb_enc_copy(/regexp/.dup, @obj) }.should.raise(FrozenError) + end end - it "sets the encoding of a Regexp to that of the second argument" do - @s.rb_enc_copy(/regexp/.dup, @obj).encoding.should == Encoding::US_ASCII + ruby_version_is ""..."4.1" do + it "sets the encoding of a Regexp to that of the second argument" do + @s.rb_enc_copy(/regexp/.dup, @obj).encoding.should == Encoding::US_ASCII + end end end @@ -452,7 +517,7 @@ describe "C-API Encoding function" do it "returns 0 if Encoding.default_internal is nil" do Encoding.default_internal = nil - @s.rb_default_internal_encoding.should be_nil + @s.rb_default_internal_encoding.should == nil end it "returns the encoding for Encoding.default_internal" do @@ -484,11 +549,19 @@ describe "C-API Encoding function" do end it "raises a RuntimeError if the argument is Symbol" do - -> { @s.rb_enc_associate(:symbol, "US-ASCII") }.should raise_error(RuntimeError) + -> { @s.rb_enc_associate(:symbol, "US-ASCII") }.should.raise(RuntimeError) end - it "sets the encoding of a Regexp to the encoding" do - @s.rb_enc_associate(/regexp/.dup, "BINARY").encoding.should == Encoding::BINARY + ruby_version_is "4.1" do + it "raises a FrozenError if the argument is a Regexp" do + -> { @s.rb_enc_associate(/regexp/.dup, "BINARY") }.should.raise(FrozenError) + end + end + + ruby_version_is ""..."4.1" do + it "sets the encoding of a Regexp to the encoding" do + @s.rb_enc_associate(/regexp/.dup, "BINARY").encoding.should == Encoding::BINARY + end end it "sets the encoding of a String to a default when the encoding is NULL" do @@ -503,33 +576,42 @@ describe "C-API Encoding function" do enc.should == Encoding::BINARY end - it "sets the encoding of a Regexp to the encoding" do - index = @s.rb_enc_find_index("UTF-8") - enc = @s.rb_enc_associate_index(/regexp/.dup, index).encoding - enc.should == Encoding::UTF_8 + ruby_version_is "4.1" do + it "raises a FrozenError if the argument is a Regexp" do + index = @s.rb_enc_find_index("UTF-8") + -> { @s.rb_enc_associate_index(/regexp/.dup, index) }.should.raise(FrozenError) + end + end + + ruby_version_is ""..."4.1" do + it "sets the encoding of a Regexp to the encoding" do + index = @s.rb_enc_find_index("UTF-8") + enc = @s.rb_enc_associate_index(/regexp/.dup, index).encoding + enc.should == Encoding::UTF_8 + end end it "sets the encoding of a Symbol to the encoding" do index = @s.rb_enc_find_index("UTF-8") - -> { @s.rb_enc_associate_index(:symbol, index) }.should raise_error(RuntimeError) + -> { @s.rb_enc_associate_index(:symbol, index) }.should.raise(RuntimeError) end end describe "rb_ascii8bit_encindex" do it "returns an index for the ASCII-8BIT encoding" do - @s.rb_ascii8bit_encindex().should >= 0 + @s.rb_ascii8bit_encindex().should == 0 end end describe "rb_utf8_encindex" do it "returns an index for the UTF-8 encoding" do - @s.rb_utf8_encindex().should >= 0 + @s.rb_utf8_encindex().should == 1 end end describe "rb_usascii_encindex" do it "returns an index for the US-ASCII encoding" do - @s.rb_usascii_encindex().should >= 0 + @s.rb_usascii_encindex().should == 2 end end @@ -567,13 +649,13 @@ describe "C-API Encoding function" do it "raises ArgumentError if an empty string is given" do -> do @s.rb_enc_codepoint_len("") - end.should raise_error(ArgumentError) + end.should.raise(ArgumentError) end it "raises ArgumentError if an invalid byte sequence is given" do -> do @s.rb_enc_codepoint_len([0xa0, 0xa1].pack('CC').force_encoding('utf-8')) # Invalid sequence identifier - end.should raise_error(ArgumentError) + end.should.raise(ArgumentError) end it "returns codepoint 0x24 and length 1 for character '$'" do @@ -607,11 +689,25 @@ describe "C-API Encoding function" do describe "rb_enc_str_asciionly_p" do it "returns true for an ASCII string" do - @s.rb_enc_str_asciionly_p("hello").should be_true + @s.rb_enc_str_asciionly_p("hello").should == true end it "returns false for a non-ASCII string" do - @s.rb_enc_str_asciionly_p("hüllo").should be_false + @s.rb_enc_str_asciionly_p("hüllo").should == false + end + end + + describe "rb_enc_raise" do + it "forces exception message encoding to the specified one" do + utf_8_incompatible_string = "\x81".b + + -> { + @s.rb_enc_raise(Encoding::UTF_8, RuntimeError, utf_8_incompatible_string) + }.should.raise { |e| + e.message.encoding.should == Encoding::UTF_8 + e.message.valid_encoding?.should == false + e.message.bytes.should == utf_8_incompatible_string.bytes + } end end @@ -619,23 +715,40 @@ describe "C-API Encoding function" do it 'converts a Unicode codepoint to a UTF-8 C string' do str = ' ' * 6 { - 0 => "\x01", - 0x7f => "\xC2\x80", - 0x7ff => "\xE0\xA0\x80", - 0xffff => "\xF0\x90\x80\x80", - 0x1fffff => "\xF8\x88\x80\x80\x80", - 0x3ffffff => "\xFC\x84\x80\x80\x80\x80", + 1 => "\x01", + 0x80 => "\xC2\x80", + 0x800 => "\xE0\xA0\x80", + 0x10000 => "\xF0\x90\x80\x80", + 0x200000 => "\xF8\x88\x80\x80\x80", + 0x4000000 => "\xFC\x84\x80\x80\x80\x80", }.each do |num, result| - len = @s.rb_uv_to_utf8(str, num + 1) - str[0..len-1].should == result + len = @s.rb_uv_to_utf8(str, num) + str.byteslice(0, len).should == result end end end + describe "rb_enc_left_char_head" do + it 'returns the head position of a character' do + @s.rb_enc_left_char_head("é", 1).should == 0 + @s.rb_enc_left_char_head("éééé", 7).should == 6 + + @s.rb_enc_left_char_head("a", 0).should == 0 + + # unclear if this is intended to work + @s.rb_enc_left_char_head("a", 1).should == 1 + + # Works because for single-byte encodings rb_enc_left_char_head() just returns the pointer + @s.rb_enc_left_char_head("a".force_encoding(Encoding::US_ASCII), 88).should == 88 + @s.rb_enc_left_char_head("a".b, 88).should == 88 + end + end + describe "ONIGENC_MBC_CASE_FOLD" do it "returns the correct case fold for the given string" do @s.ONIGENC_MBC_CASE_FOLD("lower").should == ["l", 1] @s.ONIGENC_MBC_CASE_FOLD("Upper").should == ["u", 1] + @s.ONIGENC_MBC_CASE_FOLD("ABC"[1..-1]).should == ["b", 1] end it "works with other encodings" do @@ -648,4 +761,57 @@ describe "C-API Encoding function" do str.bytes.should == [0, 0x24] end end + + describe "rb_define_dummy_encoding" do + run = 0 + + it "defines the dummy encoding" do + @s.rb_define_dummy_encoding("FOO#{run += 1}") + enc = Encoding.find("FOO#{run}") + enc.should.dummy? + end + + it "returns the index of the dummy encoding" do + index = @s.rb_define_dummy_encoding("BAR#{run += 1}") + index.should == Encoding.list.size - 1 + end + + it "raises EncodingError if too many encodings" do + code = <<-RUBY + require #{extension_path.dump} + 1_000.times {|i| CApiEncodingSpecs.new.rb_define_dummy_encoding("R_\#{i}") } + RUBY + ruby_exe(code, args: "2>&1", exit_status: 1).should.include?('too many encoding (> 256) (EncodingError)') + end + end + + describe "ONIGENC_IS_UNICODE" do + it "is true only for select UTF-related encodings" do + unicode = [ + Encoding::UTF_8, + Encoding::UTF8_DOCOMO, + Encoding::UTF8_KDDI, + Encoding::UTF8_MAC, + Encoding::UTF8_SOFTBANK, + Encoding::CESU_8, + Encoding::UTF_16LE, + Encoding::UTF_16BE, + Encoding::UTF_32LE, + Encoding::UTF_32BE + ] + unicode.each do |enc| + @s.should.ONIGENC_IS_UNICODE(enc) + end + + (Encoding.list - unicode).each { |enc| + @s.should_not.ONIGENC_IS_UNICODE(enc) + } + end + + # Redundant with the above but more explicit + it "is false for the dummy UTF-16 and UTF-32 encodings" do + @s.should_not.ONIGENC_IS_UNICODE(Encoding::UTF_16) + @s.should_not.ONIGENC_IS_UNICODE(Encoding::UTF_32) + end + end end |
