summaryrefslogtreecommitdiff
path: root/spec/ruby/core/string/valid_encoding/utf_8_spec.rb
diff options
context:
space:
mode:
Diffstat (limited to 'spec/ruby/core/string/valid_encoding/utf_8_spec.rb')
-rw-r--r--spec/ruby/core/string/valid_encoding/utf_8_spec.rb214
1 files changed, 214 insertions, 0 deletions
diff --git a/spec/ruby/core/string/valid_encoding/utf_8_spec.rb b/spec/ruby/core/string/valid_encoding/utf_8_spec.rb
new file mode 100644
index 0000000000..a14c3af830
--- /dev/null
+++ b/spec/ruby/core/string/valid_encoding/utf_8_spec.rb
@@ -0,0 +1,214 @@
+# -*- encoding: utf-8 -*-
+require_relative '../../../spec_helper'
+
+describe "String#valid_encoding? and UTF-8" do
+ def utf8(bytes)
+ bytes.pack("C*").force_encoding("UTF-8")
+ end
+
+ describe "1-byte character" do
+ it "is valid if is in format 0xxxxxxx" do
+ utf8([0b00000000]).valid_encoding?.should == true
+ utf8([0b01111111]).valid_encoding?.should == true
+ end
+
+ it "is not valid if is not in format 0xxxxxxx" do
+ utf8([0b10000000]).valid_encoding?.should == false
+ utf8([0b11111111]).valid_encoding?.should == false
+ end
+ end
+
+ describe "2-bytes character" do
+ it "is valid if in format [110xxxxx 10xxxxx]" do
+ utf8([0b11000010, 0b10000000]).valid_encoding?.should == true
+ utf8([0b11000010, 0b10111111]).valid_encoding?.should == true
+
+ utf8([0b11011111, 0b10000000]).valid_encoding?.should == true
+ utf8([0b11011111, 0b10111111]).valid_encoding?.should == true
+ end
+
+ it "is not valid if the first byte is not in format 110xxxxx" do
+ utf8([0b00000010, 0b10000000]).valid_encoding?.should == false
+ utf8([0b00100010, 0b10000000]).valid_encoding?.should == false
+ utf8([0b01000010, 0b10000000]).valid_encoding?.should == false
+ utf8([0b01100010, 0b10000000]).valid_encoding?.should == false
+ utf8([0b10000010, 0b10000000]).valid_encoding?.should == false
+ utf8([0b10100010, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11000010, 0b10000000]).valid_encoding?.should == true # correct bytes
+ utf8([0b11100010, 0b10000000]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the second byte is not in format 10xxxxxx" do
+ utf8([0b11000010, 0b00000000]).valid_encoding?.should == false
+ utf8([0b11000010, 0b01000000]).valid_encoding?.should == false
+ utf8([0b11000010, 0b11000000]).valid_encoding?.should == false
+ end
+
+ it "is not valid if is smaller than [xxxxxx10 xx000000] (codepoints < U+007F, that are encoded with the 1-byte format)" do
+ utf8([0b11000000, 0b10111111]).valid_encoding?.should == false
+ utf8([0b11000001, 0b10111111]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the first byte is missing" do
+ bytes = [0b11000010, 0b10000000]
+ utf8(bytes[1..1]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the second byte is missing" do
+ bytes = [0b11000010, 0b10000000]
+ utf8(bytes[0..0]).valid_encoding?.should == false
+ end
+ end
+
+ describe "3-bytes character" do
+ it "is valid if in format [1110xxxx 10xxxxxx 10xxxxxx]" do
+ utf8([0b11100000, 0b10100000, 0b10000000]).valid_encoding?.should == true
+ utf8([0b11100000, 0b10100000, 0b10111111]).valid_encoding?.should == true
+ utf8([0b11100000, 0b10111111, 0b10111111]).valid_encoding?.should == true
+ utf8([0b11101111, 0b10111111, 0b10111111]).valid_encoding?.should == true
+ end
+
+ it "is not valid if the first byte is not in format 1110xxxx" do
+ utf8([0b00000000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b00010000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b00100000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b00110000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b01000000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b01010000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b01100000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b01110000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b10000000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b10010000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b10100000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b10110000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11000000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11010000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11100000, 0b10100000, 0b10000000]).valid_encoding?.should == true # correct bytes
+ utf8([0b11110000, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the second byte is not in format 10xxxxxx" do
+ utf8([0b11100000, 0b00100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11100000, 0b01100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11100000, 0b11100000, 0b10000000]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the third byte is not in format 10xxxxxx" do
+ utf8([0b11100000, 0b10100000, 0b00000000]).valid_encoding?.should == false
+ utf8([0b11100000, 0b10100000, 0b01000000]).valid_encoding?.should == false
+ utf8([0b11100000, 0b10100000, 0b01000000]).valid_encoding?.should == false
+ end
+
+ it "is not valid if is smaller than [xxxx0000 xx100000 xx000000] (codepoints < U+07FF that are encoded with the 2-byte format)" do
+ utf8([0b11100000, 0b10010000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11100000, 0b10001000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11100000, 0b10000100, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11100000, 0b10000010, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11100000, 0b10000001, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11100000, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ end
+
+ it "is not valid if in range [xxxx1101 xx100000 xx000000] - [xxxx1101 xx111111 xx111111] (codepoints U+D800 - U+DFFF)" do
+ utf8([0b11101101, 0b10100000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11101101, 0b10100000, 0b10000001]).valid_encoding?.should == false
+ utf8([0b11101101, 0b10111111, 0b10111111]).valid_encoding?.should == false
+
+ utf8([0b11101101, 0b10011111, 0b10111111]).valid_encoding?.should == true # lower boundary - 1
+ utf8([0b11101110, 0b10000000, 0b10000000]).valid_encoding?.should == true # upper boundary + 1
+ end
+
+ it "is not valid if the first byte is missing" do
+ bytes = [0b11100000, 0b10100000, 0b10000000]
+ utf8(bytes[2..3]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the second byte is missing" do
+ bytes = [0b11100000, 0b10100000, 0b10000000]
+ utf8([bytes[0], bytes[2]]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the second and the third bytes are missing" do
+ bytes = [0b11100000, 0b10100000, 0b10000000]
+ utf8(bytes[0..0]).valid_encoding?.should == false
+ end
+ end
+
+ describe "4-bytes character" do
+ it "is valid if in format [11110xxx 10xxxxxx 10xxxxxx 10xxxxxx]" do
+ utf8([0b11110000, 0b10010000, 0b10000000, 0b10000000]).valid_encoding?.should == true
+ utf8([0b11110000, 0b10010000, 0b10000000, 0b10111111]).valid_encoding?.should == true
+ utf8([0b11110000, 0b10010000, 0b10111111, 0b10111111]).valid_encoding?.should == true
+ utf8([0b11110000, 0b10111111, 0b10111111, 0b10111111]).valid_encoding?.should == true
+ utf8([0b11110100, 0b10001111, 0b10111111, 0b10111111]).valid_encoding?.should == true
+ end
+
+ it "is not valid if the first byte is not in format 11110xxx" do
+ utf8([0b11100000, 0b10010000, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11010000, 0b10010000, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b10110000, 0b10010000, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b01110000, 0b10010000, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the second byte is not in format 10xxxxxx" do
+ utf8([0b11110000, 0b00010000, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11110000, 0b01010000, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11110000, 0b10010000, 0b10000000, 0b10000000]).valid_encoding?.should == true # correct bytes
+ utf8([0b11110000, 0b11010000, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the third byte is not in format 10xxxxxx" do
+ utf8([0b11110000, 0b10010000, 0b00000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11110000, 0b10010000, 0b01000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11110000, 0b10010000, 0b10000000, 0b10000000]).valid_encoding?.should == true # correct bytes
+ utf8([0b11110000, 0b10010000, 0b11000000, 0b10000000]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the forth byte is not in format 10xxxxxx" do
+ utf8([0b11110000, 0b10010000, 0b10000000, 0b00000000]).valid_encoding?.should == false
+ utf8([0b11110000, 0b10010000, 0b10000000, 0b01000000]).valid_encoding?.should == false
+ utf8([0b11110000, 0b10010000, 0b10000000, 0b10000000]).valid_encoding?.should == true # correct bytes
+ utf8([0b11110000, 0b10010000, 0b10000000, 0b11000000]).valid_encoding?.should == false
+ end
+
+ it "is not valid if is smaller than [xxxxx000 xx001000 xx000000 xx000000] (codepoint < U+10000)" do
+ utf8([0b11110000, 0b10000111, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11110000, 0b10000110, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11110000, 0b10000101, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11110000, 0b10000100, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11110000, 0b10000011, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11110000, 0b10000010, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11110000, 0b10000001, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11110000, 0b10000000, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ end
+
+ it "is not valid if is greater than [xxxxx100 xx001111 xx111111 xx111111] (codepoint > U+10FFFF)" do
+ utf8([0b11110100, 0b10010000, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11110100, 0b10100000, 0b10000000, 0b10000000]).valid_encoding?.should == false
+ utf8([0b11110100, 0b10110000, 0b10000000, 0b10000000]).valid_encoding?.should == false
+
+ utf8([0b11110101, 0b10001111, 0b10111111, 0b10111111]).valid_encoding?.should == false
+ utf8([0b11110110, 0b10001111, 0b10111111, 0b10111111]).valid_encoding?.should == false
+ utf8([0b11110111, 0b10001111, 0b10111111, 0b10111111]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the first byte is missing" do
+ bytes = [0b11110000, 0b10010000, 0b10000000, 0b10000000]
+ utf8(bytes[1..3]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the second byte is missing" do
+ bytes = [0b11110000, 0b10010000, 0b10000000, 0b10000000]
+ utf8([bytes[0], bytes[2], bytes[3]]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the second and the third bytes are missing" do
+ bytes = [0b11110000, 0b10010000, 0b10000000, 0b10000000]
+ utf8([bytes[0], bytes[3]]).valid_encoding?.should == false
+ end
+
+ it "is not valid if the second, the third and the fourth bytes are missing" do
+ bytes = [0b11110000, 0b10010000, 0b10000000, 0b10000000]
+ utf8(bytes[0..0]).valid_encoding?.should == false
+ end
+ end
+end