4 files changed, 466 insertions, 0 deletions
diff --git a/test/prism/encoding/encodings_test.rb b/test/prism/encoding/encodings_test.rb
new file mode 100644
index 0000000000..b008fc3fa1
--- /dev/null
+++ b/test/prism/encoding/encodings_test.rb
@@ -0,0 +1,91 @@
+# frozen_string_literal: true
+
+return if RUBY_ENGINE != "ruby"
+
+require_relative "../test_helper"
+
+module Prism
+  class EncodingsTest < TestCase
+    class ConstantContext < BasicObject # receiver used to instance_eval sources that consist of a constant
+      def self.const_missing(const) # const: name of the constant that could not be resolved
+        const # hand the name back unchanged instead of raising NameError
+      end
+    end
+
+    class IdentifierContext < BasicObject # receiver used to instance_eval sources that consist of an identifier
+      def method_missing(name, *) # name: the method that was called; any arguments are ignored
+        name # hand the name back unchanged instead of raising NoMethodError
+      end
+    end
+
+    # These test that we're correctly parsing codepoints for each alias of each
+    # encoding that prism supports.
+    each_encoding do |encoding, range|
+      (encoding.names - %w[external internal filesystem locale]).each do |name|
+        define_method(:"test_encoding_#{name}") do
+          assert_encoding(encoding, name, range)
+        end
+      end
+    end
+
+    private
+
+    def assert_encoding_constant(name, character)
+      source = "# encoding: #{name}\n#{character}"
+      expected = ConstantContext.new.instance_eval(source)
+
+      result = Prism.parse(source)
+      assert result.success?
+
+      actual = result.value.statements.body.last
+      assert_kind_of ConstantReadNode, actual
+      assert_equal expected, actual.name
+    end
+
+    def assert_encoding_identifier(name, character)
+      source = "# encoding: #{name}\n#{character}"
+      expected = IdentifierContext.new.instance_eval(source)
+
+      result = Prism.parse(source)
+      assert result.success?
+
+      actual = result.value.statements.body.last
+      assert_kind_of CallNode, actual
+      assert_equal expected, actual.name
+    end
+
+    # Check that we can properly parse every codepoint in the given encoding.
+    def assert_encoding(encoding, name, range)
+      unicode = false
+
+      case encoding
+      when Encoding::UTF_8, Encoding::UTF_8_MAC, Encoding::UTF8_DoCoMo, Encoding::UTF8_KDDI, Encoding::UTF8_SoftBank, Encoding::CESU_8
+        unicode = true
+      when Encoding::Windows_1253
+        range = range.to_a - [0xb5]
+      end
+
+      range.each do |codepoint|
+        character = codepoint.chr(encoding)
+
+        if character.match?(/[[:alpha:]]/)
+          if character.match?(/[[:upper:]]/) || (unicode && character.match?(Regexp.new("\\p{Lt}".encode(encoding))))
+            assert_encoding_constant(name, character)
+          else
+            assert_encoding_identifier(name, character)
+          end
+        elsif character.match?(/[[:alnum:]]/)
+          assert_encoding_identifier(name, "_#{character}")
+        else
+          next if ["/", "{"].include?(character)
+
+          source = "# encoding: #{name}\n/(?##{character})/\n"
+          assert Prism.parse_success?(source), "Expected #{source.inspect} to parse successfully."
+        end
+      rescue RangeError
+        source = "# encoding: #{name}\n\\x#{codepoint.to_s(16)}"
+        assert Prism.parse_failure?(source)
+      end
+    end
+  end
+end
diff --git a/test/prism/encoding/regular_expression_encoding_test.rb b/test/prism/encoding/regular_expression_encoding_test.rb
new file mode 100644
index 0000000000..e2daae1d7f
--- /dev/null
+++ b/test/prism/encoding/regular_expression_encoding_test.rb
@@ -0,0 +1,131 @@
+# frozen_string_literal: true
+
+return unless defined?(RubyVM::InstructionSequence)
+return if RubyVM::InstructionSequence.compile("").to_a[4][:parser] == :prism
+
+require_relative "../test_helper"
+
+module Prism
+  class RegularExpressionEncodingTest < TestCase
+    each_encoding do |encoding, _|
+      define_method(:"test_regular_expression_encoding_flags_#{encoding.name}") do
+        assert_regular_expression_encoding_flags(encoding, ["/a/", "/ą/", "//"])
+      end
+
+      escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"]
+      escapes = escapes.concat(escapes.product(escapes).map(&:join))
+
+      define_method(:"test_regular_expression_escape_encoding_flags_#{encoding.name}") do
+        assert_regular_expression_encoding_flags(encoding, escapes.map { |e| "/#{e}/" })
+      end
+
+      ["n", "u", "e", "s"].each do |modifier|
+        define_method(:"test_regular_expression_encoding_modifiers_/#{modifier}_#{encoding.name}") do
+          regexp_sources = ["abc", "garçon", "\\x80", "gar\\xC3\\xA7on", "gar\\u{E7}on", "abc\\u{FFFFFF}", "\\x80\\u{80}" ]
+
+          assert_regular_expression_encoding_flags(
+            encoding,
+            regexp_sources.product(["n", "u", "e", "s"]).map { |r, modifier| "/#{r}/#{modifier}" }
+          )
+        end
+      end
+    end
+
+    private
+
+    def assert_regular_expression_encoding_flags(encoding, regexps)
+      regexps.each do |regexp|
+        regexp_modifier_used = regexp.end_with?("/u") || regexp.end_with?("/e") || regexp.end_with?("/s") || regexp.end_with?("/n")
+        source = "# encoding: #{encoding.name}\n#{regexp}"
+
+        encoding_errors = ["invalid multibyte char", "escaped non ASCII character in UTF-8 regexp", "differs from source encoding"]
+        skipped_errors = ["invalid multibyte escape", "incompatible character encoding", "UTF-8 character in non UTF-8 regexp", "invalid Unicode range", "invalid Unicode list"]
+
+        # TODO (nirvdrum 21-Feb-2024): Prism currently does not handle Regexp validation unless modifiers are used. So, skip processing those errors for now: https://github.com/ruby/prism/issues/2104
+        unless regexp_modifier_used
+          skipped_errors += encoding_errors
+          encoding_errors.clear
+        end
+
+        expected =
+          begin
+            eval(source).encoding
+          rescue SyntaxError => error
+            if encoding_errors.find { |e| error.message.include?(e) }
+              error.message.split("\n").map { |m| m[/: (.+?)$/, 1] }
+            elsif skipped_errors.find { |e| error.message.include?(e) }
+              next
+            else
+              raise
+            end
+          end
+
+        actual =
+          Prism.parse(source).then do |result|
+            if result.success?
+              regexp = result.statement
+
+              actual_encoding = if regexp.forced_utf8_encoding?
+                Encoding::UTF_8
+              elsif regexp.forced_binary_encoding?
+                Encoding::ASCII_8BIT
+              elsif regexp.forced_us_ascii_encoding?
+                Encoding::US_ASCII
+              elsif regexp.ascii_8bit?
+                Encoding::ASCII_8BIT
+              elsif regexp.utf_8?
+                Encoding::UTF_8
+              elsif regexp.euc_jp?
+                Encoding::EUC_JP
+              elsif regexp.windows_31j?
+                Encoding::Windows_31J
+              else
+                encoding
+              end
+
+              if regexp.utf_8? && actual_encoding != Encoding::UTF_8
+                raise "expected regexp encoding to be UTF-8 due to '/u' modifier, but got #{actual_encoding.name}"
+              elsif regexp.ascii_8bit? && (actual_encoding != Encoding::ASCII_8BIT && actual_encoding != Encoding::US_ASCII)
+                raise "expected regexp encoding to be ASCII-8BIT or US-ASCII due to '/n' modifier, but got #{actual_encoding.name}"
+              elsif regexp.euc_jp? && actual_encoding != Encoding::EUC_JP
+                raise "expected regexp encoding to be EUC-JP due to '/e' modifier, but got #{actual_encoding.name}"
+              elsif regexp.windows_31j? && actual_encoding != Encoding::Windows_31J
+                raise "expected regexp encoding to be Windows-31J due to '/s' modifier, but got #{actual_encoding.name}"
+              end
+
+              if regexp.utf_8? && regexp.forced_utf8_encoding?
+                raise "the forced_utf8 flag should not be set when the UTF-8 modifier (/u) is used"
+              elsif regexp.ascii_8bit? && regexp.forced_binary_encoding?
+                raise "the forced_ascii_8bit flag should not be set when the UTF-8 modifier (/u) is used"
+              end
+
+              actual_encoding
+            else
+              errors = result.errors.map(&:message)
+
+              if errors.last&.include?("UTF-8 mixed within")
+                nil
+              else
+                errors
+              end
+            end
+          end
+
+        # TODO (nirvdrum 22-Feb-2024): Remove this workaround once Prism better maps CRuby's error messages.
+        # This class of error message is tricky. The part not being compared is a representation of the regexp.
+        # Depending on the source encoding and any encoding modifiers being used, CRuby alters how the regexp is represented.
+        # Sometimes it's an MBC string. Other times it uses hexadecimal character escapes. And in other cases it uses
+        # the long-form Unicode escape sequences. This short-circuit checks that the error message is mostly correct.
+        if expected.is_a?(Array) && actual.is_a?(Array)
+          if expected.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:") &&
+              actual.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:")
+            expected.pop
+            actual.pop
+          end
+        end
+
+        assert_equal expected, actual
+      end
+    end
+  end
+end
diff --git a/test/prism/encoding/string_encoding_test.rb b/test/prism/encoding/string_encoding_test.rb
new file mode 100644
index 0000000000..6f9d86df3b
--- /dev/null
+++ b/test/prism/encoding/string_encoding_test.rb
@@ -0,0 +1,136 @@
+# frozen_string_literal: true
+
+require_relative "../test_helper"
+
+module Prism
+  class StringEncodingTest < TestCase
+    each_encoding do |encoding, _|
+      define_method(:"test_#{encoding.name}") do
+        assert_encoding(encoding)
+      end
+    end
+
+    def test_coding
+      actual = Prism.parse_statement("# coding: utf-8\n'string'").unescaped.encoding
+      assert_equal Encoding::UTF_8, actual
+    end
+
+    def test_coding_with_whitespace
+      actual = Prism.parse_statement("# coding \t \r  \v   :     \t \v    \r   ascii-8bit \n'string'").unescaped.encoding
+      assert_equal Encoding::ASCII_8BIT, actual
+    end
+
+    def test_emacs_style
+      actual = Prism.parse_statement("# -*- coding: utf-8 -*-\n'string'").unescaped.encoding
+      assert_equal Encoding::UTF_8, actual
+    end
+
+    def test_utf_8_unix
+      actual = Prism.parse_statement("# coding: utf-8-unix\n'string'").unescaped.encoding
+      assert_equal Encoding::UTF_8, actual
+    end
+
+    def test_utf_8_dos
+      actual = Prism.parse_statement("# coding: utf-8-dos\n'string'").unescaped.encoding
+      assert_equal Encoding::UTF_8, actual
+    end
+
+    def test_utf_8_mac
+      actual = Prism.parse_statement("# coding: utf-8-mac\n'string'").unescaped.encoding
+      assert_equal Encoding::UTF_8, actual
+    end
+
+    def test_utf_8_star
+      actual = Prism.parse_statement("# coding: utf-8-*\n'string'").unescaped.encoding
+      assert_equal Encoding::UTF_8, actual
+    end
+
+    def test_first_lexed_token
+      encoding = Prism.lex("# encoding: ascii-8bit").value[0][0].value.encoding
+      assert_equal Encoding::ASCII_8BIT, encoding
+    end
+
+    if !ENV["PRISM_BUILD_MINIMAL"]
+      # This test may be a little confusing. Basically when we use our strpbrk,
+      # it takes into account the encoding of the file.
+      def test_strpbrk_multibyte
+        result = Prism.parse(<<~RUBY)
+          # encoding: Shift_JIS
+          %w[\x81\x5c]
+        RUBY
+
+        assert(result.errors.empty?)
+        assert_equal(
+          (+"\x81\x5c").force_encoding(Encoding::Shift_JIS),
+          result.statement.elements.first.unescaped
+        )
+      end
+
+      def test_slice_encoding
+        slice = Prism.parse("# encoding: Shift_JIS\nア").value.slice
+        assert_equal (+"ア").force_encoding(Encoding::SHIFT_JIS), slice
+        assert_equal Encoding::SHIFT_JIS, slice.encoding
+      end
+
+      def test_multibyte_escapes
+        [
+          ["'", "'"],
+          ["\"", "\""],
+          ["`", "`"],
+          ["/", "/"],
+          ["<<'HERE'\n", "\nHERE"],
+          ["<<-HERE\n", "\nHERE"]
+        ].each do |opening, closing|
+          assert Prism.parse_success?("# encoding: shift_jis\n'\\\x82\xA0'\n")
+        end
+      end
+    end
+
+    private
+
+    def assert_encoding(encoding)
+      escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"]
+      escapes = escapes.concat(escapes.product(escapes).map(&:join))
+
+      escapes.each do |escaped|
+        source = "# encoding: #{encoding.name}\n\"#{escaped}\""
+
+        expected =
+          begin
+            eval(source).encoding
+          rescue SyntaxError => error
+            if error.message.include?("UTF-8 mixed within")
+              error.message[/UTF-8 mixed within .+? source/]
+            else
+              raise
+            end
+          end
+
+        actual =
+          Prism.parse(source).then do |result|
+            if result.success?
+              string = result.statement
+
+              if string.forced_utf8_encoding?
+                Encoding::UTF_8
+              elsif string.forced_binary_encoding?
+                Encoding::ASCII_8BIT
+              else
+                encoding
+              end
+            else
+              error = result.errors.first
+
+              if error.message.include?("mixed")
+                error.message
+              else
+                raise error.message
+              end
+            end
+          end
+
+        assert_equal expected, actual
+      end
+    end
+  end
+end
diff --git a/test/prism/encoding/symbol_encoding_test.rb b/test/prism/encoding/symbol_encoding_test.rb
new file mode 100644
index 0000000000..20c998a58b
--- /dev/null
+++ b/test/prism/encoding/symbol_encoding_test.rb
@@ -0,0 +1,108 @@
+# frozen_string_literal: true
+
+return if RUBY_ENGINE != "ruby"
+
+require_relative "../test_helper"
+
+module Prism
+  class SymbolEncodingTest < TestCase
+    each_encoding do |encoding, _|
+      define_method(:"test_symbols_#{encoding.name}") do
+        assert_symbols(encoding)
+      end
+
+      define_method(:"test_escapes_#{encoding.name}") do
+        assert_escapes(encoding)
+      end
+    end
+
+    private
+
+    def expected_encoding(source) # source: Ruby code whose value responds to #encoding
+      eval(source).encoding # evaluated by the running Ruby, which serves as the reference result
+    end
+
+    def actual_encoding(source, encoding)
+      result = Prism.parse(source)
+
+      if result.success?
+        symbol = result.statement
+
+        if symbol.forced_utf8_encoding?
+          Encoding::UTF_8
+        elsif symbol.forced_binary_encoding?
+          Encoding::ASCII_8BIT
+        elsif symbol.forced_us_ascii_encoding?
+          Encoding::US_ASCII
+        else
+          encoding
+        end
+      else
+        raise SyntaxError.new(result.errors.map(&:message).join("\n"))
+      end
+    end
+
+    def assert_symbols(encoding)
+      [:a, :ą, :+].each do |symbol|
+        source = "# encoding: #{encoding.name}\n#{symbol.inspect}"
+
+        expected =
+          begin
+            expected_encoding(source)
+          rescue SyntaxError => error
+            if error.message.include?("invalid multibyte")
+              "invalid multibyte"
+            else
+              raise
+            end
+          end
+
+        actual =
+          begin
+            actual_encoding(source, encoding)
+          rescue SyntaxError => error
+            if error.message.include?("invalid multibyte")
+              "invalid multibyte"
+            else
+              raise
+            end
+          end
+
+        assert_equal expected, actual
+      end
+    end
+
+    def assert_escapes(encoding)
+      escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"]
+      escapes = escapes.concat(escapes.product(escapes).map(&:join))
+
+      escapes.each do |escaped|
+        source = "# encoding: #{encoding.name}\n:\"#{escaped}\""
+
+        expected =
+          begin
+            expected_encoding(source)
+          rescue SyntaxError => error
+            if error.message.include?("UTF-8 mixed within")
+              error.message[/UTF-8 mixed within .+? source/]
+            else
+              raise
+            end
+          end
+
+        actual =
+          begin
+            actual_encoding(source, encoding)
+          rescue SyntaxError => error
+            if error.message.include?("mixed")
+              error.message.split("\n", 2).first
+            else
+              raise
+            end
+          end
+
+        assert_equal expected, actual
+      end
+    end
+  end
+end