summaryrefslogtreecommitdiff
path: root/test/prism/encoding
diff options
context:
space:
mode:
Diffstat (limited to 'test/prism/encoding')
-rw-r--r--test/prism/encoding/encodings_test.rb91
-rw-r--r--test/prism/encoding/regular_expression_encoding_test.rb131
-rw-r--r--test/prism/encoding/string_encoding_test.rb136
-rw-r--r--test/prism/encoding/symbol_encoding_test.rb108
4 files changed, 466 insertions, 0 deletions
diff --git a/test/prism/encoding/encodings_test.rb b/test/prism/encoding/encodings_test.rb
new file mode 100644
index 0000000000..b008fc3fa1
--- /dev/null
+++ b/test/prism/encoding/encodings_test.rb
@@ -0,0 +1,91 @@
+# frozen_string_literal: true
+
+return if RUBY_ENGINE != "ruby"
+
+require_relative "../test_helper"
+
+module Prism
+ class EncodingsTest < TestCase
+ class ConstantContext < BasicObject # Evaluation context for assert_encoding_constant, which instance_evals its source against an instance of this class.
+ def self.const_missing(const) # Invoked with the name of a constant that could not be resolved.
+ const # Hand the name back, so evaluating an unknown constant yields its own name.
+ end
+ end
+
+ class IdentifierContext < BasicObject # Evaluation context for assert_encoding_identifier, which instance_evals its source against an instance of this class.
+ def method_missing(name, *) # Invoked for any method the object does not respond to; extra arguments are ignored.
+ name # Hand the name back, so evaluating an unknown identifier yields its own name.
+ end
+ end
+
+ # These test that we're correctly parsing codepoints for each alias of each
+ # encoding that prism supports.
+ each_encoding do |encoding, range|
+ (encoding.names - %w[external internal filesystem locale]).each do |name|
+ define_method(:"test_encoding_#{name}") do
+ assert_encoding(encoding, name, range)
+ end
+ end
+ end
+
+ private
+
+ def assert_encoding_constant(name, character)
+ source = "# encoding: #{name}\n#{character}"
+ expected = ConstantContext.new.instance_eval(source)
+
+ result = Prism.parse(source)
+ assert result.success?
+
+ actual = result.value.statements.body.last
+ assert_kind_of ConstantReadNode, actual
+ assert_equal expected, actual.name
+ end
+
+ def assert_encoding_identifier(name, character)
+ source = "# encoding: #{name}\n#{character}"
+ expected = IdentifierContext.new.instance_eval(source)
+
+ result = Prism.parse(source)
+ assert result.success?
+
+ actual = result.value.statements.body.last
+ assert_kind_of CallNode, actual
+ assert_equal expected, actual.name
+ end
+
+ # Check that prism can properly parse every codepoint of `range` in `encoding`, with the source declaring the encoding through its alias `name`.
+ def assert_encoding(encoding, name, range)
+ unicode = false # when true, \p{Lt} (titlecase) letters are also expected to parse as constants
+
+ case encoding
+ when Encoding::UTF_8, Encoding::UTF_8_MAC, Encoding::UTF8_DoCoMo, Encoding::UTF8_KDDI, Encoding::UTF8_SoftBank, Encoding::CESU_8
+ unicode = true
+ when Encoding::Windows_1253
+ range = range.to_a - [0xb5] # NOTE(review): 0xb5 is left out for Windows-1253; the reason is not visible here, confirm
+ end
+
+ range.each do |codepoint|
+ character = codepoint.chr(encoding) # presumably the source of the RangeError rescued at the bottom of this block
+
+ if character.match?(/[[:alpha:]]/) # alphabetic: must parse as either a constant or an identifier
+ if character.match?(/[[:upper:]]/) || (unicode && character.match?(Regexp.new("\\p{Lt}".encode(encoding))))
+ assert_encoding_constant(name, character)
+ else
+ assert_encoding_identifier(name, character)
+ end
+ elsif character.match?(/[[:alnum:]]/) # alphanumeric but not alphabetic: checked with a leading underscore
+ assert_encoding_identifier(name, "_#{character}")
+ else
+ next if ["/", "{"].include?(character) # presumably because "/" would end the regexp below and "{" after "#" would start interpolation
+ # Anything else is placed inside a regexp comment group, which must parse.
+ source = "# encoding: #{name}\n/(?##{character})/\n"
+ assert Prism.parse_success?(source), "Expected #{source.inspect} to parse successfully."
+ end
+ rescue RangeError # the codepoint could not be turned into a character of this encoding
+ source = "# encoding: #{name}\n\\x#{codepoint.to_s(16)}" # the codepoint written as a bare \x escape
+ assert Prism.parse_failure?(source)
+ end
+ end
+ end
+end
diff --git a/test/prism/encoding/regular_expression_encoding_test.rb b/test/prism/encoding/regular_expression_encoding_test.rb
new file mode 100644
index 0000000000..e2daae1d7f
--- /dev/null
+++ b/test/prism/encoding/regular_expression_encoding_test.rb
@@ -0,0 +1,131 @@
+# frozen_string_literal: true
+
+return unless defined?(RubyVM::InstructionSequence)
+return if RubyVM::InstructionSequence.compile("").to_a[4][:parser] == :prism
+
+require_relative "../test_helper"
+
+module Prism
+ class RegularExpressionEncodingTest < TestCase
+ each_encoding do |encoding, _|
+ define_method(:"test_regular_expression_encoding_flags_#{encoding.name}") do
+ assert_regular_expression_encoding_flags(encoding, ["/a/", "/ą/", "//"])
+ end
+
+ escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"]
+ escapes = escapes.concat(escapes.product(escapes).map(&:join))
+
+ define_method(:"test_regular_expression_escape_encoding_flags_#{encoding.name}") do
+ assert_regular_expression_encoding_flags(encoding, escapes.map { |e| "/#{e}/" })
+ end
+
+ ["n", "u", "e", "s"].each do |modifier|
+ define_method(:"test_regular_expression_encoding_modifiers_/#{modifier}_#{encoding.name}") do
+ regexp_sources = ["abc", "garçon", "\\x80", "gar\\xC3\\xA7on", "gar\\u{E7}on", "abc\\u{FFFFFF}", "\\x80\\u{80}" ]
+
+ assert_regular_expression_encoding_flags(
+ encoding,
+ regexp_sources.product(["n", "u", "e", "s"]).map { |r, modifier| "/#{r}/#{modifier}" }
+ )
+ end
+ end
+ end
+
+ private
+
+ def assert_regular_expression_encoding_flags(encoding, regexps)
+ regexps.each do |regexp|
+ regexp_modifier_used = regexp.end_with?("/u") || regexp.end_with?("/e") || regexp.end_with?("/s") || regexp.end_with?("/n")
+ source = "# encoding: #{encoding.name}\n#{regexp}"
+
+ encoding_errors = ["invalid multibyte char", "escaped non ASCII character in UTF-8 regexp", "differs from source encoding"]
+ skipped_errors = ["invalid multibyte escape", "incompatible character encoding", "UTF-8 character in non UTF-8 regexp", "invalid Unicode range", "invalid Unicode list"]
+
+ # TODO (nirvdrum 21-Feb-2024): Prism currently does not handle Regexp validation unless modifiers are used. So, skip processing those errors for now: https://github.com/ruby/prism/issues/2104
+ unless regexp_modifier_used
+ skipped_errors += encoding_errors
+ encoding_errors.clear
+ end
+
+ expected =
+ begin
+ eval(source).encoding
+ rescue SyntaxError => error
+ if encoding_errors.find { |e| error.message.include?(e) }
+ error.message.split("\n").map { |m| m[/: (.+?)$/, 1] }
+ elsif skipped_errors.find { |e| error.message.include?(e) }
+ next
+ else
+ raise
+ end
+ end
+
+ actual =
+ Prism.parse(source).then do |result|
+ if result.success?
+ regexp = result.statement
+
+ actual_encoding = if regexp.forced_utf8_encoding?
+ Encoding::UTF_8
+ elsif regexp.forced_binary_encoding?
+ Encoding::ASCII_8BIT
+ elsif regexp.forced_us_ascii_encoding?
+ Encoding::US_ASCII
+ elsif regexp.ascii_8bit?
+ Encoding::ASCII_8BIT
+ elsif regexp.utf_8?
+ Encoding::UTF_8
+ elsif regexp.euc_jp?
+ Encoding::EUC_JP
+ elsif regexp.windows_31j?
+ Encoding::Windows_31J
+ else
+ encoding
+ end
+
+ if regexp.utf_8? && actual_encoding != Encoding::UTF_8
+ raise "expected regexp encoding to be UTF-8 due to '/u' modifier, but got #{actual_encoding.name}"
+ elsif regexp.ascii_8bit? && (actual_encoding != Encoding::ASCII_8BIT && actual_encoding != Encoding::US_ASCII)
+ raise "expected regexp encoding to be ASCII-8BIT or US-ASCII due to '/n' modifier, but got #{actual_encoding.name}"
+ elsif regexp.euc_jp? && actual_encoding != Encoding::EUC_JP
+ raise "expected regexp encoding to be EUC-JP due to '/e' modifier, but got #{actual_encoding.name}"
+ elsif regexp.windows_31j? && actual_encoding != Encoding::Windows_31J
+ raise "expected regexp encoding to be Windows-31J due to '/s' modifier, but got #{actual_encoding.name}"
+ end
+
+ if regexp.utf_8? && regexp.forced_utf8_encoding?
+ raise "the forced_utf8 flag should not be set when the UTF-8 modifier (/u) is used"
+ elsif regexp.ascii_8bit? && regexp.forced_binary_encoding?
+ raise "the forced_ascii_8bit flag should not be set when the UTF-8 modifier (/u) is used"
+ end
+
+ actual_encoding
+ else
+ errors = result.errors.map(&:message)
+
+ if errors.last&.include?("UTF-8 mixed within")
+ nil
+ else
+ errors
+ end
+ end
+ end
+
+ # TODO (nirvdrum 22-Feb-2024): Remove this workaround once Prism better maps CRuby's error messages.
+ # This class of error message is tricky. The part not being compared is a representation of the regexp.
+ # Depending on the source encoding and any encoding modifiers being used, CRuby alters how the regexp is represented.
+ # Sometimes it's an MBC string. Other times it uses hexadecimal character escapes. And in other cases it uses
+ # the long-form Unicode escape sequences. This short-circuit checks that the error message is mostly correct.
+ if expected.is_a?(Array) && actual.is_a?(Array)
+ if expected.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:") &&
+ actual.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:")
+ expected.pop
+ actual.pop
+ end
+ end
+
+ assert_equal expected, actual
+ end
+ end
+ end
+end
diff --git a/test/prism/encoding/string_encoding_test.rb b/test/prism/encoding/string_encoding_test.rb
new file mode 100644
index 0000000000..6f9d86df3b
--- /dev/null
+++ b/test/prism/encoding/string_encoding_test.rb
@@ -0,0 +1,136 @@
+# frozen_string_literal: true
+
+require_relative "../test_helper"
+
+module Prism
+ class StringEncodingTest < TestCase
+ each_encoding do |encoding, _|
+ define_method(:"test_#{encoding.name}") do
+ assert_encoding(encoding)
+ end
+ end
+
+ def test_coding
+ actual = Prism.parse_statement("# coding: utf-8\n'string'").unescaped.encoding
+ assert_equal Encoding::UTF_8, actual
+ end
+
+ def test_coding_with_whitespace
+ actual = Prism.parse_statement("# coding \t \r \v : \t \v \r ascii-8bit \n'string'").unescaped.encoding
+ assert_equal Encoding::ASCII_8BIT, actual
+ end
+
+ def test_emacs_style
+ actual = Prism.parse_statement("# -*- coding: utf-8 -*-\n'string'").unescaped.encoding
+ assert_equal Encoding::UTF_8, actual
+ end
+
+ def test_utf_8_unix
+ actual = Prism.parse_statement("# coding: utf-8-unix\n'string'").unescaped.encoding
+ assert_equal Encoding::UTF_8, actual
+ end
+
+ def test_utf_8_dos
+ actual = Prism.parse_statement("# coding: utf-8-dos\n'string'").unescaped.encoding
+ assert_equal Encoding::UTF_8, actual
+ end
+
+ def test_utf_8_mac
+ actual = Prism.parse_statement("# coding: utf-8-mac\n'string'").unescaped.encoding
+ assert_equal Encoding::UTF_8, actual
+ end
+
+ def test_utf_8_star
+ actual = Prism.parse_statement("# coding: utf-8-*\n'string'").unescaped.encoding
+ assert_equal Encoding::UTF_8, actual
+ end
+
+ def test_first_lexed_token
+ encoding = Prism.lex("# encoding: ascii-8bit").value[0][0].value.encoding
+ assert_equal Encoding::ASCII_8BIT, encoding
+ end
+
+ if !ENV["PRISM_BUILD_MINIMAL"]
+ # The %w element below consists of the bytes 0x81 0x5c, and 0x5c on its own is an ASCII backslash. Under the Shift_JIS magic
+ # comment the first element must unescape to both bytes, i.e. our strpbrk takes the encoding of the file into account.
+ def test_strpbrk_multibyte
+ result = Prism.parse(<<~RUBY) # the \x escapes are expanded by this heredoc, so prism receives the raw bytes
+ # encoding: Shift_JIS
+ %w[\x81\x5c]
+ RUBY
+
+ assert(result.errors.empty?)
+ assert_equal(
+ (+"\x81\x5c").force_encoding(Encoding::Shift_JIS),
+ result.statement.elements.first.unescaped
+ )
+ end
+
+ def test_slice_encoding
+ slice = Prism.parse("# encoding: Shift_JIS\nア").value.slice
+ assert_equal (+"ア").force_encoding(Encoding::SHIFT_JIS), slice
+ assert_equal Encoding::SHIFT_JIS, slice.encoding
+ end
+
+ def test_multibyte_escapes
+ [
+ ["'", "'"],
+ ["\"", "\""],
+ ["`", "`"],
+ ["/", "/"],
+ ["<<'HERE'\n", "\nHERE"],
+ ["<<-HERE\n", "\nHERE"]
+ ].each do |opening, closing|
+ assert Prism.parse_success?("# encoding: shift_jis\n'\\\x82\xA0'\n")
+ end
+ end
+ end
+
+ private
+
+ def assert_encoding(encoding)
+ escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"]
+ escapes = escapes.concat(escapes.product(escapes).map(&:join))
+
+ escapes.each do |escaped|
+ source = "# encoding: #{encoding.name}\n\"#{escaped}\""
+
+ expected =
+ begin
+ eval(source).encoding
+ rescue SyntaxError => error
+ if error.message.include?("UTF-8 mixed within")
+ error.message[/UTF-8 mixed within .+? source/]
+ else
+ raise
+ end
+ end
+
+ actual =
+ Prism.parse(source).then do |result|
+ if result.success?
+ string = result.statement
+
+ if string.forced_utf8_encoding?
+ Encoding::UTF_8
+ elsif string.forced_binary_encoding?
+ Encoding::ASCII_8BIT
+ else
+ encoding
+ end
+ else
+ error = result.errors.first
+
+ if error.message.include?("mixed")
+ error.message
+ else
+ raise error.message
+ end
+ end
+ end
+
+ assert_equal expected, actual
+ end
+ end
+ end
+end
diff --git a/test/prism/encoding/symbol_encoding_test.rb b/test/prism/encoding/symbol_encoding_test.rb
new file mode 100644
index 0000000000..20c998a58b
--- /dev/null
+++ b/test/prism/encoding/symbol_encoding_test.rb
@@ -0,0 +1,108 @@
+# frozen_string_literal: true
+
+return if RUBY_ENGINE != "ruby"
+
+require_relative "../test_helper"
+
+module Prism
+ class SymbolEncodingTest < TestCase
+ each_encoding do |encoding, _|
+ define_method(:"test_symbols_#{encoding.name}") do
+ assert_symbols(encoding)
+ end
+
+ define_method(:"test_escapes_#{encoding.name}") do
+ assert_escapes(encoding)
+ end
+ end
+
+ private
+
+ def expected_encoding(source) # Encoding of the value Ruby itself produces when evaluating `source`.
+ eval(source).encoding # callers rescue the SyntaxError this raises for source Ruby rejects
+ end
+
+ def actual_encoding(source, encoding)
+ result = Prism.parse(source)
+
+ if result.success?
+ symbol = result.statement
+
+ if symbol.forced_utf8_encoding?
+ Encoding::UTF_8
+ elsif symbol.forced_binary_encoding?
+ Encoding::ASCII_8BIT
+ elsif symbol.forced_us_ascii_encoding?
+ Encoding::US_ASCII
+ else
+ encoding
+ end
+ else
+ raise SyntaxError.new(result.errors.map(&:message).join("\n"))
+ end
+ end
+
+ def assert_symbols(encoding)
+ [:a, :ą, :+].each do |symbol|
+ source = "# encoding: #{encoding.name}\n#{symbol.inspect}"
+
+ expected =
+ begin
+ expected_encoding(source)
+ rescue SyntaxError => error
+ if error.message.include?("invalid multibyte")
+ "invalid multibyte"
+ else
+ raise
+ end
+ end
+
+ actual =
+ begin
+ actual_encoding(source, encoding)
+ rescue SyntaxError => error
+ if error.message.include?("invalid multibyte")
+ "invalid multibyte"
+ else
+ raise
+ end
+ end
+
+ assert_equal expected, actual
+ end
+ end
+
+ def assert_escapes(encoding)
+ escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"]
+ escapes = escapes.concat(escapes.product(escapes).map(&:join))
+
+ escapes.each do |escaped|
+ source = "# encoding: #{encoding.name}\n:\"#{escaped}\""
+
+ expected =
+ begin
+ expected_encoding(source)
+ rescue SyntaxError => error
+ if error.message.include?("UTF-8 mixed within")
+ error.message[/UTF-8 mixed within .+? source/]
+ else
+ raise
+ end
+ end
+
+ actual =
+ begin
+ actual_encoding(source, encoding)
+ rescue SyntaxError => error
+ if error.message.include?("mixed")
+ error.message.split("\n", 2).first
+ else
+ raise
+ end
+ end
+
+ assert_equal expected, actual
+ end
+ end
+ end
+end