diff options
Diffstat (limited to 'test/prism/encoding')
| -rw-r--r-- | test/prism/encoding/encodings_test.rb | 91 | ||||
| -rw-r--r-- | test/prism/encoding/regular_expression_encoding_test.rb | 131 | ||||
| -rw-r--r-- | test/prism/encoding/string_encoding_test.rb | 136 | ||||
| -rw-r--r-- | test/prism/encoding/symbol_encoding_test.rb | 108 |
4 files changed, 466 insertions, 0 deletions
diff --git a/test/prism/encoding/encodings_test.rb b/test/prism/encoding/encodings_test.rb new file mode 100644 index 0000000000..b008fc3fa1 --- /dev/null +++ b/test/prism/encoding/encodings_test.rb @@ -0,0 +1,91 @@ +# frozen_string_literal: true + +return if RUBY_ENGINE != "ruby" + +require_relative "../test_helper" + +module Prism + class EncodingsTest < TestCase + class ConstantContext < BasicObject + def self.const_missing(const) + const + end + end + + class IdentifierContext < BasicObject + def method_missing(name, *) + name + end + end + + # These test that we're correctly parsing codepoints for each alias of each + # encoding that prism supports. + each_encoding do |encoding, range| + (encoding.names - %w[external internal filesystem locale]).each do |name| + define_method(:"test_encoding_#{name}") do + assert_encoding(encoding, name, range) + end + end + end + + private + + def assert_encoding_constant(name, character) + source = "# encoding: #{name}\n#{character}" + expected = ConstantContext.new.instance_eval(source) + + result = Prism.parse(source) + assert result.success? + + actual = result.value.statements.body.last + assert_kind_of ConstantReadNode, actual + assert_equal expected, actual.name + end + + def assert_encoding_identifier(name, character) + source = "# encoding: #{name}\n#{character}" + expected = IdentifierContext.new.instance_eval(source) + + result = Prism.parse(source) + assert result.success? + + actual = result.value.statements.body.last + assert_kind_of CallNode, actual + assert_equal expected, actual.name + end + + # Check that we can properly parse every codepoint in the given encoding. + def assert_encoding(encoding, name, range) + unicode = false + + case encoding + when Encoding::UTF_8, Encoding::UTF_8_MAC, Encoding::UTF8_DoCoMo, Encoding::UTF8_KDDI, Encoding::UTF8_SoftBank, Encoding::CESU_8 + unicode = true + when Encoding::Windows_1253 + range = range.to_a - [0xb5] + end + + range.each do |codepoint| + character = codepoint.chr(encoding) + + if character.match?(/[[:alpha:]]/) + if character.match?(/[[:upper:]]/) || (unicode && character.match?(Regexp.new("\\p{Lt}".encode(encoding)))) + assert_encoding_constant(name, character) + else + assert_encoding_identifier(name, character) + end + elsif character.match?(/[[:alnum:]]/) + assert_encoding_identifier(name, "_#{character}") + else + next if ["/", "{"].include?(character) + + source = "# encoding: #{name}\n/(?##{character})/\n" + assert Prism.parse_success?(source), "Expected #{source.inspect} to parse successfully." + end + rescue RangeError + source = "# encoding: #{name}\n\\x#{codepoint.to_s(16)}" + assert Prism.parse_failure?(source) + end + end + end +end diff --git a/test/prism/encoding/regular_expression_encoding_test.rb b/test/prism/encoding/regular_expression_encoding_test.rb new file mode 100644 index 0000000000..e2daae1d7f --- /dev/null +++ b/test/prism/encoding/regular_expression_encoding_test.rb @@ -0,0 +1,131 @@ +# frozen_string_literal: true + +return unless defined?(RubyVM::InstructionSequence) +return if RubyVM::InstructionSequence.compile("").to_a[4][:parser] == :prism + +require_relative "../test_helper" + +module Prism + class RegularExpressionEncodingTest < TestCase + each_encoding do |encoding, _| + define_method(:"test_regular_expression_encoding_flags_#{encoding.name}") do + assert_regular_expression_encoding_flags(encoding, ["/a/", "/ą/", "//"]) + end + + escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"] + escapes = escapes.concat(escapes.product(escapes).map(&:join)) + + define_method(:"test_regular_expression_escape_encoding_flags_#{encoding.name}") do + assert_regular_expression_encoding_flags(encoding, escapes.map { |e| "/#{e}/" }) + end + + ["n", "u", "e", "s"].each do |modifier| + define_method(:"test_regular_expression_encoding_modifiers_/#{modifier}_#{encoding.name}") do + regexp_sources = ["abc", "garçon", "\\x80", "gar\\xC3\\xA7on", "gar\\u{E7}on", "abc\\u{FFFFFF}", "\\x80\\u{80}" ] + + assert_regular_expression_encoding_flags( + encoding, + regexp_sources.product(["n", "u", "e", "s"]).map { |r, modifier| "/#{r}/#{modifier}" } + ) + end + end + end + + private + + def assert_regular_expression_encoding_flags(encoding, regexps) + regexps.each do |regexp| + regexp_modifier_used = regexp.end_with?("/u") || regexp.end_with?("/e") || regexp.end_with?("/s") || regexp.end_with?("/n") + source = "# encoding: #{encoding.name}\n#{regexp}" + + encoding_errors = ["invalid multibyte char", "escaped non ASCII character in UTF-8 regexp", "differs from source encoding"] + skipped_errors = ["invalid multibyte escape", "incompatible character encoding", "UTF-8 character in non UTF-8 regexp", "invalid Unicode range", "invalid Unicode list"] + + # TODO (nirvdrum 21-Feb-2024): Prism currently does not handle Regexp validation unless modifiers are used. So, skip processing those errors for now: https://github.com/ruby/prism/issues/2104 + unless regexp_modifier_used + skipped_errors += encoding_errors + encoding_errors.clear + end + + expected = + begin + eval(source).encoding + rescue SyntaxError => error + if encoding_errors.find { |e| error.message.include?(e) } + error.message.split("\n").map { |m| m[/: (.+?)$/, 1] } + elsif skipped_errors.find { |e| error.message.include?(e) } + next + else + raise + end + end + + actual = + Prism.parse(source).then do |result| + if result.success? + regexp = result.statement + + actual_encoding = if regexp.forced_utf8_encoding? + Encoding::UTF_8 + elsif regexp.forced_binary_encoding? + Encoding::ASCII_8BIT + elsif regexp.forced_us_ascii_encoding? + Encoding::US_ASCII + elsif regexp.ascii_8bit? + Encoding::ASCII_8BIT + elsif regexp.utf_8? + Encoding::UTF_8 + elsif regexp.euc_jp? + Encoding::EUC_JP + elsif regexp.windows_31j? + Encoding::Windows_31J + else + encoding + end + + if regexp.utf_8? && actual_encoding != Encoding::UTF_8 + raise "expected regexp encoding to be UTF-8 due to '/u' modifier, but got #{actual_encoding.name}" + elsif regexp.ascii_8bit? && (actual_encoding != Encoding::ASCII_8BIT && actual_encoding != Encoding::US_ASCII) + raise "expected regexp encoding to be ASCII-8BIT or US-ASCII due to '/n' modifier, but got #{actual_encoding.name}" + elsif regexp.euc_jp? && actual_encoding != Encoding::EUC_JP + raise "expected regexp encoding to be EUC-JP due to '/e' modifier, but got #{actual_encoding.name}" + elsif regexp.windows_31j? && actual_encoding != Encoding::Windows_31J + raise "expected regexp encoding to be Windows-31J due to '/s' modifier, but got #{actual_encoding.name}" + end + + if regexp.utf_8? && regexp.forced_utf8_encoding? + raise "the forced_utf8 flag should not be set when the UTF-8 modifier (/u) is used" + elsif regexp.ascii_8bit? && regexp.forced_binary_encoding? + raise "the forced_ascii_8bit flag should not be set when the UTF-8 modifier (/u) is used" + end + + actual_encoding + else + errors = result.errors.map(&:message) + + if errors.last&.include?("UTF-8 mixed within") + nil + else + errors + end + end + end + + # TODO (nirvdrum 22-Feb-2024): Remove this workaround once Prism better maps CRuby's error messages. + # This class of error message is tricky. The part not being compared is a representation of the regexp. + # Depending on the source encoding and any encoding modifiers being used, CRuby alters how the regexp is represented. + # Sometimes it's an MBC string. Other times it uses hexadecimal character escapes. And in other cases it uses + # the long-form Unicode escape sequences. This short-circuit checks that the error message is mostly correct. + if expected.is_a?(Array) && actual.is_a?(Array) + if expected.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:") && + actual.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:") + expected.pop + actual.pop + end + end + + assert_equal expected, actual + end + end + end +end diff --git a/test/prism/encoding/string_encoding_test.rb b/test/prism/encoding/string_encoding_test.rb new file mode 100644 index 0000000000..6f9d86df3b --- /dev/null +++ b/test/prism/encoding/string_encoding_test.rb @@ -0,0 +1,136 @@ +# frozen_string_literal: true + +require_relative "../test_helper" + +module Prism + class StringEncodingTest < TestCase + each_encoding do |encoding, _| + define_method(:"test_#{encoding.name}") do + assert_encoding(encoding) + end + end + + def test_coding + actual = Prism.parse_statement("# coding: utf-8\n'string'").unescaped.encoding + assert_equal Encoding::UTF_8, actual + end + + def test_coding_with_whitespace + actual = Prism.parse_statement("# coding \t \r \v : \t \v \r ascii-8bit \n'string'").unescaped.encoding + assert_equal Encoding::ASCII_8BIT, actual + end + + def test_emacs_style + actual = Prism.parse_statement("# -*- coding: utf-8 -*-\n'string'").unescaped.encoding + assert_equal Encoding::UTF_8, actual + end + + def test_utf_8_unix + actual = Prism.parse_statement("# coding: utf-8-unix\n'string'").unescaped.encoding + assert_equal Encoding::UTF_8, actual + end + + def test_utf_8_dos + actual = Prism.parse_statement("# coding: utf-8-dos\n'string'").unescaped.encoding + assert_equal Encoding::UTF_8, actual + end + + def test_utf_8_mac + actual = Prism.parse_statement("# coding: utf-8-mac\n'string'").unescaped.encoding + assert_equal Encoding::UTF_8, actual + end + + def test_utf_8_star + actual = Prism.parse_statement("# coding: utf-8-*\n'string'").unescaped.encoding + assert_equal Encoding::UTF_8, actual + end + + def test_first_lexed_token + encoding = Prism.lex("# encoding: ascii-8bit").value[0][0].value.encoding + assert_equal Encoding::ASCII_8BIT, encoding + end + + if !ENV["PRISM_BUILD_MINIMAL"] + # This test may be a little confusing. Basically when we use our strpbrk, + # it takes into account the encoding of the file. + def test_strpbrk_multibyte + result = Prism.parse(<<~RUBY) + # encoding: Shift_JIS + %w[\x81\x5c] + RUBY + + assert(result.errors.empty?) + assert_equal( + (+"\x81\x5c").force_encoding(Encoding::Shift_JIS), + result.statement.elements.first.unescaped + ) + end + + def test_slice_encoding + slice = Prism.parse("# encoding: Shift_JIS\nア").value.slice + assert_equal (+"ア").force_encoding(Encoding::SHIFT_JIS), slice + assert_equal Encoding::SHIFT_JIS, slice.encoding + end + + def test_multibyte_escapes + [ + ["'", "'"], + ["\"", "\""], + ["`", "`"], + ["/", "/"], + ["<<'HERE'\n", "\nHERE"], + ["<<-HERE\n", "\nHERE"] + ].each do |opening, closing| + assert Prism.parse_success?("# encoding: shift_jis\n'\\\x82\xA0'\n") + end + end + end + + private + + def assert_encoding(encoding) + escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"] + escapes = escapes.concat(escapes.product(escapes).map(&:join)) + + escapes.each do |escaped| + source = "# encoding: #{encoding.name}\n\"#{escaped}\"" + + expected = + begin + eval(source).encoding + rescue SyntaxError => error + if error.message.include?("UTF-8 mixed within") + error.message[/UTF-8 mixed within .+? source/] + else + raise + end + end + + actual = + Prism.parse(source).then do |result| + if result.success? + string = result.statement + + if string.forced_utf8_encoding? + Encoding::UTF_8 + elsif string.forced_binary_encoding? + Encoding::ASCII_8BIT + else + encoding + end + else + error = result.errors.first + + if error.message.include?("mixed") + error.message + else + raise error.message + end + end + end + + assert_equal expected, actual + end + end + end +end diff --git a/test/prism/encoding/symbol_encoding_test.rb b/test/prism/encoding/symbol_encoding_test.rb new file mode 100644 index 0000000000..20c998a58b --- /dev/null +++ b/test/prism/encoding/symbol_encoding_test.rb @@ -0,0 +1,108 @@ +# frozen_string_literal: true + +return if RUBY_ENGINE != "ruby" + +require_relative "../test_helper" + +module Prism + class SymbolEncodingTest < TestCase + each_encoding do |encoding, _| + define_method(:"test_symbols_#{encoding.name}") do + assert_symbols(encoding) + end + + define_method(:"test_escapes_#{encoding.name}") do + assert_escapes(encoding) + end + end + + private + + def expected_encoding(source) + eval(source).encoding + end + + def actual_encoding(source, encoding) + result = Prism.parse(source) + + if result.success? + symbol = result.statement + + if symbol.forced_utf8_encoding? + Encoding::UTF_8 + elsif symbol.forced_binary_encoding? + Encoding::ASCII_8BIT + elsif symbol.forced_us_ascii_encoding? + Encoding::US_ASCII + else + encoding + end + else + raise SyntaxError.new(result.errors.map(&:message).join("\n")) + end + end + + def assert_symbols(encoding) + [:a, :ą, :+].each do |symbol| + source = "# encoding: #{encoding.name}\n#{symbol.inspect}" + + expected = + begin + expected_encoding(source) + rescue SyntaxError => error + if error.message.include?("invalid multibyte") + "invalid multibyte" + else + raise + end + end + + actual = + begin + actual_encoding(source, encoding) + rescue SyntaxError => error + if error.message.include?("invalid multibyte") + "invalid multibyte" + else + raise + end + end + + assert_equal expected, actual + end + end + + def assert_escapes(encoding) + escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"] + escapes = escapes.concat(escapes.product(escapes).map(&:join)) + + escapes.each do |escaped| + source = "# encoding: #{encoding.name}\n:\"#{escaped}\"" + + expected = + begin + expected_encoding(source) + rescue SyntaxError => error + if error.message.include?("UTF-8 mixed within") + error.message[/UTF-8 mixed within .+? source/] + else + raise + end + end + + actual = + begin + actual_encoding(source, encoding) + rescue SyntaxError => error + if error.message.include?("mixed") + error.message.split("\n", 2).first + else + raise + end + end + + assert_equal expected, actual + end + end + end +end |
