summaryrefslogtreecommitdiff
path: root/test/csv/test_encodings.rb
diff options
context:
space:
mode:
Diffstat (limited to 'test/csv/test_encodings.rb')
-rw-r--r--test/csv/test_encodings.rb372
1 files changed, 0 insertions, 372 deletions
diff --git a/test/csv/test_encodings.rb b/test/csv/test_encodings.rb
deleted file mode 100644
index 8d228c05f3..0000000000
--- a/test/csv/test_encodings.rb
+++ /dev/null
@@ -1,372 +0,0 @@
-# -*- coding: utf-8 -*-
-# frozen_string_literal: false
-
-require_relative "helper"
-
-class TestCSVEncodings < Test::Unit::TestCase
- extend DifferentOFS
- include Helper
-
- def setup
- super
- require 'tempfile'
- @temp_csv_file = Tempfile.new(%w"test_csv. .csv")
- @temp_csv_path = @temp_csv_file.path
- @temp_csv_file.close
- end
-
- def teardown
- @temp_csv_file.close!
- super
- end
-
- ########################################
- ### Hand Test Some Popular Encodings ###
- ########################################
-
- def test_parses_utf8_encoding
- assert_parses( [ %w[ one two … ],
- %w[ 1 … 3 ],
- %w[ … 5 6 ] ], "UTF-8" )
- end
-
- def test_parses_latin1_encoding
- assert_parses( [ %w[ one two Résumé ],
- %w[ 1 Résumé 3 ],
- %w[ Résumé 5 6 ] ], "ISO-8859-1" )
- end
-
- def test_parses_utf16be_encoding
- assert_parses( [ %w[ one two … ],
- %w[ 1 … 3 ],
- %w[ … 5 6 ] ], "UTF-16BE" )
- end
-
- def test_parses_shift_jis_encoding
- assert_parses( [ %w[ 一 二 三 ],
- %w[ 四 五 六 ],
- %w[ 七 八 九 ] ], "Shift_JIS" )
- end
-
- ###########################################################
- ### Try Simple Reading for All Non-dummy Ruby Encodings ###
- ###########################################################
-
- def test_reading_with_most_encodings
- each_encoding do |encoding|
- begin
- assert_parses( [ %w[ abc def ],
- %w[ ghi jkl ] ], encoding )
- rescue Encoding::ConverterNotFoundError
- fail("Failed to support #{encoding.name}.")
- end
- end
- end
-
- def test_regular_expression_escaping
- each_encoding do |encoding|
- begin
- assert_parses( [ %w[ abc def ],
- %w[ ghi jkl ] ], encoding, col_sep: "|" )
- rescue Encoding::ConverterNotFoundError
- fail("Failed to properly escape #{encoding.name}.")
- end
- end
- end
-
- def test_read_with_default_encoding
- data = "abc"
- default_external = Encoding.default_external
- each_encoding do |encoding|
- File.open(@temp_csv_path, "wb", encoding: encoding) {|f| f << data}
- begin
- no_warnings do
- Encoding.default_external = encoding
- end
- result = CSV.read(@temp_csv_path)[0][0]
- ensure
- no_warnings do
- Encoding.default_external = default_external
- end
- end
- assert_equal(encoding, result.encoding)
- end
- end
-
- #######################################################################
- ### Stress Test ASCII Compatible and Non-ASCII Compatible Encodings ###
- #######################################################################
-
- def test_auto_line_ending_detection
- # arrange data to place a \r at the end of CSV's read ahead point
- encode_for_tests([["a" * 509]], row_sep: "\r\n") do |data|
- assert_equal("\r\n".encode(data.encoding), CSV.new(data).row_sep)
- end
- end
-
- def test_csv_chars_are_transcoded
- encode_for_tests([%w[abc def]]) do |data|
- %w[col_sep row_sep quote_char].each do |csv_char|
- assert_equal( "|".encode(data.encoding),
- CSV.new(data, csv_char.to_sym => "|").send(csv_char) )
- end
- end
- end
-
- def test_parser_works_with_encoded_headers
- encode_for_tests([%w[one two three], %w[1 2 3]]) do |data|
- parsed = CSV.parse(data, headers: true)
- assert_all?(parsed.headers, "Wrong data encoding.") {|h| h.encoding == data.encoding}
- parsed.each do |row|
- assert_all?(row.fields, "Wrong data encoding.") {|f| f.encoding == data.encoding}
- end
- end
- end
-
- def test_built_in_converters_transcode_to_utf_8_then_convert
- encode_for_tests([%w[one two three], %w[1 2 3]]) do |data|
- parsed = CSV.parse(data, converters: :integer)
- assert_all?(parsed[0], "Wrong data encoding.") {|f| f.encoding == data.encoding}
- assert_equal([1, 2, 3], parsed[1])
- end
- end
-
- def test_built_in_header_converters_transcode_to_utf_8_then_convert
- encode_for_tests([%w[one two three], %w[1 2 3]]) do |data|
- parsed = CSV.parse( data, headers: true,
- header_converters: :downcase )
- assert_all?(parsed.headers, "Wrong data encoding.") {|h| h.encoding.name == "UTF-8"}
- assert_all?(parsed[0].fields, "Wrong data encoding.") {|f| f.encoding == data.encoding}
- end
- end
-
- def test_open_allows_you_to_set_encodings
- encode_for_tests([%w[abc def]]) do |data|
- # read and write in encoding
- File.open(@temp_csv_path, "wb:#{data.encoding.name}") { |f| f << data }
- CSV.open(@temp_csv_path, "rb:#{data.encoding.name}") do |csv|
- csv.each do |row|
- assert_all?(row, "Wrong data encoding.") {|f| f.encoding == data.encoding}
- end
- end
-
- # read and write with transcoding
- File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f|
- f << data
- end
- CSV.open(@temp_csv_path, "rb:UTF-32BE:#{data.encoding.name}") do |csv|
- csv.each do |row|
- assert_all?(row, "Wrong data encoding.") {|f| f.encoding == data.encoding}
- end
- end
- end
- end
-
- def test_foreach_allows_you_to_set_encodings
- encode_for_tests([%w[abc def]]) do |data|
- # read and write in encoding
- File.open(@temp_csv_path, "wb", encoding: data.encoding) { |f| f << data }
- CSV.foreach(@temp_csv_path, encoding: data.encoding) do |row|
- row.each {|f| assert_equal(f.encoding, data.encoding)}
- end
-
- # read and write with transcoding
- File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f|
- f << data
- end
- CSV.foreach( @temp_csv_path,
- encoding: "UTF-32BE:#{data.encoding.name}" ) do |row|
- assert_all?(row, "Wrong data encoding.") {|f| f.encoding == data.encoding}
- end
- end
- end
-
- def test_read_allows_you_to_set_encodings
- encode_for_tests([%w[abc def]]) do |data|
- # read and write in encoding
- File.open(@temp_csv_path, "wb:#{data.encoding.name}") { |f| f << data }
- rows = CSV.read(@temp_csv_path, encoding: data.encoding.name)
- assert_all?(rows.flatten, "Wrong data encoding.") {|f| f.encoding == data.encoding}
-
- # read and write with transcoding
- File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f|
- f << data
- end
- rows = CSV.read( @temp_csv_path,
- encoding: "UTF-32BE:#{data.encoding.name}" )
- assert_all?(rows.flatten, "Wrong data encoding.") {|f| f.encoding == data.encoding}
- end
- end
-
- #################################
- ### Write CSV in any Encoding ###
- #################################
-
- def test_can_write_csv_in_any_encoding
- each_encoding do |encoding|
- # test generate_line with encoding hint
- begin
- csv = %w[abc d|ef].map { |f| f.encode(encoding) }.
- to_csv(col_sep: "|", encoding: encoding.name)
- rescue Encoding::ConverterNotFoundError
- next
- end
- assert_equal(encoding, csv.encoding)
-
- # test generate_line with encoding guessing from fields
- csv = %w[abc d|ef].map { |f| f.encode(encoding) }.to_csv(col_sep: "|")
- assert_equal(encoding, csv.encoding)
-
- # writing to files
- data = encode_ary([%w[abc d,ef], %w[123 456 ]], encoding)
- CSV.open(@temp_csv_path, "wb:#{encoding.name}") do |f|
- data.each { |row| f << row }
- end
- assert_equal(data, CSV.read(@temp_csv_path, encoding: encoding.name))
- end
- end
-
- def test_encoding_is_upgraded_during_writing_as_needed
- data = ["foo".force_encoding("US-ASCII"), "\u3042"]
- assert_equal("US-ASCII", data.first.encoding.name)
- assert_equal("UTF-8", data.last.encoding.name)
- assert_equal("UTF-8", data.join('').encoding.name)
- assert_equal("UTF-8", data.to_csv.encoding.name)
- end
-
- def test_encoding_is_upgraded_for_ascii_content_during_writing_as_needed
- data = ["foo".force_encoding("ISO-8859-1"), "\u3042"]
- assert_equal("ISO-8859-1", data.first.encoding.name)
- assert_equal("UTF-8", data.last.encoding.name)
- assert_equal("UTF-8", data.join('').encoding.name)
- assert_equal("UTF-8", data.to_csv.encoding.name)
- end
-
- def test_encoding_is_not_upgraded_for_non_ascii_content_during_writing_as_needed
- data = ["\u00c0".encode("ISO-8859-1"), "\u3042"]
- assert_equal([
- "ISO-8859-1",
- "UTF-8",
- ],
- data.collect {|field| field.encoding.name})
- assert_raise(Encoding::CompatibilityError) do
- data.to_csv
- end
- end
-
- def test_explicit_encoding
- bug9766 = '[ruby-core:62113] [Bug #9766]'
- s = CSV.generate(encoding: "Windows-31J") do |csv|
- csv << ["foo".force_encoding("ISO-8859-1"), "\u3042"]
- end
- assert_equal(["foo,\u3042\n".encode(Encoding::Windows_31J), Encoding::Windows_31J], [s, s.encoding], bug9766)
- end
-
- def test_encoding_with_default_internal
- with_default_internal(Encoding::UTF_8) do
- s = CSV.generate(String.new(encoding: Encoding::Big5), encoding: Encoding::Big5) do |csv|
- csv << ["漢字"]
- end
- assert_equal(["漢字\n".encode(Encoding::Big5), Encoding::Big5], [s, s.encoding])
- end
- end
-
- def test_row_separator_detection_with_invalid_encoding
- csv = CSV.new("invalid,\xF8\r\nvalid,x\r\n".force_encoding("UTF-8"),
- encoding: "UTF-8")
- assert_equal("\r\n", csv.row_sep)
- end
-
- def test_invalid_encoding_row_error
- csv = CSV.new("valid,x\rinvalid,\xF8\r".force_encoding("UTF-8"),
- encoding: "UTF-8", row_sep: "\r")
- error = assert_raise(CSV::MalformedCSVError) do
- csv.shift
- csv.shift
- end
- assert_equal("Invalid byte sequence in UTF-8 in line 2.",
- error.message)
- end
-
- private
-
- def assert_parses(fields, encoding, **options)
- encoding = Encoding.find(encoding) unless encoding.is_a? Encoding
- orig_fields = fields
- fields = encode_ary(fields, encoding)
- data = ary_to_data(fields, **options)
- parsed = CSV.parse(data, **options)
- assert_equal(fields, parsed)
- parsed.flatten.each_with_index do |field, i|
- assert_equal(encoding, field.encoding, "Field[#{i + 1}] was transcoded.")
- end
- File.open(@temp_csv_path, "wb") {|f| f.print(data)}
- CSV.open(@temp_csv_path, "rb:#{encoding}", **options) do |csv|
- csv.each_with_index do |row, i|
- assert_equal(fields[i], row)
- end
- end
- begin
- CSV.open(@temp_csv_path,
- "rb:#{encoding}:#{__ENCODING__}",
- **options) do |csv|
- csv.each_with_index do |row, i|
- assert_equal(orig_fields[i], row)
- end
- end unless encoding == __ENCODING__
- rescue Encoding::ConverterNotFoundError
- end
- options[:encoding] = encoding.name
- CSV.open(@temp_csv_path, **options) do |csv|
- csv.each_with_index do |row, i|
- assert_equal(fields[i], row)
- end
- end
- options.delete(:encoding)
- options[:external_encoding] = encoding.name
- options[:internal_encoding] = __ENCODING__.name
- begin
- CSV.open(@temp_csv_path, **options) do |csv|
- csv.each_with_index do |row, i|
- assert_equal(orig_fields[i], row)
- end
- end unless encoding == __ENCODING__
- rescue Encoding::ConverterNotFoundError
- end
- end
-
- def encode_ary(ary, encoding)
- ary.map { |row| row.map { |field| field.encode(encoding) } }
- end
-
- def ary_to_data(ary, **options)
- encoding = ary.flatten.first.encoding
- quote_char = (options[:quote_char] || '"').encode(encoding)
- col_sep = (options[:col_sep] || ",").encode(encoding)
- row_sep = (options[:row_sep] || "\n").encode(encoding)
- ary.map { |row|
- row.map { |field|
- [quote_char, field.encode(encoding), quote_char].join('')
- }.join(col_sep) + row_sep
- }.join('').encode(encoding)
- end
-
- def encode_for_tests(data, **options)
- yield ary_to_data(encode_ary(data, "UTF-8"), **options)
- yield ary_to_data(encode_ary(data, "UTF-16BE"), **options)
- end
-
- def each_encoding
- Encoding.list.each do |encoding|
- next if encoding.dummy? # skip "dummy" encodings
- yield encoding
- end
- end
-
- def no_warnings
- old_verbose, $VERBOSE = $VERBOSE, nil
- yield
- ensure
- $VERBOSE = old_verbose
- end
-end