From 4a5d372ca8902a649928eb0689aca7edcfaa07b6 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Fri, 24 Dec 2021 10:18:18 +0900
Subject: [ruby/csv] parser: fix a keep bug that some texts may be dropped unexpectedly

Ruby: [Bug #18245] [ruby-core:105587]

Reported by Hassan Abdul Rehman.

https://github.com/ruby/csv/commit/5c6523da0a
---
 lib/csv/parser.rb                     | 85 +++++++++++++++++++++--------------
 test/csv/parse/test_inputs_scanner.rb | 37 +++++++++++++++
 2 files changed, 88 insertions(+), 34 deletions(-)
 create mode 100644 test/csv/parse/test_inputs_scanner.rb

diff --git a/lib/csv/parser.rb b/lib/csv/parser.rb
index ef33a69478..e1fe559a41 100644
--- a/lib/csv/parser.rb
+++ b/lib/csv/parser.rb
@@ -166,6 +166,7 @@ class CSV
       end
 
       def keep_start
+        adjust_last_keep
         @keeps.push([@scanner.pos, nil])
       end
 
@@ -196,7 +197,17 @@ class CSV
       end
 
       def keep_drop
-        @keeps.pop
+        _, buffer = @keeps.pop
+        return unless buffer
+
+        last_keep = @keeps.last
+        return unless last_keep
+
+        if last_keep[1]
+          last_keep[1] << buffer
+        else
+          last_keep[1] = buffer
+        end
       end
 
       def rest
@@ -204,24 +215,30 @@ class CSV
       end
 
       private
+      def adjust_last_keep
+        keep = @keeps.last
+        return if keep.nil?
+
+        keep_start = keep[0]
+        return if @scanner.pos == keep_start
+
+        string = @scanner.string
+        keep_data = string.byteslice(keep_start, @scanner.pos - keep_start)
+        if keep_data
+          keep_buffer = keep[1]
+          if keep_buffer
+            keep_buffer << keep_data
+          else
+            keep[1] = keep_data.dup
+          end
+        end
+        keep[0] = 0
+      end
+
       def read_chunk
         return false if @last_scanner
 
-        unless @keeps.empty?
-          keep = @keeps.last
-          keep_start = keep[0]
-          string = @scanner.string
-          keep_data = string.byteslice(keep_start, @scanner.pos - keep_start)
-          if keep_data
-            keep_buffer = keep[1]
-            if keep_buffer
-              keep_buffer << keep_data
-            else
-              keep[1] = keep_data.dup
-            end
-          end
-          keep[0] = 0
-        end
+        adjust_last_keep
 
         input = @inputs.first
         case input
@@ -728,28 +745,26 @@ class CSV
       sample[0, 128].index(@quote_character)
     end
 
-    SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
-    if SCANNER_TEST
-      class UnoptimizedStringIO
-        def initialize(string)
-          @io = StringIO.new(string, "rb:#{string.encoding}")
-        end
+    class UnoptimizedStringIO # :nodoc:
+      def initialize(string)
+        @io = StringIO.new(string, "rb:#{string.encoding}")
+      end
 
-        def gets(*args)
-          @io.gets(*args)
-        end
+      def gets(*args)
+        @io.gets(*args)
+      end
 
-        def each_line(*args, &block)
-          @io.each_line(*args, &block)
-        end
+      def each_line(*args, &block)
+        @io.each_line(*args, &block)
+      end
 
-        def eof?
-          @io.eof?
-        end
+      def eof?
+        @io.eof?
       end
+    end
 
-      SCANNER_TEST_CHUNK_SIZE =
-        Integer((ENV["CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"] || "1"), 10)
+    SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
+    if SCANNER_TEST
       def build_scanner
         inputs = @samples.collect do |sample|
           UnoptimizedStringIO.new(sample)
@@ -759,9 +774,11 @@ class CSV
         else
           inputs << @input
         end
+        chunk_size =
+          Integer((ENV["CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"] || "1"), 10)
         InputsScanner.new(inputs,
                           @encoding,
-                          chunk_size: SCANNER_TEST_CHUNK_SIZE)
+                          chunk_size: chunk_size)
       end
     else
       def build_scanner
diff --git a/test/csv/parse/test_inputs_scanner.rb b/test/csv/parse/test_inputs_scanner.rb
new file mode 100644
index 0000000000..dd0a64cc45
--- /dev/null
+++ b/test/csv/parse/test_inputs_scanner.rb
@@ -0,0 +1,37 @@
+require_relative "../helper"
+
+class TestCSVParseInputsScanner < Test::Unit::TestCase
+  include Helper
+
+  def test_keep_over_chunks_nested_back
+    input = CSV::Parser::UnoptimizedStringIO.new("abcdefghijklmnl")
+    scanner = CSV::Parser::InputsScanner.new([input],
+                                             Encoding::UTF_8,
+                                             nil,
+                                             chunk_size: 2)
+    scanner.keep_start
+    assert_equal("abc", scanner.scan_all(/[a-c]+/))
+    scanner.keep_start
+    assert_equal("def", scanner.scan_all(/[d-f]+/))
+    scanner.keep_back
+    scanner.keep_back
+    assert_equal("abcdefg", scanner.scan_all(/[a-g]+/))
+  end
+
+
+  def test_keep_over_chunks_nested_drop_back
+    input = CSV::Parser::UnoptimizedStringIO.new("abcdefghijklmnl")
+    scanner = CSV::Parser::InputsScanner.new([input],
+                                             Encoding::UTF_8,
+                                             nil,
+                                             chunk_size: 3)
+    scanner.keep_start
+    assert_equal("ab", scanner.scan(/../))
+    scanner.keep_start
+    assert_equal("c", scanner.scan(/./))
+    assert_equal("d", scanner.scan(/./))
+    scanner.keep_drop
+    scanner.keep_back
+    assert_equal("abcdefg", scanner.scan_all(/[a-g]+/))
+  end
+end
-- 
cgit v1.2.3