diff options
Diffstat (limited to 'lib/csv/parser.rb')
-rw-r--r-- | lib/csv/parser.rb | 1288 |
1 files changed, 0 insertions, 1288 deletions
diff --git a/lib/csv/parser.rb b/lib/csv/parser.rb deleted file mode 100644 index 1f8b150e19..0000000000 --- a/lib/csv/parser.rb +++ /dev/null @@ -1,1288 +0,0 @@ -# frozen_string_literal: true - -require "strscan" - -require_relative "input_record_separator" -require_relative "row" -require_relative "table" - -class CSV - # Note: Don't use this class directly. This is an internal class. - class Parser - # - # A CSV::Parser is m17n aware. The parser works in the Encoding of the IO - # or String object being read from or written to. Your data is never transcoded - # (unless you ask Ruby to transcode it for you) and will literally be parsed in - # the Encoding it is in. Thus CSV will return Arrays or Rows of Strings in the - # Encoding of your data. This is accomplished by transcoding the parser itself - # into your Encoding. - # - - # Raised when encoding is invalid. - class InvalidEncoding < StandardError - end - - # Raised when unexpected case is happen. - class UnexpectedError < StandardError - end - - # - # CSV::Scanner receives a CSV output, scans it and return the content. - # It also controls the life cycle of the object with its methods +keep_start+, - # +keep_end+, +keep_back+, +keep_drop+. - # - # Uses StringScanner (the official strscan gem). Strscan provides lexical - # scanning operations on a String. We inherit its object and take advantage - # on the methods. For more information, please visit: - # https://ruby-doc.org/stdlib-2.6.1/libdoc/strscan/rdoc/StringScanner.html - # - class Scanner < StringScanner - alias_method :scan_all, :scan - - def initialize(*args) - super - @keeps = [] - end - - def each_line(row_separator) - position = pos - rest.each_line(row_separator) do |line| - position += line.bytesize - self.pos = position - yield(line) - end - end - - def keep_start - @keeps.push(pos) - end - - def keep_end - start = @keeps.pop - string.byteslice(start, pos - start) - end - - def keep_back - self.pos = @keeps.pop - end - - def keep_drop - @keeps.pop - end - end - - # - # CSV::InputsScanner receives IO inputs, encoding and the chunk_size. - # It also controls the life cycle of the object with its methods +keep_start+, - # +keep_end+, +keep_back+, +keep_drop+. - # - # CSV::InputsScanner.scan() tries to match with pattern at the current position. - # If there's a match, the scanner advances the "scan pointer" and returns the matched string. - # Otherwise, the scanner returns nil. - # - # CSV::InputsScanner.rest() returns the "rest" of the string (i.e. everything after the scan pointer). - # If there is no more data (eos? = true), it returns "". - # - class InputsScanner - def initialize(inputs, encoding, row_separator, chunk_size: 8192) - @inputs = inputs.dup - @encoding = encoding - @row_separator = row_separator - @chunk_size = chunk_size - @last_scanner = @inputs.empty? - @keeps = [] - read_chunk - end - - def each_line(row_separator) - return enum_for(__method__, row_separator) unless block_given? - buffer = nil - input = @scanner.rest - position = @scanner.pos - offset = 0 - n_row_separator_chars = row_separator.size - # trace(__method__, :start, line, input) - while true - input.each_line(row_separator) do |line| - @scanner.pos += line.bytesize - if buffer - if n_row_separator_chars == 2 and - buffer.end_with?(row_separator[0]) and - line.start_with?(row_separator[1]) - buffer << line[0] - line = line[1..-1] - position += buffer.bytesize + offset - @scanner.pos = position - offset = 0 - yield(buffer) - buffer = nil - next if line.empty? - else - buffer << line - line = buffer - buffer = nil - end - end - if line.end_with?(row_separator) - position += line.bytesize + offset - @scanner.pos = position - offset = 0 - yield(line) - else - buffer = line - end - end - break unless read_chunk - input = @scanner.rest - position = @scanner.pos - offset = -buffer.bytesize if buffer - end - yield(buffer) if buffer - end - - def scan(pattern) - # trace(__method__, pattern, :start) - value = @scanner.scan(pattern) - # trace(__method__, pattern, :done, :last, value) if @last_scanner - return value if @last_scanner - - read_chunk if value and @scanner.eos? - # trace(__method__, pattern, :done, value) - value - end - - def scan_all(pattern) - # trace(__method__, pattern, :start) - value = @scanner.scan(pattern) - # trace(__method__, pattern, :done, :last, value) if @last_scanner - return value if @last_scanner - - return nil if value.nil? - while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern)) - # trace(__method__, pattern, :sub, sub_value) - value << sub_value - end - # trace(__method__, pattern, :done, value) - value - end - - def eos? - @scanner.eos? - end - - def keep_start - # trace(__method__, :start) - adjust_last_keep - @keeps.push([@scanner, @scanner.pos, nil]) - # trace(__method__, :done) - end - - def keep_end - # trace(__method__, :start) - scanner, start, buffer = @keeps.pop - if scanner == @scanner - keep = @scanner.string.byteslice(start, @scanner.pos - start) - else - keep = @scanner.string.byteslice(0, @scanner.pos) - end - if buffer - buffer << keep - keep = buffer - end - # trace(__method__, :done, keep) - keep - end - - def keep_back - # trace(__method__, :start) - scanner, start, buffer = @keeps.pop - if buffer - # trace(__method__, :rescan, start, buffer) - string = @scanner.string - if scanner == @scanner - keep = string.byteslice(start, string.bytesize - start) - else - keep = string - end - if keep and not keep.empty? - @inputs.unshift(StringIO.new(keep)) - @last_scanner = false - end - @scanner = StringScanner.new(buffer) - else - if @scanner != scanner - message = "scanners are different but no buffer: " - message += "#{@scanner.inspect}(#{@scanner.object_id}): " - message += "#{scanner.inspect}(#{scanner.object_id})" - raise UnexpectedError, message - end - # trace(__method__, :repos, start, buffer) - @scanner.pos = start - end - read_chunk if @scanner.eos? - end - - def keep_drop - _, _, buffer = @keeps.pop - # trace(__method__, :done, :empty) unless buffer - return unless buffer - - last_keep = @keeps.last - # trace(__method__, :done, :no_last_keep) unless last_keep - return unless last_keep - - if last_keep[2] - last_keep[2] << buffer - else - last_keep[2] = buffer - end - # trace(__method__, :done) - end - - def rest - @scanner.rest - end - - def check(pattern) - @scanner.check(pattern) - end - - private - def trace(*args) - pp([*args, @scanner, @scanner&.string, @scanner&.pos, @keeps]) - end - - def adjust_last_keep - # trace(__method__, :start) - - keep = @keeps.last - # trace(__method__, :done, :empty) if keep.nil? - return if keep.nil? - - scanner, start, buffer = keep - string = @scanner.string - if @scanner != scanner - start = 0 - end - if start == 0 and @scanner.eos? - keep_data = string - else - keep_data = string.byteslice(start, @scanner.pos - start) - end - if keep_data - if buffer - buffer << keep_data - else - keep[2] = keep_data.dup - end - end - - # trace(__method__, :done) - end - - def read_chunk - return false if @last_scanner - - adjust_last_keep - - input = @inputs.first - case input - when StringIO - string = input.read - raise InvalidEncoding unless string.valid_encoding? - # trace(__method__, :stringio, string) - @scanner = StringScanner.new(string) - @inputs.shift - @last_scanner = @inputs.empty? - true - else - chunk = input.gets(@row_separator, @chunk_size) - if chunk - raise InvalidEncoding unless chunk.valid_encoding? - # trace(__method__, :chunk, chunk) - @scanner = StringScanner.new(chunk) - if input.respond_to?(:eof?) and input.eof? - @inputs.shift - @last_scanner = @inputs.empty? - end - true - else - # trace(__method__, :no_chunk) - @scanner = StringScanner.new("".encode(@encoding)) - @inputs.shift - @last_scanner = @inputs.empty? - if @last_scanner - false - else - read_chunk - end - end - end - end - end - - def initialize(input, options) - @input = input - @options = options - @samples = [] - - prepare - end - - def column_separator - @column_separator - end - - def row_separator - @row_separator - end - - def quote_character - @quote_character - end - - def field_size_limit - @max_field_size&.succ - end - - def max_field_size - @max_field_size - end - - def skip_lines - @skip_lines - end - - def unconverted_fields? - @unconverted_fields - end - - def headers - @headers - end - - def header_row? - @use_headers and @headers.nil? - end - - def return_headers? - @return_headers - end - - def skip_blanks? - @skip_blanks - end - - def liberal_parsing? - @liberal_parsing - end - - def lineno - @lineno - end - - def line - last_line - end - - def parse(&block) - return to_enum(__method__) unless block_given? - - if @return_headers and @headers and @raw_headers - headers = Row.new(@headers, @raw_headers, true) - if @unconverted_fields - headers = add_unconverted_fields(headers, []) - end - yield headers - end - - begin - @scanner ||= build_scanner - if quote_character.nil? - parse_no_quote(&block) - elsif @need_robust_parsing - parse_quotable_robust(&block) - else - parse_quotable_loose(&block) - end - rescue InvalidEncoding - if @scanner - ignore_broken_line - lineno = @lineno - else - lineno = @lineno + 1 - end - message = "Invalid byte sequence in #{@encoding}" - raise MalformedCSVError.new(message, lineno) - rescue UnexpectedError => error - if @scanner - ignore_broken_line - lineno = @lineno - else - lineno = @lineno + 1 - end - message = "This should not be happen: #{error.message}: " - message += "Please report this to https://github.com/ruby/csv/issues" - raise MalformedCSVError.new(message, lineno) - end - end - - def use_headers? - @use_headers - end - - private - # A set of tasks to prepare the file in order to parse it - def prepare - prepare_variable - prepare_quote_character - prepare_backslash - prepare_skip_lines - prepare_strip - prepare_separators - validate_strip_and_col_sep_options - prepare_quoted - prepare_unquoted - prepare_line - prepare_header - prepare_parser - end - - def prepare_variable - @need_robust_parsing = false - @encoding = @options[:encoding] - liberal_parsing = @options[:liberal_parsing] - if liberal_parsing - @liberal_parsing = true - if liberal_parsing.is_a?(Hash) - @double_quote_outside_quote = - liberal_parsing[:double_quote_outside_quote] - @backslash_quote = liberal_parsing[:backslash_quote] - else - @double_quote_outside_quote = false - @backslash_quote = false - end - @need_robust_parsing = true - else - @liberal_parsing = false - @backslash_quote = false - end - @unconverted_fields = @options[:unconverted_fields] - @max_field_size = @options[:max_field_size] - @skip_blanks = @options[:skip_blanks] - @fields_converter = @options[:fields_converter] - @header_fields_converter = @options[:header_fields_converter] - end - - def prepare_quote_character - @quote_character = @options[:quote_character] - if @quote_character.nil? - @escaped_quote_character = nil - @escaped_quote = nil - else - @quote_character = @quote_character.to_s.encode(@encoding) - if @quote_character.length != 1 - message = ":quote_char has to be nil or a single character String" - raise ArgumentError, message - end - @escaped_quote_character = Regexp.escape(@quote_character) - @escaped_quote = Regexp.new(@escaped_quote_character) - end - end - - def prepare_backslash - return unless @backslash_quote - - @backslash_character = "\\".encode(@encoding) - - @escaped_backslash_character = Regexp.escape(@backslash_character) - @escaped_backslash = Regexp.new(@escaped_backslash_character) - if @quote_character.nil? - @backslash_quote_character = nil - else - @backslash_quote_character = - @backslash_character + @escaped_quote_character - end - end - - def prepare_skip_lines - skip_lines = @options[:skip_lines] - case skip_lines - when String - @skip_lines = skip_lines.encode(@encoding) - when Regexp, nil - @skip_lines = skip_lines - else - unless skip_lines.respond_to?(:match) - message = - ":skip_lines has to respond to \#match: #{skip_lines.inspect}" - raise ArgumentError, message - end - @skip_lines = skip_lines - end - end - - def prepare_strip - @strip = @options[:strip] - @escaped_strip = nil - @strip_value = nil - @rstrip_value = nil - if @strip.is_a?(String) - case @strip.length - when 0 - raise ArgumentError, ":strip must not be an empty String" - when 1 - # ok - else - raise ArgumentError, ":strip doesn't support 2 or more characters yet" - end - @strip = @strip.encode(@encoding) - @escaped_strip = Regexp.escape(@strip) - if @quote_character - @strip_value = Regexp.new(@escaped_strip + - "+".encode(@encoding)) - @rstrip_value = Regexp.new(@escaped_strip + - "+\\z".encode(@encoding)) - end - @need_robust_parsing = true - elsif @strip - strip_values = " \t\f\v" - @escaped_strip = strip_values.encode(@encoding) - if @quote_character - @strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding)) - @rstrip_value = Regexp.new("[#{strip_values}]+\\z".encode(@encoding)) - end - @need_robust_parsing = true - end - end - - begin - StringScanner.new("x").scan("x") - rescue TypeError - STRING_SCANNER_SCAN_ACCEPT_STRING = false - else - STRING_SCANNER_SCAN_ACCEPT_STRING = true - end - - def prepare_separators - column_separator = @options[:column_separator] - @column_separator = column_separator.to_s.encode(@encoding) - if @column_separator.size < 1 - message = ":col_sep must be 1 or more characters: " - message += column_separator.inspect - raise ArgumentError, message - end - @row_separator = - resolve_row_separator(@options[:row_separator]).encode(@encoding) - - @escaped_column_separator = Regexp.escape(@column_separator) - @escaped_first_column_separator = Regexp.escape(@column_separator[0]) - if @column_separator.size > 1 - @column_end = Regexp.new(@escaped_column_separator) - @column_ends = @column_separator.each_char.collect do |char| - Regexp.new(Regexp.escape(char)) - end - @first_column_separators = Regexp.new(@escaped_first_column_separator + - "+".encode(@encoding)) - else - if STRING_SCANNER_SCAN_ACCEPT_STRING - @column_end = @column_separator - else - @column_end = Regexp.new(@escaped_column_separator) - end - @column_ends = nil - @first_column_separators = nil - end - - escaped_row_separator = Regexp.escape(@row_separator) - @row_end = Regexp.new(escaped_row_separator) - if @row_separator.size > 1 - @row_ends = @row_separator.each_char.collect do |char| - Regexp.new(Regexp.escape(char)) - end - else - @row_ends = nil - end - - @cr = "\r".encode(@encoding) - @lf = "\n".encode(@encoding) - @line_end = Regexp.new("\r\n|\n|\r".encode(@encoding)) - @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding)) - end - - # This method verifies that there are no (obvious) ambiguities with the - # provided +col_sep+ and +strip+ parsing options. For example, if +col_sep+ - # and +strip+ were both equal to +\t+, then there would be no clear way to - # parse the input. - def validate_strip_and_col_sep_options - return unless @strip - - if @strip.is_a?(String) - if @column_separator.start_with?(@strip) || @column_separator.end_with?(@strip) - raise ArgumentError, - "The provided strip (#{@escaped_strip}) and " \ - "col_sep (#{@escaped_column_separator}) options are incompatible." - end - else - if Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z").match?(@column_separator) - raise ArgumentError, - "The provided strip (true) and " \ - "col_sep (#{@escaped_column_separator}) options are incompatible." - end - end - end - - def prepare_quoted - if @quote_character - @quotes = Regexp.new(@escaped_quote_character + - "+".encode(@encoding)) - no_quoted_values = @escaped_quote_character.dup - if @backslash_quote - no_quoted_values << @escaped_backslash_character - end - @quoted_value = Regexp.new("[^".encode(@encoding) + - no_quoted_values + - "]+".encode(@encoding)) - end - if @escaped_strip - @split_column_separator = Regexp.new(@escaped_strip + - "*".encode(@encoding) + - @escaped_column_separator + - @escaped_strip + - "*".encode(@encoding)) - else - if @column_separator == " ".encode(@encoding) - @split_column_separator = Regexp.new(@escaped_column_separator) - else - @split_column_separator = @column_separator - end - end - end - - def prepare_unquoted - return if @quote_character.nil? - - no_unquoted_values = "\r\n".encode(@encoding) - no_unquoted_values << @escaped_first_column_separator - unless @liberal_parsing - no_unquoted_values << @escaped_quote_character - end - @unquoted_value = Regexp.new("[^".encode(@encoding) + - no_unquoted_values + - "]+".encode(@encoding)) - end - - def resolve_row_separator(separator) - if separator == :auto - cr = "\r".encode(@encoding) - lf = "\n".encode(@encoding) - if @input.is_a?(StringIO) - pos = @input.pos - separator = detect_row_separator(@input.read, cr, lf) - @input.seek(pos) - elsif @input.respond_to?(:gets) - if @input.is_a?(File) - chunk_size = 32 * 1024 - else - chunk_size = 1024 - end - begin - while separator == :auto - # - # if we run out of data, it's probably a single line - # (ensure will set default value) - # - break unless sample = @input.gets(nil, chunk_size) - - # extend sample if we're unsure of the line ending - if sample.end_with?(cr) - sample << (@input.gets(nil, 1) || "") - end - - @samples << sample - - separator = detect_row_separator(sample, cr, lf) - end - rescue IOError - # do nothing: ensure will set default - end - end - separator = InputRecordSeparator.value if separator == :auto - end - separator.to_s.encode(@encoding) - end - - def detect_row_separator(sample, cr, lf) - lf_index = sample.index(lf) - if lf_index - cr_index = sample[0, lf_index].index(cr) - else - cr_index = sample.index(cr) - end - if cr_index and lf_index - if cr_index + 1 == lf_index - cr + lf - elsif cr_index < lf_index - cr - else - lf - end - elsif cr_index - cr - elsif lf_index - lf - else - :auto - end - end - - def prepare_line - @lineno = 0 - @last_line = nil - @scanner = nil - end - - def last_line - if @scanner - @last_line ||= @scanner.keep_end - else - @last_line - end - end - - def prepare_header - @return_headers = @options[:return_headers] - - headers = @options[:headers] - case headers - when Array - @raw_headers = headers - quoted_fields = [false] * @raw_headers.size - @use_headers = true - when String - @raw_headers, quoted_fields = parse_headers(headers) - @use_headers = true - when nil, false - @raw_headers = nil - @use_headers = false - else - @raw_headers = nil - @use_headers = true - end - if @raw_headers - @headers = adjust_headers(@raw_headers, quoted_fields) - else - @headers = nil - end - end - - def parse_headers(row) - quoted_fields = [] - converter = lambda do |field, info| - quoted_fields << info.quoted? - field - end - headers = CSV.parse_line(row, - col_sep: @column_separator, - row_sep: @row_separator, - quote_char: @quote_character, - converters: [converter]) - [headers, quoted_fields] - end - - def adjust_headers(headers, quoted_fields) - adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno, quoted_fields) - adjusted_headers.each {|h| h.freeze if h.is_a? String} - adjusted_headers - end - - def prepare_parser - @may_quoted = may_quoted? - end - - def may_quoted? - return false if @quote_character.nil? - - if @input.is_a?(StringIO) - pos = @input.pos - sample = @input.read - @input.seek(pos) - else - return false if @samples.empty? - sample = @samples.first - end - sample[0, 128].index(@quote_character) - end - - class UnoptimizedStringIO # :nodoc: - def initialize(string) - @io = StringIO.new(string, "rb:#{string.encoding}") - end - - def gets(*args) - @io.gets(*args) - end - - def each_line(*args, &block) - @io.each_line(*args, &block) - end - - def eof? - @io.eof? - end - end - - SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes") - if SCANNER_TEST - SCANNER_TEST_CHUNK_SIZE_NAME = "CSV_PARSER_SCANNER_TEST_CHUNK_SIZE" - SCANNER_TEST_CHUNK_SIZE_VALUE = ENV[SCANNER_TEST_CHUNK_SIZE_NAME] - def build_scanner - inputs = @samples.collect do |sample| - UnoptimizedStringIO.new(sample) - end - if @input.is_a?(StringIO) - inputs << UnoptimizedStringIO.new(@input.read) - else - inputs << @input - end - begin - chunk_size_value = ENV[SCANNER_TEST_CHUNK_SIZE_NAME] - rescue # Ractor::IsolationError - # Ractor on Ruby 3.0 can't read ENV value. - chunk_size_value = SCANNER_TEST_CHUNK_SIZE_VALUE - end - chunk_size = Integer((chunk_size_value || "1"), 10) - InputsScanner.new(inputs, - @encoding, - @row_separator, - chunk_size: chunk_size) - end - else - def build_scanner - string = nil - if @samples.empty? and @input.is_a?(StringIO) - string = @input.read - elsif @samples.size == 1 and - @input != ARGF and - @input.respond_to?(:eof?) and - @input.eof? - string = @samples[0] - end - if string - unless string.valid_encoding? - index = string.lines(@row_separator).index do |line| - !line.valid_encoding? - end - if index - message = "Invalid byte sequence in #{@encoding}" - raise MalformedCSVError.new(message, @lineno + index + 1) - end - end - Scanner.new(string) - else - inputs = @samples.collect do |sample| - StringIO.new(sample) - end - inputs << @input - InputsScanner.new(inputs, @encoding, @row_separator) - end - end - end - - def skip_needless_lines - return unless @skip_lines - - until @scanner.eos? - @scanner.keep_start - line = @scanner.scan_all(@not_line_end) || "".encode(@encoding) - line << @row_separator if parse_row_end - if skip_line?(line) - @lineno += 1 - @scanner.keep_drop - else - @scanner.keep_back - return - end - end - end - - def skip_line?(line) - line = line.delete_suffix(@row_separator) - case @skip_lines - when String - line.include?(@skip_lines) - when Regexp - @skip_lines.match?(line) - else - @skip_lines.match(line) - end - end - - def validate_field_size(field) - return unless @max_field_size - return if field.size <= @max_field_size - ignore_broken_line - message = "Field size exceeded: #{field.size} > #{@max_field_size}" - raise MalformedCSVError.new(message, @lineno) - end - - def parse_no_quote(&block) - @scanner.each_line(@row_separator) do |line| - next if @skip_lines and skip_line?(line) - original_line = line - line = line.delete_suffix(@row_separator) - - if line.empty? - next if @skip_blanks - row = [] - quoted_fields = [] - else - line = strip_value(line) - row = line.split(@split_column_separator, -1) - quoted_fields = [false] * row.size - if @max_field_size - row.each do |column| - validate_field_size(column) - end - end - n_columns = row.size - i = 0 - while i < n_columns - row[i] = nil if row[i].empty? - i += 1 - end - end - @last_line = original_line - emit_row(row, quoted_fields, &block) - end - end - - def parse_quotable_loose(&block) - @scanner.keep_start - @scanner.each_line(@row_separator) do |line| - if @skip_lines and skip_line?(line) - @scanner.keep_drop - @scanner.keep_start - next - end - original_line = line - line = line.delete_suffix(@row_separator) - - if line.empty? - if @skip_blanks - @scanner.keep_drop - @scanner.keep_start - next - end - row = [] - quoted_fields = [] - elsif line.include?(@cr) or line.include?(@lf) - @scanner.keep_back - @need_robust_parsing = true - return parse_quotable_robust(&block) - else - row = line.split(@split_column_separator, -1) - quoted_fields = [] - n_columns = row.size - i = 0 - while i < n_columns - column = row[i] - if column.empty? - quoted_fields << false - row[i] = nil - else - n_quotes = column.count(@quote_character) - if n_quotes.zero? - quoted_fields << false - # no quote - elsif n_quotes == 2 and - column.start_with?(@quote_character) and - column.end_with?(@quote_character) - quoted_fields << true - row[i] = column[1..-2] - else - @scanner.keep_back - @need_robust_parsing = true - return parse_quotable_robust(&block) - end - validate_field_size(row[i]) - end - i += 1 - end - end - @scanner.keep_drop - @scanner.keep_start - @last_line = original_line - emit_row(row, quoted_fields, &block) - end - @scanner.keep_drop - end - - def parse_quotable_robust(&block) - row = [] - quoted_fields = [] - skip_needless_lines - start_row - while true - @quoted_column_value = false - @unquoted_column_value = false - @scanner.scan_all(@strip_value) if @strip_value - value = parse_column_value - if value - @scanner.scan_all(@strip_value) if @strip_value - validate_field_size(value) - end - if parse_column_end - row << value - quoted_fields << @quoted_column_value - elsif parse_row_end - if row.empty? and value.nil? - emit_row([], [], &block) unless @skip_blanks - else - row << value - quoted_fields << @quoted_column_value - emit_row(row, quoted_fields, &block) - row = [] - quoted_fields = [] - end - skip_needless_lines - start_row - elsif @scanner.eos? - break if row.empty? and value.nil? - row << value - quoted_fields << @quoted_column_value - emit_row(row, quoted_fields, &block) - break - else - if @quoted_column_value - if liberal_parsing? and (new_line = @scanner.check(@line_end)) - message = - "Illegal end-of-line sequence outside of a quoted field " + - "<#{new_line.inspect}>" - else - message = "Any value after quoted field isn't allowed" - end - ignore_broken_line - raise MalformedCSVError.new(message, @lineno) - elsif @unquoted_column_value and - (new_line = @scanner.scan(@line_end)) - ignore_broken_line - message = "Unquoted fields do not allow new line " + - "<#{new_line.inspect}>" - raise MalformedCSVError.new(message, @lineno) - elsif @scanner.rest.start_with?(@quote_character) - ignore_broken_line - message = "Illegal quoting" - raise MalformedCSVError.new(message, @lineno) - elsif (new_line = @scanner.scan(@line_end)) - ignore_broken_line - message = "New line must be <#{@row_separator.inspect}> " + - "not <#{new_line.inspect}>" - raise MalformedCSVError.new(message, @lineno) - else - ignore_broken_line - raise MalformedCSVError.new("TODO: Meaningful message", - @lineno) - end - end - end - end - - def parse_column_value - if @liberal_parsing - quoted_value = parse_quoted_column_value - if quoted_value - @scanner.scan_all(@strip_value) if @strip_value - unquoted_value = parse_unquoted_column_value - if unquoted_value - if @double_quote_outside_quote - unquoted_value = unquoted_value.gsub(@quote_character * 2, - @quote_character) - if quoted_value.empty? # %Q{""...} case - return @quote_character + unquoted_value - end - end - @quote_character + quoted_value + @quote_character + unquoted_value - else - quoted_value - end - else - parse_unquoted_column_value - end - elsif @may_quoted - parse_quoted_column_value || - parse_unquoted_column_value - else - parse_unquoted_column_value || - parse_quoted_column_value - end - end - - def parse_unquoted_column_value - value = @scanner.scan_all(@unquoted_value) - return nil unless value - - @unquoted_column_value = true - if @first_column_separators - while true - @scanner.keep_start - is_column_end = @column_ends.all? do |column_end| - @scanner.scan(column_end) - end - @scanner.keep_back - break if is_column_end - sub_separator = @scanner.scan_all(@first_column_separators) - break if sub_separator.nil? - value << sub_separator - sub_value = @scanner.scan_all(@unquoted_value) - break if sub_value.nil? - value << sub_value - end - end - value.gsub!(@backslash_quote_character, @quote_character) if @backslash_quote - if @rstrip_value - value.gsub!(@rstrip_value, "") - end - value - end - - def parse_quoted_column_value - quotes = @scanner.scan_all(@quotes) - return nil unless quotes - - @quoted_column_value = true - n_quotes = quotes.size - if (n_quotes % 2).zero? - quotes[0, (n_quotes - 2) / 2] - else - value = quotes[0, n_quotes / 2] - while true - quoted_value = @scanner.scan_all(@quoted_value) - value << quoted_value if quoted_value - if @backslash_quote - if @scanner.scan(@escaped_backslash) - if @scanner.scan(@escaped_quote) - value << @quote_character - else - value << @backslash_character - end - next - end - end - - quotes = @scanner.scan_all(@quotes) - unless quotes - ignore_broken_line - message = "Unclosed quoted field" - raise MalformedCSVError.new(message, @lineno) - end - n_quotes = quotes.size - if n_quotes == 1 - break - else - value << quotes[0, n_quotes / 2] - break if (n_quotes % 2) == 1 - end - end - value - end - end - - def parse_column_end - return true if @scanner.scan(@column_end) - return false unless @column_ends - - @scanner.keep_start - if @column_ends.all? {|column_end| @scanner.scan(column_end)} - @scanner.keep_drop - true - else - @scanner.keep_back - false - end - end - - def parse_row_end - return true if @scanner.scan(@row_end) - return false unless @row_ends - @scanner.keep_start - if @row_ends.all? {|row_end| @scanner.scan(row_end)} - @scanner.keep_drop - true - else - @scanner.keep_back - false - end - end - - def strip_value(value) - return value unless @strip - return value if value.nil? - - case @strip - when String - while value.delete_prefix!(@strip) - # do nothing - end - while value.delete_suffix!(@strip) - # do nothing - end - else - value.strip! - end - value - end - - def ignore_broken_line - @scanner.scan_all(@not_line_end) - @scanner.scan_all(@line_end) - @lineno += 1 - end - - def start_row - if @last_line - @last_line = nil - else - @scanner.keep_drop - end - @scanner.keep_start - end - - def emit_row(row, quoted_fields, &block) - @lineno += 1 - - raw_row = row - if @use_headers - if @headers.nil? - @headers = adjust_headers(row, quoted_fields) - return unless @return_headers - row = Row.new(@headers, row, true) - else - row = Row.new(@headers, - @fields_converter.convert(raw_row, @headers, @lineno, quoted_fields)) - end - else - # convert fields, if needed... - row = @fields_converter.convert(raw_row, nil, @lineno, quoted_fields) - end - - # inject unconverted fields and accessor, if requested... - if @unconverted_fields and not row.respond_to?(:unconverted_fields) - add_unconverted_fields(row, raw_row) - end - - yield(row) - end - - # This method injects an instance variable <tt>unconverted_fields</tt> into - # +row+ and an accessor method for +row+ called unconverted_fields(). The - # variable is set to the contents of +fields+. - def add_unconverted_fields(row, fields) - class << row - attr_reader :unconverted_fields - end - row.instance_variable_set(:@unconverted_fields, fields) - row - end - end -end |