diff --git a/include/internal/basic_csv_parser.cpp b/include/internal/basic_csv_parser.cpp index 61b8f35..b948d39 100644 --- a/include/internal/basic_csv_parser.cpp +++ b/include/internal/basic_csv_parser.cpp @@ -139,8 +139,8 @@ namespace csv { case ParseFlags::NEWLINE: this->data_pos++; - // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines) - while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) + // Catches CRLF Only (not skip LFLF, CRCRLF, or any other non-sensical combination of newlines) + if (this->data_pos < in.size() && in[this->data_pos-1] == '\r' and in[this->data_pos] == '\n') this->data_pos++; // End of record -> Write record @@ -235,6 +235,7 @@ namespace csv { this->field_start = UNINITIALIZED_FIELD; this->field_length = 0; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->mmap_pos; // Create memory map size_t length = std::min(this->source_size - this->mmap_pos, bytes); diff --git a/include/internal/basic_csv_parser.hpp b/include/internal/basic_csv_parser.hpp index d76b2d9..d499b25 100644 --- a/include/internal/basic_csv_parser.hpp +++ b/include/internal/basic_csv_parser.hpp @@ -320,7 +320,11 @@ namespace csv { void next(size_t bytes = ITERATION_CHUNK_SIZE) override { if (this->eof()) return; + // Reset parser state + this->field_start = UNINITIALIZED_FIELD; + this->field_length = 0; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->stream_pos; this->data_ptr->_data = std::make_shared(); if (source_size == 0) { diff --git a/include/internal/csv_reader.cpp b/include/internal/csv_reader.cpp index 008a487..81e4348 100644 --- a/include/internal/csv_reader.cpp +++ b/include/internal/csv_reader.cpp @@ -280,10 +280,12 @@ namespace csv { return false; else { // Reading thread is not active => start another one - if (this->read_csv_worker.joinable()) - this->read_csv_worker.join(); + // if (this->read_csv_worker.joinable()) + // this->read_csv_worker.join(); + + // this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + this->read_csv(internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); } } else if (this->records->front().size() != this->n_cols && diff --git a/include/internal/csv_reader.hpp b/include/internal/csv_reader.hpp index 300e7da..cf75ed2 100644 --- a/include/internal/csv_reader.hpp +++ b/include/internal/csv_reader.hpp @@ -134,9 +134,9 @@ namespace csv { CSVReader& operator=(const CSVReader&) = delete; // No copy assignment CSVReader& operator=(CSVReader&& other) = default; ~CSVReader() { - if (this->read_csv_worker.joinable()) { - this->read_csv_worker.join(); - } + // if (this->read_csv_worker.joinable()) { + // this->read_csv_worker.join(); + // } } /** @name Retrieving CSV Rows */ @@ -216,13 +216,14 @@ namespace csv { /** @name Multi-Threaded File Reading: Flags and State */ ///@{ - std::thread read_csv_worker; /**< Worker thread for read_csv() */ + // std::thread read_csv_worker; /**< Worker thread for read_csv() */ ///@} /** Read initial chunk to get metadata */ void initial_read() { - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker.join(); + // this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + // this->read_csv_worker.join(); + this->read_csv(internals::ITERATION_CHUNK_SIZE); } void trim_header(); diff --git a/include/internal/csv_reader_iterator.cpp b/include/internal/csv_reader_iterator.cpp index 4ac0e6f..58d901d 100644 --- a/include/internal/csv_reader_iterator.cpp +++ b/include/internal/csv_reader_iterator.cpp @@ -8,8 +8,9 @@ namespace csv { /** Return an iterator to the first row in the reader */ CSV_INLINE CSVReader::iterator CSVReader::begin() { if (this->records->empty()) { - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker.join(); + // this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + // this->read_csv_worker.join(); + this->read_csv(internals::ITERATION_CHUNK_SIZE); // Still empty => return end iterator if (this->records->empty()) return this->end(); diff --git a/include/internal/csv_row.hpp b/include/internal/csv_row.hpp index 0ab935d..f12bb76 100644 --- a/include/internal/csv_row.hpp +++ b/include/internal/csv_row.hpp @@ -136,6 +136,9 @@ namespace csv { internals::ColNamesPtr col_names = nullptr; internals::ParseFlagMap parse_flags; internals::WhitespaceMap ws_flags; + + /** where in Stream we start */ + uint64_t _stream_pos = {}; }; using RawCSVDataPtr = std::shared_ptr; @@ -324,6 +327,9 @@ namespace csv { /** Return the number of fields in this row */ CONSTEXPR size_t size() const noexcept { return row_length; } + /** Where in the Stream we start */ + size_t current_row_start() const { return data->_stream_pos + data_start; } + /** @name Value Retrieval */ ///@{ CSVField operator[](size_t n) const; diff --git a/single_include/csv.hpp b/single_include/csv.hpp index 811c8e1..0e94b7b 100644 --- a/single_include/csv.hpp +++ b/single_include/csv.hpp @@ -5531,6 +5531,9 @@ namespace csv { internals::ColNamesPtr col_names = nullptr; internals::ParseFlagMap parse_flags; internals::WhitespaceMap ws_flags; + + /** where in Stream we start */ + uint64_t _stream_pos = {}; }; using RawCSVDataPtr = std::shared_ptr; @@ -5719,6 +5722,9 @@ namespace csv { /** Return the number of fields in this row */ CONSTEXPR size_t size() const noexcept { return row_length; } + /** Where in the Stream we start */ + size_t current_row_start() const { return data->_stream_pos + data_start; } + /** @name Value Retrieval */ ///@{ CSVField operator[](size_t n) const; @@ -6159,7 +6165,11 @@ namespace csv { void next(size_t bytes = ITERATION_CHUNK_SIZE) override { if (this->eof()) return; + // Reset parser state + this->field_start = UNINITIALIZED_FIELD; + this->field_length = 0; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->stream_pos; this->data_ptr->_data = std::make_shared(); if (source_size == 0) { @@ -6344,9 +6354,9 @@ namespace csv { CSVReader& operator=(const CSVReader&) = delete; // No copy assignment CSVReader& operator=(CSVReader&& other) = default; ~CSVReader() { - if (this->read_csv_worker.joinable()) { - this->read_csv_worker.join(); - } + // if (this->read_csv_worker.joinable()) { + // this->read_csv_worker.join(); + // } } /** @name Retrieving CSV Rows */ @@ -6426,13 +6436,14 @@ namespace csv { /** @name Multi-Threaded File Reading: Flags and State */ ///@{ - std::thread read_csv_worker; /**< Worker thread for read_csv() */ + // std::thread read_csv_worker; /**< Worker thread for read_csv() */ ///@} /** Read initial chunk to get metadata */ void initial_read() { - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker.join(); + // this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + // this->read_csv_worker.join(); + this->read_csv(internals::ITERATION_CHUNK_SIZE); } void trim_header(); @@ -6940,6 +6951,84 @@ namespace csv { ///@} } +#include +#include + + +namespace csv { + /** Shorthand function for parsing an in-memory CSV string + * + * @return A collection of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Parse Example + */ + CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { + std::stringstream stream(in.data()); + return CSVReader(stream, format); + } + + /** Parses a CSV string with no headers + * + * @return A collection of CSVRow objects + */ + CSV_INLINE CSVReader parse_no_header(csv::string_view in) { + CSVFormat format; + format.header_row(-1); + + return parse(in, format); + } + + /** Parse a RFC 4180 CSV string, returning a collection + * of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Escaped Comma + * + */ + CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { + return parse(csv::string_view(in, n)); + } + + /** A shorthand for csv::parse_no_header() */ + CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { + return parse_no_header(csv::string_view(in, n)); + } + + /** + * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise + * + * @param[in] filename Path to CSV file + * @param[in] col_name Column whose position we should resolve + * @param[in] format Format of the CSV file + */ + CSV_INLINE int get_col_pos( + csv::string_view filename, + csv::string_view col_name, + const CSVFormat& format) { + CSVReader reader(filename, format); + return reader.index_of(col_name); + } + + /** Get basic information about a CSV file + * @include programs/csv_info.cpp + */ + CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { + CSVReader reader(filename); + CSVFormat format = reader.get_format(); + for (auto it = reader.begin(); it != reader.end(); ++it); + + CSVFileInfo info = { + filename, + reader.get_col_names(), + format.get_delim(), + reader.n_rows(), + reader.get_col_names().size() + }; + + return info; + } +} namespace csv { namespace internals { @@ -7080,8 +7169,8 @@ namespace csv { case ParseFlags::NEWLINE: this->data_pos++; - // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines) - while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) + // Catches CRLF Only (not skip LFLF, CRCRLF, or any other non-sensical combination of newlines) + if (this->data_pos < in.size() && in[this->data_pos-1] == '\r' and in[this->data_pos] == '\n') this->data_pos++; // End of record -> Write record @@ -7176,6 +7265,7 @@ namespace csv { this->field_start = UNINITIALIZED_FIELD; this->field_length = 0; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->mmap_pos; // Create memory map size_t length = std::min(this->source_size - this->mmap_pos, bytes); @@ -7206,126 +7296,6 @@ namespace csv { } } - -namespace csv { - namespace internals { - CSV_INLINE std::vector ColNames::get_col_names() const { - return this->col_names; - } - - CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { - this->col_names = cnames; - - for (size_t i = 0; i < cnames.size(); i++) { - this->col_pos[cnames[i]] = i; - } - } - - CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { - auto pos = this->col_pos.find(col_name.data()); - if (pos != this->col_pos.end()) - return (int)pos->second; - - return CSV_NOT_FOUND; - } - - CSV_INLINE size_t ColNames::size() const noexcept { - return this->col_names.size(); - } - - } -} -/** @file - * Defines an object used to store CSV format settings - */ - -#include -#include - - -namespace csv { - CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { - this->possible_delimiters = { delim }; - this->assert_no_char_overlap(); - return *this; - } - - CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { - this->possible_delimiters = delim; - this->assert_no_char_overlap(); - return *this; - } - - CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { - this->no_quote = false; - this->quote_char = quote; - this->assert_no_char_overlap(); - return *this; - } - - CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { - this->trim_chars = chars; - this->assert_no_char_overlap(); - return *this; - } - - CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { - this->col_names = names; - this->header = -1; - return *this; - } - - CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { - if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; - - this->header = row; - this->col_names = {}; - return *this; - } - - CSV_INLINE void CSVFormat::assert_no_char_overlap() - { - auto delims = std::set( - this->possible_delimiters.begin(), this->possible_delimiters.end()), - trims = std::set( - this->trim_chars.begin(), this->trim_chars.end()); - - // Stores intersection of possible delimiters and trim characters - std::vector intersection = {}; - - // Find which characters overlap, if any - std::set_intersection( - delims.begin(), delims.end(), - trims.begin(), trims.end(), - std::back_inserter(intersection)); - - // Make sure quote character is not contained in possible delimiters - // or whitespace characters - if (delims.find(this->quote_char) != delims.end() || - trims.find(this->quote_char) != trims.end()) { - intersection.push_back(this->quote_char); - } - - if (!intersection.empty()) { - std::string err_msg = "There should be no overlap between the quote character, " - "the set of possible delimiters " - "and the set of whitespace characters. Offending characters: "; - - // Create a pretty error message with the list of overlapping - // characters - for (size_t i = 0; i < intersection.size(); i++) { - err_msg += "'"; - err_msg += intersection[i]; - err_msg += "'"; - - if (i + 1 < intersection.size()) - err_msg += ", "; - } - - throw std::runtime_error(err_msg + '.'); - } - } -} /** @file * @brief Defines functionality needed for basic CSV parsing */ @@ -7607,10 +7577,12 @@ namespace csv { return false; else { // Reading thread is not active => start another one - if (this->read_csv_worker.joinable()) - this->read_csv_worker.join(); + // if (this->read_csv_worker.joinable()) + // this->read_csv_worker.join(); + + // this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + this->read_csv(internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); } } else if (this->records->front().size() != this->n_cols && @@ -7644,8 +7616,9 @@ namespace csv { /** Return an iterator to the first row in the reader */ CSV_INLINE CSVReader::iterator CSVReader::begin() { if (this->records->empty()) { - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker.join(); + // this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + // this->read_csv_worker.join(); + this->read_csv(internals::ITERATION_CHUNK_SIZE); // Still empty => return end iterator if (this->records->empty()) return this->end(); @@ -7698,6 +7671,126 @@ namespace csv { } } + +namespace csv { + namespace internals { + CSV_INLINE std::vector ColNames::get_col_names() const { + return this->col_names; + } + + CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { + this->col_names = cnames; + + for (size_t i = 0; i < cnames.size(); i++) { + this->col_pos[cnames[i]] = i; + } + } + + CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { + auto pos = this->col_pos.find(col_name.data()); + if (pos != this->col_pos.end()) + return (int)pos->second; + + return CSV_NOT_FOUND; + } + + CSV_INLINE size_t ColNames::size() const noexcept { + return this->col_names.size(); + } + + } +} +/** @file + * Defines an object used to store CSV format settings + */ + +#include +#include + + +namespace csv { + CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { + this->possible_delimiters = { delim }; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { + this->possible_delimiters = delim; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { + this->no_quote = false; + this->quote_char = quote; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { + this->trim_chars = chars; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { + this->col_names = names; + this->header = -1; + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { + if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; + + this->header = row; + this->col_names = {}; + return *this; + } + + CSV_INLINE void CSVFormat::assert_no_char_overlap() + { + auto delims = std::set( + this->possible_delimiters.begin(), this->possible_delimiters.end()), + trims = std::set( + this->trim_chars.begin(), this->trim_chars.end()); + + // Stores intersection of possible delimiters and trim characters + std::vector intersection = {}; + + // Find which characters overlap, if any + std::set_intersection( + delims.begin(), delims.end(), + trims.begin(), trims.end(), + std::back_inserter(intersection)); + + // Make sure quote character is not contained in possible delimiters + // or whitespace characters + if (delims.find(this->quote_char) != delims.end() || + trims.find(this->quote_char) != trims.end()) { + intersection.push_back(this->quote_char); + } + + if (!intersection.empty()) { + std::string err_msg = "There should be no overlap between the quote character, " + "the set of possible delimiters " + "and the set of whitespace characters. Offending characters: "; + + // Create a pretty error message with the list of overlapping + // characters + for (size_t i = 0; i < intersection.size(); i++) { + err_msg += "'"; + err_msg += intersection[i]; + err_msg += "'"; + + if (i + 1 < intersection.size()) + err_msg += ", "; + } + + throw std::runtime_error(err_msg + '.'); + } + } +} /** @file * Defines the data type used for storing information about a CSV row */ @@ -8502,84 +8595,6 @@ namespace csv { return csv_dtypes; } } -#include -#include - - -namespace csv { - /** Shorthand function for parsing an in-memory CSV string - * - * @return A collection of CSVRow objects - * - * @par Example - * @snippet tests/test_read_csv.cpp Parse Example - */ - CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { - std::stringstream stream(in.data()); - return CSVReader(stream, format); - } - - /** Parses a CSV string with no headers - * - * @return A collection of CSVRow objects - */ - CSV_INLINE CSVReader parse_no_header(csv::string_view in) { - CSVFormat format; - format.header_row(-1); - - return parse(in, format); - } - - /** Parse a RFC 4180 CSV string, returning a collection - * of CSVRow objects - * - * @par Example - * @snippet tests/test_read_csv.cpp Escaped Comma - * - */ - CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { - return parse(csv::string_view(in, n)); - } - - /** A shorthand for csv::parse_no_header() */ - CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { - return parse_no_header(csv::string_view(in, n)); - } - - /** - * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise - * - * @param[in] filename Path to CSV file - * @param[in] col_name Column whose position we should resolve - * @param[in] format Format of the CSV file - */ - CSV_INLINE int get_col_pos( - csv::string_view filename, - csv::string_view col_name, - const CSVFormat& format) { - CSVReader reader(filename, format); - return reader.index_of(col_name); - } - - /** Get basic information about a CSV file - * @include programs/csv_info.cpp - */ - CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { - CSVReader reader(filename); - CSVFormat format = reader.get_format(); - for (auto it = reader.begin(); it != reader.end(); ++it); - - CSVFileInfo info = { - filename, - reader.get_col_names(), - format.get_delim(), - reader.n_rows(), - reader.get_col_names().size() - }; - - return info; - } -} #endif diff --git a/single_include_test/csv.hpp b/single_include_test/csv.hpp index 811c8e1..0e94b7b 100644 --- a/single_include_test/csv.hpp +++ b/single_include_test/csv.hpp @@ -5531,6 +5531,9 @@ namespace csv { internals::ColNamesPtr col_names = nullptr; internals::ParseFlagMap parse_flags; internals::WhitespaceMap ws_flags; + + /** where in Stream we start */ + uint64_t _stream_pos = {}; }; using RawCSVDataPtr = std::shared_ptr; @@ -5719,6 +5722,9 @@ namespace csv { /** Return the number of fields in this row */ CONSTEXPR size_t size() const noexcept { return row_length; } + /** Where in the Stream we start */ + size_t current_row_start() const { return data->_stream_pos + data_start; } + /** @name Value Retrieval */ ///@{ CSVField operator[](size_t n) const; @@ -6159,7 +6165,11 @@ namespace csv { void next(size_t bytes = ITERATION_CHUNK_SIZE) override { if (this->eof()) return; + // Reset parser state + this->field_start = UNINITIALIZED_FIELD; + this->field_length = 0; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->stream_pos; this->data_ptr->_data = std::make_shared(); if (source_size == 0) { @@ -6344,9 +6354,9 @@ namespace csv { CSVReader& operator=(const CSVReader&) = delete; // No copy assignment CSVReader& operator=(CSVReader&& other) = default; ~CSVReader() { - if (this->read_csv_worker.joinable()) { - this->read_csv_worker.join(); - } + // if (this->read_csv_worker.joinable()) { + // this->read_csv_worker.join(); + // } } /** @name Retrieving CSV Rows */ @@ -6426,13 +6436,14 @@ namespace csv { /** @name Multi-Threaded File Reading: Flags and State */ ///@{ - std::thread read_csv_worker; /**< Worker thread for read_csv() */ + // std::thread read_csv_worker; /**< Worker thread for read_csv() */ ///@} /** Read initial chunk to get metadata */ void initial_read() { - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker.join(); + // this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + // this->read_csv_worker.join(); + this->read_csv(internals::ITERATION_CHUNK_SIZE); } void trim_header(); @@ -6940,6 +6951,84 @@ namespace csv { ///@} } +#include +#include + + +namespace csv { + /** Shorthand function for parsing an in-memory CSV string + * + * @return A collection of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Parse Example + */ + CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { + std::stringstream stream(in.data()); + return CSVReader(stream, format); + } + + /** Parses a CSV string with no headers + * + * @return A collection of CSVRow objects + */ + CSV_INLINE CSVReader parse_no_header(csv::string_view in) { + CSVFormat format; + format.header_row(-1); + + return parse(in, format); + } + + /** Parse a RFC 4180 CSV string, returning a collection + * of CSVRow objects + * + * @par Example + * @snippet tests/test_read_csv.cpp Escaped Comma + * + */ + CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { + return parse(csv::string_view(in, n)); + } + + /** A shorthand for csv::parse_no_header() */ + CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { + return parse_no_header(csv::string_view(in, n)); + } + + /** + * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise + * + * @param[in] filename Path to CSV file + * @param[in] col_name Column whose position we should resolve + * @param[in] format Format of the CSV file + */ + CSV_INLINE int get_col_pos( + csv::string_view filename, + csv::string_view col_name, + const CSVFormat& format) { + CSVReader reader(filename, format); + return reader.index_of(col_name); + } + + /** Get basic information about a CSV file + * @include programs/csv_info.cpp + */ + CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { + CSVReader reader(filename); + CSVFormat format = reader.get_format(); + for (auto it = reader.begin(); it != reader.end(); ++it); + + CSVFileInfo info = { + filename, + reader.get_col_names(), + format.get_delim(), + reader.n_rows(), + reader.get_col_names().size() + }; + + return info; + } +} namespace csv { namespace internals { @@ -7080,8 +7169,8 @@ namespace csv { case ParseFlags::NEWLINE: this->data_pos++; - // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines) - while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE) + // Catches CRLF Only (not skip LFLF, CRCRLF, or any other non-sensical combination of newlines) + if (this->data_pos < in.size() && in[this->data_pos-1] == '\r' and in[this->data_pos] == '\n') this->data_pos++; // End of record -> Write record @@ -7176,6 +7265,7 @@ namespace csv { this->field_start = UNINITIALIZED_FIELD; this->field_length = 0; this->reset_data_ptr(); + this->data_ptr->_stream_pos = this->mmap_pos; // Create memory map size_t length = std::min(this->source_size - this->mmap_pos, bytes); @@ -7206,126 +7296,6 @@ namespace csv { } } - -namespace csv { - namespace internals { - CSV_INLINE std::vector ColNames::get_col_names() const { - return this->col_names; - } - - CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { - this->col_names = cnames; - - for (size_t i = 0; i < cnames.size(); i++) { - this->col_pos[cnames[i]] = i; - } - } - - CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { - auto pos = this->col_pos.find(col_name.data()); - if (pos != this->col_pos.end()) - return (int)pos->second; - - return CSV_NOT_FOUND; - } - - CSV_INLINE size_t ColNames::size() const noexcept { - return this->col_names.size(); - } - - } -} -/** @file - * Defines an object used to store CSV format settings - */ - -#include -#include - - -namespace csv { - CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { - this->possible_delimiters = { delim }; - this->assert_no_char_overlap(); - return *this; - } - - CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { - this->possible_delimiters = delim; - this->assert_no_char_overlap(); - return *this; - } - - CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { - this->no_quote = false; - this->quote_char = quote; - this->assert_no_char_overlap(); - return *this; - } - - CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { - this->trim_chars = chars; - this->assert_no_char_overlap(); - return *this; - } - - CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { - this->col_names = names; - this->header = -1; - return *this; - } - - CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { - if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; - - this->header = row; - this->col_names = {}; - return *this; - } - - CSV_INLINE void CSVFormat::assert_no_char_overlap() - { - auto delims = std::set( - this->possible_delimiters.begin(), this->possible_delimiters.end()), - trims = std::set( - this->trim_chars.begin(), this->trim_chars.end()); - - // Stores intersection of possible delimiters and trim characters - std::vector intersection = {}; - - // Find which characters overlap, if any - std::set_intersection( - delims.begin(), delims.end(), - trims.begin(), trims.end(), - std::back_inserter(intersection)); - - // Make sure quote character is not contained in possible delimiters - // or whitespace characters - if (delims.find(this->quote_char) != delims.end() || - trims.find(this->quote_char) != trims.end()) { - intersection.push_back(this->quote_char); - } - - if (!intersection.empty()) { - std::string err_msg = "There should be no overlap between the quote character, " - "the set of possible delimiters " - "and the set of whitespace characters. Offending characters: "; - - // Create a pretty error message with the list of overlapping - // characters - for (size_t i = 0; i < intersection.size(); i++) { - err_msg += "'"; - err_msg += intersection[i]; - err_msg += "'"; - - if (i + 1 < intersection.size()) - err_msg += ", "; - } - - throw std::runtime_error(err_msg + '.'); - } - } -} /** @file * @brief Defines functionality needed for basic CSV parsing */ @@ -7607,10 +7577,12 @@ namespace csv { return false; else { // Reading thread is not active => start another one - if (this->read_csv_worker.joinable()) - this->read_csv_worker.join(); + // if (this->read_csv_worker.joinable()) + // this->read_csv_worker.join(); + + // this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + this->read_csv(internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); } } else if (this->records->front().size() != this->n_cols && @@ -7644,8 +7616,9 @@ namespace csv { /** Return an iterator to the first row in the reader */ CSV_INLINE CSVReader::iterator CSVReader::begin() { if (this->records->empty()) { - this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); - this->read_csv_worker.join(); + // this->read_csv_worker = std::thread(&CSVReader::read_csv, this, internals::ITERATION_CHUNK_SIZE); + // this->read_csv_worker.join(); + this->read_csv(internals::ITERATION_CHUNK_SIZE); // Still empty => return end iterator if (this->records->empty()) return this->end(); @@ -7698,6 +7671,126 @@ namespace csv { } } + +namespace csv { + namespace internals { + CSV_INLINE std::vector ColNames::get_col_names() const { + return this->col_names; + } + + CSV_INLINE void ColNames::set_col_names(const std::vector& cnames) { + this->col_names = cnames; + + for (size_t i = 0; i < cnames.size(); i++) { + this->col_pos[cnames[i]] = i; + } + } + + CSV_INLINE int ColNames::index_of(csv::string_view col_name) const { + auto pos = this->col_pos.find(col_name.data()); + if (pos != this->col_pos.end()) + return (int)pos->second; + + return CSV_NOT_FOUND; + } + + CSV_INLINE size_t ColNames::size() const noexcept { + return this->col_names.size(); + } + + } +} +/** @file + * Defines an object used to store CSV format settings + */ + +#include +#include + + +namespace csv { + CSV_INLINE CSVFormat& CSVFormat::delimiter(char delim) { + this->possible_delimiters = { delim }; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::delimiter(const std::vector & delim) { + this->possible_delimiters = delim; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::quote(char quote) { + this->no_quote = false; + this->quote_char = quote; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::trim(const std::vector & chars) { + this->trim_chars = chars; + this->assert_no_char_overlap(); + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::column_names(const std::vector& names) { + this->col_names = names; + this->header = -1; + return *this; + } + + CSV_INLINE CSVFormat& CSVFormat::header_row(int row) { + if (row < 0) this->variable_column_policy = VariableColumnPolicy::KEEP; + + this->header = row; + this->col_names = {}; + return *this; + } + + CSV_INLINE void CSVFormat::assert_no_char_overlap() + { + auto delims = std::set( + this->possible_delimiters.begin(), this->possible_delimiters.end()), + trims = std::set( + this->trim_chars.begin(), this->trim_chars.end()); + + // Stores intersection of possible delimiters and trim characters + std::vector intersection = {}; + + // Find which characters overlap, if any + std::set_intersection( + delims.begin(), delims.end(), + trims.begin(), trims.end(), + std::back_inserter(intersection)); + + // Make sure quote character is not contained in possible delimiters + // or whitespace characters + if (delims.find(this->quote_char) != delims.end() || + trims.find(this->quote_char) != trims.end()) { + intersection.push_back(this->quote_char); + } + + if (!intersection.empty()) { + std::string err_msg = "There should be no overlap between the quote character, " + "the set of possible delimiters " + "and the set of whitespace characters. Offending characters: "; + + // Create a pretty error message with the list of overlapping + // characters + for (size_t i = 0; i < intersection.size(); i++) { + err_msg += "'"; + err_msg += intersection[i]; + err_msg += "'"; + + if (i + 1 < intersection.size()) + err_msg += ", "; + } + + throw std::runtime_error(err_msg + '.'); + } + } +} /** @file * Defines the data type used for storing information about a CSV row */ @@ -8502,84 +8595,6 @@ namespace csv { return csv_dtypes; } } -#include -#include - - -namespace csv { - /** Shorthand function for parsing an in-memory CSV string - * - * @return A collection of CSVRow objects - * - * @par Example - * @snippet tests/test_read_csv.cpp Parse Example - */ - CSV_INLINE CSVReader parse(csv::string_view in, CSVFormat format) { - std::stringstream stream(in.data()); - return CSVReader(stream, format); - } - - /** Parses a CSV string with no headers - * - * @return A collection of CSVRow objects - */ - CSV_INLINE CSVReader parse_no_header(csv::string_view in) { - CSVFormat format; - format.header_row(-1); - - return parse(in, format); - } - - /** Parse a RFC 4180 CSV string, returning a collection - * of CSVRow objects - * - * @par Example - * @snippet tests/test_read_csv.cpp Escaped Comma - * - */ - CSV_INLINE CSVReader operator ""_csv(const char* in, size_t n) { - return parse(csv::string_view(in, n)); - } - - /** A shorthand for csv::parse_no_header() */ - CSV_INLINE CSVReader operator ""_csv_no_header(const char* in, size_t n) { - return parse_no_header(csv::string_view(in, n)); - } - - /** - * Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise - * - * @param[in] filename Path to CSV file - * @param[in] col_name Column whose position we should resolve - * @param[in] format Format of the CSV file - */ - CSV_INLINE int get_col_pos( - csv::string_view filename, - csv::string_view col_name, - const CSVFormat& format) { - CSVReader reader(filename, format); - return reader.index_of(col_name); - } - - /** Get basic information about a CSV file - * @include programs/csv_info.cpp - */ - CSV_INLINE CSVFileInfo get_file_info(const std::string& filename) { - CSVReader reader(filename); - CSVFormat format = reader.get_format(); - for (auto it = reader.begin(); it != reader.end(); ++it); - - CSVFileInfo info = { - filename, - reader.get_col_names(), - format.get_delim(), - reader.n_rows(), - reader.get_col_names().size() - }; - - return info; - } -} #endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e5b57a8..43358c3 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -27,6 +27,7 @@ target_sources(csv_test test_data_type.cpp test_raw_csv_data.cpp test_round_trip.cpp + test_row_start_position.cpp ) target_link_libraries(csv_test csv) target_link_libraries(csv_test Catch2::Catch2WithMain) diff --git a/tests/test_read_csv.cpp b/tests/test_read_csv.cpp index 6c0db12..851d50e 100644 --- a/tests/test_read_csv.cpp +++ b/tests/test_read_csv.cpp @@ -190,23 +190,20 @@ TEST_CASE("Cursed Newlines", "[read_csv_cursed_newline]") { "123,234,345\r\n" "1,2,3\r\n" "4,5,6", - // Unix style "A,B,C\n" // Header row "123,234,345\n" "1,2,3\n" "4,5,6", - // Eww brother what is that... - "A,B,C\r\r\n" // Header row - "123,234,345\r\r\n" - "1,2,3\r\r\n" + "A,B,C\r\n" // Header row + "123,234,345\r\n" + "1,2,3\r\n" "4,5,6", - // Doubled-up Windows style (ridiculous: but I'm sure it exists somewhere) - "A,B,C\r\n\r\n" // Header row - "123,234,345\r\n\r\n" - "1,2,3\r\n\r\n" + "A,B,C\r\n" // Header row + "123,234,345\r\n" + "1,2,3\r\n" "4,5,6" ) ); diff --git a/tests/test_read_csv_file.cpp b/tests/test_read_csv_file.cpp index 73a5ac0..4ba6820 100644 --- a/tests/test_read_csv_file.cpp +++ b/tests/test_read_csv_file.cpp @@ -57,7 +57,8 @@ TEST_CASE("get_file_info() Test", "[test_file_info]") { "./tests/data/real_data/2009PowerStatus.txt"); REQUIRE(info.delim == '|'); - REQUIRE(info.n_rows == 37960); // Can confirm with Excel + // REQUIRE(info.n_rows == 37960); // Can confirm with Excel + REQUIRE(info.n_rows == 37961); // not skip CRCRLF REQUIRE(info.n_cols == 3); REQUIRE(info.col_names == vector({ "ReportDt", "Unit", "Power" })); } diff --git a/tests/test_row_start_position.cpp b/tests/test_row_start_position.cpp new file mode 100644 index 0000000..c516268 --- /dev/null +++ b/tests/test_row_start_position.cpp @@ -0,0 +1,57 @@ +/** @file + * Tests for CSV parsing + */ + +#include // remove() +#include + +#include +#include +#include +#include + +#include "csv.hpp" + +using namespace csv; +using std::string; +using std::vector; + +// +// CSVRow::current_row_start() +// + +TEST_CASE("CSVRow::current_row_start", "[current_row_start]") { + CSVGuessResult guessed_format = guess_format("./tests/data/real_data/YEAR07_CBSA_NAC3.txt"); + REQUIRE(guessed_format.delim == ';'); + REQUIRE(guessed_format.header_row == 0); + + std::fstream fstream; + auto testfile = std::filesystem::path("./tests/data/real_data/YEAR07_CBSA_NAC3.txt"); + std::ifstream ifs(testfile.c_str()); + std::string content((std::istreambuf_iterator(ifs)), (std::istreambuf_iterator())); + + CSVFormat format; + format.delimiter(guessed_format.delim).header_row(guessed_format.header_row); + + { + // parse from file + CSVReader reader(testfile.c_str(), format); + uint64_t pos = 0; + for (CSVRow& row : reader) { + pos = content.find_first_of('\n', pos) + 1; + REQUIRE(row.current_row_start() == pos); + } + } + + { + // parse from stream + auto stream = std::stringstream(content); + auto reader = CSVReader(stream, format); + + uint64_t pos = 0; + for (CSVRow& row : reader) { + pos = content.find_first_of('\n', pos) + 1; + REQUIRE(row.current_row_start() == pos); + } + } +}