
Commit 73cf38b

perf: fix record copying performance bug
If there happens to be an abnormally long record in a CSV file (where the rest are short), this abnormally long record ends up causing a performance loss while parsing subsequent records. Such a thing is usually caused by a buffer being expanded, and then that expanded buffer leading to extra cost that shouldn't be paid when parsing smaller records. Indeed, this case is no exception.

In this case, the standard record iterators use an internal record for copying CSV data into, and then clone this record as appropriate in the iterator's `next` method. In this way, that record's memory can be reused. This is a bit better than just allocating a fresh buffer every time, since generally speaking, the length of each CSV row is usually pretty similar to the length of prior rows.

However, when we come across an exceptionally long record, the internal record is expanded to handle that record. When that internal record is cloned to give back to the caller, the record *and* its excess capacity are also cloned. In the case of an abnormally long record, this ends up copying that extra excess capacity for all subsequent rows, which easily explains the performance bug.

So to fix it, we introduce a new private method that lets us copy a record *without* excess capacity. (We could implement `Clone` more intelligently, but I'm not sure whether it's appropriate to drop excess capacity in a `Clone` impl. That might be unexpected.) We then use this new method in the iterators instead of the standard `clone`. In the case where there are no abnormally long records, this shouldn't have any impact.

Fixes #227
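To make the failure mode concrete, here is a minimal, self-contained sketch of the buffer-reuse pattern the message describes. It is not code from this repository; `ReusedRecord`, `fields`, and `end` are stand-in names for the internal record, its byte buffer, and its used length.

struct ReusedRecord {
    // Raw bytes of the current row; the buffer is reused across rows.
    fields: Vec<u8>,
    // How much of `fields` the current row actually uses.
    end: usize,
}

impl ReusedRecord {
    // What the iterators effectively did before: copy the whole buffer,
    // including bytes past `end` left behind by an earlier, longer row.
    fn clone_full(&self) -> ReusedRecord {
        ReusedRecord { fields: self.fields.clone(), end: self.end }
    }

    // What the fix does conceptually: copy only the bytes this row uses.
    fn clone_truncated(&self) -> ReusedRecord {
        ReusedRecord { fields: self.fields[..self.end].to_vec(), end: self.end }
    }
}

fn main() {
    // One abnormally long row grows the reused buffer to 1 MiB...
    let mut rec = ReusedRecord { fields: vec![b'x'; 1 << 20], end: 1 << 20 };
    // ...and a later short row uses only the first 8 bytes of it.
    rec.end = 8;
    assert_eq!(rec.clone_full().fields.len(), 1 << 20); // still copies ~1 MiB
    assert_eq!(rec.clone_truncated().fields.len(), 8);  // copies 8 bytes
}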
1 parent 6623d87 commit 73cf38b

File tree

3 files changed: +24 / -4 lines


src/byte_record.rs

Lines changed: 12 additions & 0 deletions
@@ -497,6 +497,18 @@ impl ByteRecord {
         &self.0.fields[..self.0.bounds.end()]
     }
 
+    /// Clone this record, but only copy `fields` up to the end of bounds. This
+    /// is useful when one wants to copy a record, but not necessarily any
+    /// excess capacity in that record.
+    #[inline]
+    pub(crate) fn clone_truncated(&self) -> ByteRecord {
+        let mut br = ByteRecord::new();
+        br.0.pos = self.0.pos.clone();
+        br.0.bounds = self.0.bounds.clone();
+        br.0.fields = self.0.fields[..self.0.bounds.end()].to_vec();
+        br
+    }
+
     /// Retrieve the underlying parts of a byte record.
     #[inline]
     pub(crate) fn as_parts(&mut self) -> (&mut Vec<u8>, &mut Vec<usize>) {
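Since `clone_truncated` is `pub(crate)`, it can only be exercised from inside the crate. A minimal sketch of such a check (a hypothetical test, not part of this commit) would be:

#[test]
fn clone_truncated_matches_clone_on_fields() {
    let mut rec = ByteRecord::new();
    rec.push_field(b"foo");
    rec.push_field(b"quux");

    // Both copies must expose the same fields; the truncated copy just
    // drops any trailing bytes the internal buffer happens to hold.
    assert_eq!(rec.clone(), rec.clone_truncated());
    assert_eq!(rec.clone_truncated().get(1), Some(&b"quux"[..]));
}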

src/reader.rs

Lines changed: 4 additions & 4 deletions
@@ -2050,7 +2050,7 @@ impl<R: io::Read> Iterator for StringRecordsIntoIter<R> {
     fn next(&mut self) -> Option<Result<StringRecord>> {
         match self.rdr.read_record(&mut self.rec) {
             Err(err) => Some(Err(err)),
-            Ok(true) => Some(Ok(self.rec.clone())),
+            Ok(true) => Some(Ok(self.rec.clone_truncated())),
             Ok(false) => None,
         }
     }

@@ -2087,7 +2087,7 @@ impl<'r, R: io::Read> Iterator for StringRecordsIter<'r, R> {
     fn next(&mut self) -> Option<Result<StringRecord>> {
         match self.rdr.read_record(&mut self.rec) {
             Err(err) => Some(Err(err)),
-            Ok(true) => Some(Ok(self.rec.clone())),
+            Ok(true) => Some(Ok(self.rec.clone_truncated())),
             Ok(false) => None,
         }
     }

@@ -2126,7 +2126,7 @@ impl<R: io::Read> Iterator for ByteRecordsIntoIter<R> {
     fn next(&mut self) -> Option<Result<ByteRecord>> {
         match self.rdr.read_byte_record(&mut self.rec) {
             Err(err) => Some(Err(err)),
-            Ok(true) => Some(Ok(self.rec.clone())),
+            Ok(true) => Some(Ok(self.rec.clone_truncated())),
            Ok(false) => None,
         }
     }

@@ -2163,7 +2163,7 @@ impl<'r, R: io::Read> Iterator for ByteRecordsIter<'r, R> {
     fn next(&mut self) -> Option<Result<ByteRecord>> {
         match self.rdr.read_byte_record(&mut self.rec) {
             Err(err) => Some(Err(err)),
-            Ok(true) => Some(Ok(self.rec.clone())),
+            Ok(true) => Some(Ok(self.rec.clone_truncated())),
             Ok(false) => None,
         }
     }
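For reference, these `next` implementations sit behind the ordinary record iterators, so the change is transparent to callers. An illustrative use (assuming the `csv` crate as a dependency) looks like this; each yielded record is now copied without the excess buffer left by any earlier long row:

use std::error::Error;

fn count_fields(csv_data: &str) -> Result<usize, Box<dyn Error>> {
    let mut rdr = csv::Reader::from_reader(csv_data.as_bytes());
    let mut total = 0;
    // `records()` drives StringRecordsIter::next, which now hands back
    // records copied via `clone_truncated` instead of `clone`.
    for result in rdr.records() {
        total += result?.len();
    }
    Ok(total)
}

fn main() -> Result<(), Box<dyn Error>> {
    let data = "name,comment\nalice,short\nbob,short too\n";
    println!("{} fields", count_fields(data)?);
    Ok(())
}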

src/string_record.rs

Lines changed: 8 additions & 0 deletions
@@ -610,6 +610,14 @@ impl StringRecord {
         self.0
     }
 
+    /// Clone this record, but only copy `fields` up to the end of bounds. This
+    /// is useful when one wants to copy a record, but not necessarily any
+    /// excess capacity in that record.
+    #[inline]
+    pub(crate) fn clone_truncated(&self) -> StringRecord {
+        StringRecord(self.0.clone_truncated())
+    }
+
     /// A safe function for reading CSV data into a `StringRecord`.
     ///
     /// This relies on the internal representation of `StringRecord`.

0 commit comments
