Skip to content

Commit 94acc57

Browse files
committed
Ignore line breaks in cells when counting rows
1 parent 4b6a2cd commit 94acc57

File tree

4 files changed

+32
-5
lines changed

4 files changed

+32
-5
lines changed

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidator.scala

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@ package uk.gov.nationalarchives.csv.validator
1111

1212
import cats.data.{Chain, Validated, ValidatedNel}
1313
import cats.syntax.all._
14-
import com.univocity.parsers.csv.{CsvParser, CsvParserSettings}
14+
import com.univocity.parsers.csv.{CsvParser, CsvParserSettings, CsvRoutines}
1515
import org.apache.commons.io.input.BOMInputStream
1616
import uk.gov.nationalarchives.csv.validator.api.TextFile
1717
import uk.gov.nationalarchives.csv.validator.metadata.{Cell, Row}
@@ -286,10 +286,10 @@ trait MetaDataValidator {
286286

287287
protected def rulesForCell(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): MetaDataValidation[Any]
288288

289-
protected def countRows(textFile: TextFile, schema: Schema): Int = {
289+
protected def countRows(textFile: TextFile): Int = {
290290
withReader(textFile) {
291-
reader =>
292-
countRows(reader, schema)
291+
// getInputDimension is more efficient and ignores line breaks inside cells, but it closes the Reader after use; only use it when it is OK to discard the reader.
292+
reader => Try(new CsvRoutines().getInputDimension(reader).rowCount().toInt).getOrElse(-1)
293293
}
294294
}
295295

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/api/CsvValidator.scala

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,7 @@ trait CsvValidator extends SchemaParser {
9999

100100
val csvValidation = withReader(csvFile) {
101101
reader =>
102-
val totalRows = countRows(csvFile, csvSchema)
102+
val totalRows = countRows(csvFile)
103103
validateKnownRows(reader, csvSchema, progress.map(p => {ProgressFor(totalRows, p)} ), rowCallback)
104104
}
105105
encodingValidationNel.isValid && csvValidation
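metadataMultipleLineBreaksInCell.csv (new test resource; file name inferred from CsvValidatorSpec, directory not shown)

Lines changed: 9 additions & 0 deletions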
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,9 @@
1+
col1,"this
2+
cell
3+
has
4+
multiple
5+
line
6+
breaks
7+
in
8+
it"
9+
col1, col2

csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/api/CsvValidatorSpec.scala

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,24 @@ class CsvValidatorSpec extends Specification with TestResources {
6060
case Validated.Valid(_) => ok
6161
}
6262
}
63+
64+
val callback = new ProgressCallback {
65+
var processed = -1
66+
var total = -2
67+
override def update(complete: Percentage): Unit = ???
68+
69+
override def update(_total: Int, _processed: Int): Unit = {
70+
total = _total
71+
processed = _processed
72+
}
73+
}
74+
75+
"have a total (of rows) equal to the actual number of rows in the metadata file even if there are multiple line breaks in a cell" in {
76+
app.validate(TextFile(Paths.get(baseResourcePkgPath).resolve("metadataMultipleLineBreaksInCell.csv")), parse(baseResourcePkgPath + "/schema.csvs"), Some(callback)) must beLike {
77+
case Validated.Valid(_) => ok
78+
}
79+
callback.total must beEqualTo(2)
80+
}
6381
}
6482
}
6583

0 commit comments

Comments
 (0)