Skip to content

Commit 0e1fff5

Browse files
authored
Merge pull request #353 from digital-preservation/bug-130-out-of-memory-too-many-errors
Bug 130 out of memory too many errors
2 parents 3b82107 + eabb93d commit 0e1fff5

File tree

7 files changed

+231
-125
lines changed

7 files changed

+231
-125
lines changed

csv-validator-cmd/src/main/scala/uk/gov/nationalarchives/csv/validator/cmd/CsvValidatorCmdApp.scala

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,18 @@
99
package uk.gov.nationalarchives.csv.validator.cmd
1010

1111

12-
import java.text.DecimalFormat
12+
import cats.data.{NonEmptyList, Validated, ValidatedNel}
1313
import scopt.Read
1414
import uk.gov.nationalarchives.csv.validator._
15-
import uk.gov.nationalarchives.csv.validator.api.{CsvValidator, TextFile}
1615
import uk.gov.nationalarchives.csv.validator.api.CsvValidator.{SubstitutePath, createValidator}
16+
import uk.gov.nationalarchives.csv.validator.api.{CsvValidator, TextFile}
1717

1818
import java.net.URL
1919
import java.nio.charset.Charset
2020
import java.nio.file.{Files, Path, Paths}
21+
import java.text.DecimalFormat
2122
import java.util.jar.{Attributes, Manifest}
2223
import scala.util.Using
23-
import cats.data.{Validated, NonEmptyList}
2424

2525
object SystemExitCodes extends Enumeration {
2626
type ExitCode = Int
@@ -134,23 +134,33 @@ object CsvValidatorCmdApp extends App {
134134
}
135135
}
136136

137-
def validate(csvFile: TextFile, schemaFile: TextFile, failFast: Boolean, pathSubstitutionsList: List[SubstitutePath], enforceCaseSensitivePathChecks: Boolean, trace: Boolean, progress: Option[ProgressCallback]): ExitStatus = {
137+
def rowCallback(row: ValidatedNel[FailMessage, Any]): Unit = row match {
138+
case Validated.Invalid(failures) =>
139+
println(prettyPrint(failures))
140+
case _ =>
141+
}
142+
143+
def validate(
144+
csvFile: TextFile,
145+
schemaFile: TextFile,
146+
failFast: Boolean,
147+
pathSubstitutionsList: List[SubstitutePath],
148+
enforceCaseSensitivePathChecks: Boolean,
149+
trace: Boolean,
150+
progress: Option[ProgressCallback],
151+
onRow: ValidatedNel[FailMessage, Any] => Unit = rowCallback
152+
): ExitStatus = {
138153
val validator = createValidator(failFast, pathSubstitutionsList, enforceCaseSensitivePathChecks, trace)
139154
validator.parseSchema(schemaFile) match {
140155
case Validated.Invalid(errors) => (prettyPrint(errors), SystemExitCodes.InvalidSchema)
141156
case Validated.Valid(schema) =>
142-
validator.validate(csvFile, schema, progress) match {
143-
case Validated.Invalid(failures) =>
144-
val failuresMsg = prettyPrint(failures)
145-
if(containsError(failures)) //checks for just warnings to determine exit code
146-
(failuresMsg + EOL + "FAIL",
147-
SystemExitCodes.InvalidCsv)
148-
else
149-
(failuresMsg + EOL + "PASS", //just warnings!
150-
SystemExitCodes.ValidCsv)
151-
152-
case Validated.Valid(_) => ("PASS", SystemExitCodes.ValidCsv)
153-
}
157+
val pass = validator.validateCsvFile(
158+
csvFile,
159+
schema,
160+
progress,
161+
onRow
162+
)
163+
if (pass) ("PASS", SystemExitCodes.ValidCsv) else ("FAIL", SystemExitCodes.InvalidCsv)
154164
}
155165
}
156166

@@ -161,7 +171,7 @@ object CsvValidatorCmdApp extends App {
161171
}).nonEmpty
162172
}
163173

164-
private def prettyPrint(l: NonEmptyList[FailMessage]): String = l.map { i =>
174+
def prettyPrint(l: NonEmptyList[FailMessage]): String = l.map { i =>
165175
i match {
166176
case FailMessage(ValidationWarning, err,_,_) => "Warning: " + err
167177
case FailMessage(ValidationError, err,_,_) => "Error: " + err

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/AllErrorsMetaDataValidator.scala

Lines changed: 14 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,40 +8,33 @@
88
*/
99
package uk.gov.nationalarchives.csv.validator
1010

11-
import uk.gov.nationalarchives.csv.validator.schema.Optional
12-
import uk.gov.nationalarchives.csv.validator.schema.Rule
13-
import uk.gov.nationalarchives.csv.validator.schema.Schema
14-
import uk.gov.nationalarchives.csv.validator.schema.Warning
15-
import uk.gov.nationalarchives.csv.validator.metadata.Cell
16-
import uk.gov.nationalarchives.csv.validator.metadata.Row
17-
import scala.annotation.tailrec
1811
import cats.syntax.all._
12+
import uk.gov.nationalarchives.csv.validator.metadata.{Cell, Row}
13+
import uk.gov.nationalarchives.csv.validator.schema.{Optional, Rule, Schema, Warning}
14+
15+
import scala.annotation.tailrec
1916

2017
trait AllErrorsMetaDataValidator extends MetaDataValidator {
2118

22-
override def validateRows(rows: Iterator[Row], schema: Schema): MetaDataValidation[Any] = {
19+
override def validateRows(
20+
rows: Iterator[Row],
21+
schema: Schema,
22+
rowCallback: MetaDataValidation[Any] => Unit
23+
): Boolean = {
2324

2425
@tailrec
25-
def validateRows(results: List[MetaDataValidation[Any]] = List.empty[MetaDataValidation[Any]]) : List[MetaDataValidation[Any]] = {
26+
def inner(passing: Boolean) : Boolean = {
2627
if(!rows.hasNext) {
27-
results.reverse
28+
passing
2829
} else {
2930
val row = rows.next()
3031
val result = validateRow(row, schema, Some(rows.hasNext))
31-
/*
32-
Only store the results if they contain a warning or a failure. This means the validator is not limited by the
33-
available memory when processing large files.
34-
*/
35-
if (containsErrors(result) || containsWarnings(result)) {
36-
validateRows(result :: results)
37-
} else {
38-
validateRows(results)
39-
}
32+
rowCallback(result)
33+
inner(passing && !containsErrors(result))
4034
}
4135
}
4236

43-
val v = validateRows()
44-
v.sequence[MetaDataValidation, Any]
37+
inner(true)
4538
}
4639

4740

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/FailFastMetaDataValidator.scala

Lines changed: 18 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,44 +8,39 @@
88
*/
99
package uk.gov.nationalarchives.csv.validator
1010

11-
import annotation.tailrec
12-
import uk.gov.nationalarchives.csv.validator.schema.ColumnDefinition
13-
import uk.gov.nationalarchives.csv.validator.schema.Optional
14-
import uk.gov.nationalarchives.csv.validator.schema.Rule
15-
import uk.gov.nationalarchives.csv.validator.schema.Schema
16-
import uk.gov.nationalarchives.csv.validator.schema.Warning
17-
import uk.gov.nationalarchives.csv.validator.metadata.Cell
18-
import uk.gov.nationalarchives.csv.validator.metadata.Row
1911
import cats.data.Validated.{Invalid => Failure}
2012
import cats.syntax.all._
13+
import uk.gov.nationalarchives.csv.validator.metadata.{Cell, Row}
14+
import uk.gov.nationalarchives.csv.validator.schema._
15+
16+
import scala.annotation.tailrec
2117

2218
trait FailFastMetaDataValidator extends MetaDataValidator {
2319

2420
//TODO(AR) work on removing use of `Any`
2521

26-
override def validateRows(rows: Iterator[Row], schema: Schema): MetaDataValidation[Any] = {
22+
override def validateRows(
23+
rows: Iterator[Row],
24+
schema: Schema,
25+
rowCallback: MetaDataValidation[Any] => Unit
26+
): Boolean = {
2727

2828
@tailrec
29-
def validateRows(results: List[MetaDataValidation[Any]] = List.empty[MetaDataValidation[Any]]) : List[MetaDataValidation[Any]] = {
30-
if(results.headOption.map(containsErrors(_)).getOrElse(false) || !rows.hasNext) {
31-
results.reverse
29+
def inner(passing: Boolean) : Boolean = {
30+
if(!rows.hasNext) {
31+
passing
3232
} else {
3333
val row = rows.next()
3434
val result = validateRow(row, schema, Some(rows.hasNext))
35-
/*
36-
Only store the results if they contain a warning or a failure. This means the validator is not limited by the
37-
available memory when processing large files.
38-
*/
39-
if (containsErrors(result) || containsWarnings(result)) {
40-
validateRows(result :: results)
41-
} else {
42-
validateRows(results)
43-
}
35+
rowCallback(result)
36+
if(!containsErrors(result))
37+
inner(passing)
38+
else
39+
false
4440
}
4541
}
4642

47-
val v = validateRows()
48-
v.sequence[MetaDataValidation, Any]
43+
inner(true)
4944
}
5045

5146
override protected def rules(row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): MetaDataValidation[List[Any]] = {

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidator.scala

Lines changed: 59 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,26 +9,21 @@
99
package uk.gov.nationalarchives.csv.validator
1010

1111

12-
import uk.gov.nationalarchives.utf8.validator.{Utf8Validator, ValidationHandler}
13-
14-
import scala.language.{postfixOps, reflectiveCalls}
15-
import scala.util.{Try, Using}
16-
17-
import java.io.{BufferedInputStream, IOException, FileInputStream => JFileInputStream, InputStreamReader => JInputStreamReader, LineNumberReader => JLineNumberReader, Reader => JReader}
18-
import java.nio.charset.{Charset, StandardCharsets}
19-
import uk.gov.nationalarchives.csv.validator.schema._
20-
import uk.gov.nationalarchives.csv.validator.metadata.Cell
21-
import org.apache.commons.io.input.BOMInputStream
22-
import com.univocity.parsers.common.TextParsingException
12+
import cats.data.{Chain, Validated, ValidatedNel}
13+
import cats.syntax.all._
2314
import com.univocity.parsers.csv.{CsvParser, CsvParserSettings}
24-
import uk.gov.nationalarchives.csv.validator.metadata.Row
25-
26-
import scala.annotation.tailrec
15+
import org.apache.commons.io.input.BOMInputStream
2716
import uk.gov.nationalarchives.csv.validator.api.TextFile
17+
import uk.gov.nationalarchives.csv.validator.metadata.{Cell, Row}
18+
import uk.gov.nationalarchives.csv.validator.schema._
19+
import uk.gov.nationalarchives.utf8.validator.{Utf8Validator, ValidationHandler}
2820

21+
import java.io.{BufferedInputStream, IOException, InputStreamReader => JInputStreamReader, LineNumberReader => JLineNumberReader, Reader => JReader}
22+
import java.nio.charset.{Charset, StandardCharsets}
2923
import java.nio.file.{Files, Path}
30-
import cats.data.ValidatedNel
31-
import cats.syntax.all._
24+
import scala.annotation.tailrec
25+
import scala.language.{postfixOps, reflectiveCalls}
26+
import scala.util.{Try, Using}
3227

3328
//error reporting classes
3429
sealed trait ErrorType
@@ -57,11 +52,39 @@ case class ProgressFor(rowsToValidate: Int, progress: ProgressCallback)
5752
trait MetaDataValidator {
5853
// Helper functions for checking if a result contains a warning or error.
5954
def containsErrors(e: MetaDataValidation[Any]): Boolean = e.fold(_.collectFirst(FailMessage.isError).nonEmpty, _ => false)
55+
6056
def containsWarnings(e: MetaDataValidation[Any]): Boolean = e.fold(_.collectFirst(FailMessage.isWarning).nonEmpty, _ => false)
6157

6258
type MetaDataValidation[S] = ValidatedNel[FailMessage, S]
6359

64-
def validate(csv: JReader, schema: Schema, progress: Option[ProgressCallback]): MetaDataValidation[Any] = {
60+
@deprecated("use validateReader or validateCsvFile")
61+
def validate(
62+
csv: JReader,
63+
schema: Schema,
64+
progress: Option[ProgressCallback]
65+
): MetaDataValidation[Any] = {
66+
var results: Chain[List[FailMessage]] = Chain.empty
67+
validateReader(
68+
csv,
69+
schema,
70+
progress,
71+
{
72+
case Validated.Invalid(x) => results = results :+ x.toList
73+
case _ =>
74+
}
75+
)
76+
results.toList.flatten.toNel match {
77+
case None => ().valid
78+
case Some(errors) => Validated.invalid(errors)
79+
}
80+
}
81+
82+
def validateReader(
83+
csv: JReader,
84+
schema: Schema,
85+
progress: Option[ProgressCallback],
86+
rowCallback: MetaDataValidation[Any] => Unit
87+
): Boolean = {
6588
//try to find the number of rows for the
6689
//purposes of reporting progress
6790
//can only do that if we can reset()
@@ -78,10 +101,15 @@ trait MetaDataValidator {
78101
None
79102
}
80103

81-
validateKnownRows(csv, schema, pf)
104+
validateKnownRows(csv, schema, pf, rowCallback)
82105
}
83106

84-
def validateKnownRows(csv: JReader, schema: Schema, progress: Option[ProgressFor]): MetaDataValidation[Any] = {
107+
def validateKnownRows(
108+
csv: JReader,
109+
schema: Schema,
110+
progress: Option[ProgressFor],
111+
rowCallback: MetaDataValidation[Any] => Unit
112+
): Boolean = {
85113

86114
val separator: Char = schema.globalDirectives.collectFirst {
87115
case Separator(sep) =>
@@ -107,7 +135,7 @@ trait MetaDataValidator {
107135
//format.setLineSeparator(CSV_RFC1480_LINE_SEPARATOR) // CRLF
108136

109137
//we need a better CSV Reader!
110-
val result : Try[MetaDataValidation[Any]] = Using {
138+
val result : Try[Boolean] = Using {
111139
val parser = new CsvParser(settings)
112140
parser.beginParsing(csv)
113141
parser
@@ -146,20 +174,21 @@ trait MetaDataValidator {
146174

147175
maybeNoData match {
148176
case Some(noData) =>
149-
noData
177+
rowCallback(noData)
178+
false
150179
case None =>
151-
validateRows(rowIt, schema)
180+
validateRows(rowIt, schema, rowCallback)
152181
}
153182

154183
} (_.stopParsing());
155184

156185
result match {
157186
case util.Success(metadataValidation) =>
158187
metadataValidation
159-
160188
case util.Failure(ts) =>
161189
//TODO(AR) emit all errors not just first!
162-
FailMessage(ValidationError, ts.toString).invalidNel[Any]
190+
rowCallback(FailMessage(ValidationError, ts.toString).invalidNel[Any])
191+
false
163192
// ts.toList.map(t => FailMessage(ValidationError, t.toString).failureNel[Any]).sequence[MetaDataValidation, Any]
164193
}
165194
}
@@ -177,9 +206,11 @@ trait MetaDataValidator {
177206

178207
def filename(row: Row,titleIndex: Int): String = row.cells(titleIndex).value
179208

180-
181-
def validateRows(rows: Iterator[Row], schema: Schema): MetaDataValidation[Any]
182-
209+
def validateRows(
210+
rows: Iterator[Row],
211+
schema: Schema,
212+
rowCallback: MetaDataValidation[Any] => Unit
213+
): Boolean
183214

184215
def validateHeader(header: Row, schema: Schema): Option[MetaDataValidation[Any]] = {
185216
val icnc: Option[IgnoreColumnNameCase] = schema.globalDirectives.collectFirst {case i @ IgnoreColumnNameCase() => i }
@@ -200,7 +231,7 @@ trait MetaDataValidator {
200231
def validateRow(row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): MetaDataValidation[Any] = {
201232
val totalColumnsV = totalColumns(row, schema)
202233
val rulesV = rules(row, schema, mayBeLast)
203-
(totalColumnsV,rulesV).mapN { _ :: _ }
234+
(totalColumnsV, rulesV).mapN { _ :: _ }
204235
}
205236

206237
def validateUtf8Encoding(file: Path): MetaDataValidation[Any] = {

0 commit comments

Comments
 (0)