Skip to content

Commit fc501ea

Browse files
committed
Merge branch 'master' into DR2-2066_ignoreLineBreaksInCellsWhenCountingRows
# Conflicts: # csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidator.scala # csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/api/CsvValidator.scala
2 parents 77fc156 + c9da827 commit fc501ea

File tree

28 files changed

+867
-254
lines changed

28 files changed

+867
-254
lines changed

README.md

Lines changed: 47 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -41,35 +41,63 @@ If you wish to use the CSV Validator from your own Java project, we provide a na
4141
<dependency>
4242
<groupId>uk.gov.nationalarchives</groupId>
4343
<artifactId>csv-validator-java-api</artifactId>
44-
<version>1.3.0</version>
44+
<version>1.4.0</version>
4545
</dependency>
4646
```
4747

4848
The Javadoc can be found on Maven Central, or you can build it locally by executing `mvn javadoc:javadoc`.
4949

5050
Example Java code of using the CSV Validator through the Java API:
5151
```java
52-
Boolean failFast = false;
53-
List<Substitution> pathSubstitutions = new ArrayList<Substitution>();
54-
55-
List<FailMessage> messages = CsvValidator.validate(
56-
"/data/csv/data.csv",
57-
"/data/csv/data-schema.csvs",
58-
failFast,
59-
pathSubstitutions,
60-
true,
61-
false);
52+
Charset csvEncoding = Charset.forName("UTF-8"); // default is UTF-8
53+
boolean validateCsvEncoding = true;
54+
Charset csvSchemaEncoding = Charset.forName("UTF-8"); // default is UTF-8
55+
boolean failFast = true; // default is false
56+
List<Substitution> pathSubstitutions = new ArrayList<Substitution>(); // default is an empty ArrayList
57+
boolean enforceCaseSensitivePathChecks = true; // default is false
58+
boolean trace = false; // default is false
59+
ProgressCallback progress; // default is null
60+
boolean skipFileChecks = true; // default is false
61+
int maxCharsPerCell = 8096; // default is 4096
62+
63+
// add a substitution path
64+
pathSubstitutions.add(new Substitution("file://something", "/home/xxx"));
65+
66+
CsvValidator.ValidatorBuilder validateWithStringNames = new CsvValidator.ValidatorBuilder(
67+
"/home/dev/IdeaProjects/csv/csv-validator/csv-validator-core/data.csv",
68+
"/home/dev/IdeaProjects/csv/csv-validator/csv-validator-core/data-schema.csvs"
69+
)
70+
71+
// alternatively, you can pass in Readers for each file
72+
Reader csvReader = new Reader();
73+
Reader csvSchemaReader = new Reader();
74+
CsvValidator.ValidatorBuilder validateWithReaders = new CsvValidator.ValidatorBuilder(
75+
csvReader, csvSchemaReader
76+
)
77+
78+
List<FailMessage> messages = validateWithStringNames
79+
.usingCsvEncoding(csvEncoding, validateCsvEncoding) // validateCsvEncoding should only be `true` when csvEncoding is UTF-8; otherwise an exception is thrown
80+
.usingCsvSchemaEncoding(csvSchemaEncoding)
81+
.usingFailFast(failFast)
82+
.usingPathSubstitutions(pathSubstitutions)
83+
.usingEnforceCaseSensitivePathChecks(enforceCaseSensitivePathChecks)
84+
.usingTrace(trace)
85+
.usingProgress(progress)
86+
.usingSkipFileChecks(skipFileChecks)
87+
.usingMaxCharsPerCell(maxCharsPerCell)
88+
.runValidation();
6289

6390
if(messages.isEmpty()) {
64-
System.out.println("Completed validation OK");
91+
System.out.println("All worked OK");
6592
} else {
66-
for(FailMessage message : messages) {
67-
if(message instanceof WarningMessage) {
68-
System.out.println("[WARN] " + message.getMessage());
69-
} else {
70-
System.out.println("[ERROR] " + message.getMessage());
71-
}
72-
}
93+
for(FailMessage message : messages) {
94+
if(message instanceof WarningMessage) {
95+
System.out.println("Warning: " + message.getMessage());
96+
} else {
97+
System.out.println("Error: " + message.getMessage());
98+
}
99+
}
100+
}
73101
}
74102
```
75103

csv-validator-cmd/README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Designed and tested on both Windows and Linux/Unix/Mac platforms.
99
Basic Usage
1010
-----------
1111

12-
The following command will show the useage for the application:
12+
The following command will show the usage for the application:
1313

1414
```bash
1515
$ validate
@@ -32,6 +32,8 @@ Usage: validate [options] <csv-path> <csv-schema-path>
3232
The path to the CSV Schema file to use for validation
3333
--disable-utf8-validation
3434
Disable UTF-8 validation for CSV files
35+
--max-chars-per-cell
36+
Maximum number of chars allowed in a cell (is set to 4096 by default)
3537
--skip-file-checks
3638
Skip integrity, checksum and file existence checks
3739
--show-progress

csv-validator-cmd/src/main/scala/uk/gov/nationalarchives/csv/validator/cmd/CsvValidatorCmdApp.scala

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ object CsvValidatorCmdApp extends App {
5151
csvSchemaPath: Path = Paths.get("."),
5252
csvSchemaEncoding: Charset = CsvValidator.DEFAULT_ENCODING,
5353
disableUtf8Validation:Boolean = false,
54+
maxCharsPerCell: Int = 4096,
5455
progressCallback: Option[ProgressCallback] = None,
5556
skipFileChecks: Boolean = false)
5657

@@ -70,6 +71,7 @@ object CsvValidatorCmdApp extends App {
7071
opt[Charset]('x', "csv-encoding").optional().action { (x,c) => c.copy(csvEncoding = x) } text("Defines the charset encoding used in the CSV file")
7172
opt[Charset]('y', "csv-schema-encoding").optional().action { (x,c) => c.copy(csvSchemaEncoding = x) }.text("Defines the charset encoding used in the CSV Schema file")
7273
opt[Unit]("disable-utf8-validation").optional().action {(_, c) => c.copy(disableUtf8Validation = true)}.text("Disable UTF-8 validation for CSV files.")
74+
opt[Int]("max-chars-per-cell").optional().action {(x, c) => c.copy(maxCharsPerCell = x)}.text("Maximum number of chars allowed in a cell (is set to 4096 by default)")
7375
// --skip-file-checks must set the skipFileChecks flag on the config; it previously installed a progress callback (a copy of the show-progress action), so the flag had no effect.
opt[Unit]("skip-file-checks").optional().action {(_, c) => c.copy(skipFileChecks = true)}.text("Skip integrity, checksum and file existence checks")
7476
opt[Unit]("show-progress").optional().action {(_, c) => c.copy(progressCallback = Some(commandLineProgressCallback()))}.text("Show progress")
7577
arg[Path]("<csv-path>").validate { x => if(Files.exists(x) && Files.isReadable(x)) success else failure(s"Cannot access CSV file: ${x.toString}") }.action { (x,c) => c.copy(csvPath = x) }.text("The path to the CSV file to validate")
@@ -86,6 +88,7 @@ object CsvValidatorCmdApp extends App {
8688
config.substitutePaths,
8789
config.caseSensitivePaths,
8890
config.traceParser,
91+
config.maxCharsPerCell,
8992
config.progressCallback,
9093
config.skipFileChecks
9194
)
@@ -143,11 +146,11 @@ object CsvValidatorCmdApp extends App {
143146
case _ =>
144147
}
145148

146-
def getColumnFromCsv(csvFile: TextFile, csvSchemaFile: TextFile, columnName: String): List[String] = Try {
147-
val validator = createValidator(true, Nil, false, false, false)
149+
def getColumnFromCsv(csvFile: TextFile, csvSchemaFile: TextFile, columnName: String, maxCharsPerCell: Int): List[String] = Try {
150+
val validator = createValidator(true, Nil, false, false, false, maxCharsPerCell)
148151
val csv = validator.loadCsvFile(csvFile, csvSchemaFile)
149-
csv.headOption.map(_.indexOf("identifier")).map { identifierIdx =>
150-
csv.tail.map(arr => arr(identifierIdx))
152+
csv.headOption.map(_.indexOf(columnName)).map { identifierIdx =>
153+
csv.tail.map(row => row(identifierIdx))
151154
}.getOrElse(Nil)
152155
}.getOrElse(Nil)
153156

@@ -159,11 +162,12 @@ object CsvValidatorCmdApp extends App {
159162
pathSubstitutionsList: List[SubstitutePath],
160163
enforceCaseSensitivePathChecks: Boolean,
161164
trace: Boolean,
165+
maxCharsPerCell: Int,
162166
progress: Option[ProgressCallback],
163167
skipFileChecks: Boolean,
164168
onRow: ValidatedNel[FailMessage, Any] => Unit = rowCallback
165169
): ExitStatus = {
166-
val validator = createValidator(failFast, pathSubstitutionsList, enforceCaseSensitivePathChecks, trace, skipFileChecks)
170+
val validator = createValidator(failFast, pathSubstitutionsList, enforceCaseSensitivePathChecks, trace, skipFileChecks, maxCharsPerCell)
167171
validator.parseSchema(schemaFile) match {
168172
case Validated.Invalid(errors) => (prettyPrint(errors), SystemExitCodes.InvalidSchema)
169173
case Validated.Valid(schema) =>

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidator.scala

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ package uk.gov.nationalarchives.csv.validator
1111

1212
import cats.data.{Chain, Validated, ValidatedNel}
1313
import cats.syntax.all._
14+
import com.univocity.parsers.common.TextParsingException
1415
import com.univocity.parsers.csv.{CsvParser, CsvParserSettings, CsvRoutines}
1516
import org.apache.commons.io.input.BOMInputStream
1617
import uk.gov.nationalarchives.csv.validator.api.TextFile
@@ -23,7 +24,7 @@ import java.nio.charset.{Charset, StandardCharsets}
2324
import java.nio.file.{Files, Path}
2425
import scala.annotation.tailrec
2526
import scala.language.{postfixOps, reflectiveCalls}
26-
import scala.util.{Try, Using}
27+
import scala.util.{Failure, Success, Try, Using}
2728

2829
//error reporting classes
2930
sealed trait ErrorType
@@ -61,12 +62,14 @@ trait MetaDataValidator {
6162
def validate(
6263
csv: JReader,
6364
schema: Schema,
65+
maxCharsPerCell: Int = 4096,
6466
progress: Option[ProgressCallback]
6567
): MetaDataValidation[Any] = {
6668
var results: Chain[List[FailMessage]] = Chain.empty
6769
validateReader(
6870
csv,
6971
schema,
72+
maxCharsPerCell,
7073
progress,
7174
{
7275
case Validated.Invalid(x) => results = results :+ x.toList
@@ -82,6 +85,7 @@ trait MetaDataValidator {
8285
def validateReader(
8386
csv: JReader,
8487
schema: Schema,
88+
maxCharsPerCell: Int,
8589
progress: Option[ProgressCallback],
8690
rowCallback: MetaDataValidation[Any] => Unit
8791
): Boolean = {
@@ -101,10 +105,10 @@ trait MetaDataValidator {
101105
None
102106
}
103107

104-
validateKnownRows(csv, schema, pf, rowCallback)
108+
validateKnownRows(csv, schema, maxCharsPerCell, pf, rowCallback)
105109
}
106110

107-
def createCsvParser(schema: Schema): CsvParser = {
111+
def createCsvParser(schema: Schema, maxCharsPerCell: Int): CsvParser = {
108112
val separator: Char = schema.globalDirectives.collectFirst {
109113
case Separator(sep) =>
110114
sep
@@ -124,6 +128,7 @@ trait MetaDataValidator {
124128
settings.setIgnoreLeadingWhitespaces(false)
125129
settings.setIgnoreTrailingWhitespaces(false)
126130
settings.setLineSeparatorDetectionEnabled(true)
131+
settings.setMaxCharsPerColumn(maxCharsPerCell)
127132
// TODO(AR) should we be friendly and auto-detect line separator, or enforce RFC 1480?
128133
format.setQuoteEscape(CSV_RFC1480_QUOTE_ESCAPE_CHARACTER)
129134
//format.setLineSeparator(CSV_RFC1480_LINE_SEPARATOR) // CRLF
@@ -136,11 +141,12 @@ trait MetaDataValidator {
136141
def validateKnownRows(
137142
csv: JReader,
138143
schema: Schema,
144+
maxCharsPerCell: Int,
139145
progress: Option[ProgressFor],
140146
rowCallback: MetaDataValidation[Any] => Unit
141147
): Boolean = {
142148

143-
val parser = createCsvParser(schema)
149+
val parser = createCsvParser(schema, maxCharsPerCell)
144150

145151
val result : Try[Boolean] = Using {
146152
parser.beginParsing(csv)
@@ -153,7 +159,7 @@ trait MetaDataValidator {
153159
// if 'no header' is not set and 'permit empty' is not set but the file contains only one line - this is an error
154160

155161

156-
val rowIt = new RowIterator(reader, progress)
162+
val rowIt = new RowIterator(reader, progress, maxCharsPerCell)
157163

158164
val maybeNoData =
159165
if (schema.globalDirectives.contains(NoHeader())) {
@@ -345,24 +351,34 @@ trait ProgressCallback {
345351
def update(total: Int, processed: Int): Unit = update((processed.toFloat / total.toFloat) * 100)
346352
}
347353

348-
class RowIterator(parser: CsvParser, progress: Option[ProgressFor]) extends Iterator[Row] {
354+
class RowIterator(parser: CsvParser, progress: Option[ProgressFor], maxCharsPerCell: Int) extends Iterator[Row] {
349355

350356
private var index = 1
351-
private var current = toRow(Option(parser.parseNext()))
357+
private var current = toRow(Try(parser.parseNext()))
358+
private var potentialHeaderRow: Option[Row] = None
352359

353360
@throws(classOf[IOException])
354361
override def next(): Row = {
355362
val row = current match {
356-
case Some(row) =>
363+
case Success(row) =>
364+
if(index == 1 && potentialHeaderRow.isEmpty) potentialHeaderRow = Some(row) // this is here in case the old API is used that doesn't call 'skipHeader'
357365
row
358-
case None => {
359-
throw new IOException("End of file")
360-
}
366+
case Failure(ex: TextParsingException) if(ex.toString.contains("exceeds the maximum number of characters")) =>
367+
val cellLocationMsg =
368+
potentialHeaderRow match {
369+
case Some(headerRow) => s"in the cell located at line: ${ex.getLineIndex}, column: ${headerRow.cells(ex.getColumnIndex).value},"
370+
case None => s"in column ${ex.getColumnIndex + 1} of the header row"
371+
}
372+
373+
val customMessage =
374+
s"The number of characters $cellLocationMsg is larger than the maximum number of characters allowed in a cell ($maxCharsPerCell); increase this limit and re-run."
375+
throw new Exception(customMessage)
376+
case Failure(ex) => throw ex
361377
}
362378

363379
//move to the next
364380
this.index = index + 1
365-
this.current = toRow(Option(parser.parseNext()))
381+
this.current = toRow(Try(parser.parseNext()))
366382

367383
progress map {
368384
p =>
@@ -377,10 +393,15 @@ class RowIterator(parser: CsvParser, progress: Option[ProgressFor]) extends Iter
377393
@throws(classOf[IOException])
378394
def skipHeader(): Row = {
379395
this.index = index - 1
380-
next()
396+
val row = next()
397+
this.potentialHeaderRow = Some(row)
398+
row
381399
}
382400

383-
override def hasNext: Boolean = current.nonEmpty
401+
override def hasNext: Boolean = current match {
402+
case Failure(ex: NullPointerException) => false
403+
case _ => true
404+
}
384405

385-
private def toRow(rowData: Option[Array[String]]): Option[Row] = rowData.map(data => Row(data.toList.map(d => Cell(Option(d).getOrElse(""))), index))
406+
private def toRow(rowData: Try[Array[String]]): Try[Row] = rowData.map(data => Row(data.toList.map(d => Cell(Option(d).getOrElse(""))), index))
386407
}

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/api/CsvValidator.scala

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,11 @@ object CsvValidator {
3030
type PathTo = String
3131
type SubstitutePath = (PathFrom, PathTo)
3232

33-
def createValidator(failFast: Boolean, pathSubstitutionsList: List[SubstitutePath], enforceCaseSensitivePathChecksSwitch: Boolean, traceSwitch: Boolean, skipFileChecksSwitch: Boolean) = {
33+
def createValidator(failFast: Boolean, pathSubstitutionsList: List[SubstitutePath], enforceCaseSensitivePathChecksSwitch: Boolean, traceSwitch: Boolean, skipFileChecksSwitch: Boolean, maxCharsPerCellLimit: Int) = {
3434
if(failFast) {
35-
new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch}
35+
new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch; val maxCharsPerCell = maxCharsPerCellLimit}
3636
} else {
37-
new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch }
37+
new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch; val maxCharsPerCell = maxCharsPerCellLimit }
3838
}
3939
}
4040
}
@@ -81,8 +81,8 @@ trait CsvValidator extends SchemaParser {
8181
parseSchema(csvSchemaFile) match {
8282
case Validated.Valid(schema) =>
8383
withReader(csvFile) { reader =>
84-
createCsvParser(schema).parseAll(reader)
85-
}.asScala.toList
84+
createCsvParser(schema, this.maxCharsPerCell).parseAll(reader)
85+
}.asScala.toList
8686
case Validated.Invalid(_) => Nil
8787
}
8888
}
@@ -100,7 +100,7 @@ trait CsvValidator extends SchemaParser {
100100
val csvValidation = withReader(csvFile) {
101101
reader =>
102102
val totalRows = countRows(csvFile)
103-
validateKnownRows(reader, csvSchema, progress.map(p => {ProgressFor(totalRows, p)} ), rowCallback)
103+
validateKnownRows(reader, csvSchema, this.maxCharsPerCell, progress.map(p => {ProgressFor(totalRows, p)} ), rowCallback)
104104
}
105105
encodingValidationNel.isValid && csvValidation
106106
}

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/SchemaParser.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ with TraceableParsers {
5151

5252
val skipFileChecks: Boolean
5353

54+
val maxCharsPerCell: Int
55+
5456
lazy val versionHeader: PackratParser[String] = "VersionDecl" ::= ("version" ~> versionLiteral )
5557

5658

@@ -147,6 +149,7 @@ with TraceableParsers {
147149
val ps = pathSubstitutions
148150
val t = trace
149151
val sfc = skipFileChecks
152+
val mcpc = maxCharsPerCell
150153

151154
SchemaValidator.versionValid(version).map(Failure(_, next)).getOrElse {
152155
version match {
@@ -155,6 +158,7 @@ with TraceableParsers {
155158
override val pathSubstitutions: List[(String, String)] = ps
156159
override val trace: Boolean = t
157160
override val skipFileChecks: Boolean = sfc
161+
override val maxCharsPerCell: Int = mcpc
158162
}
159163

160164
parser1_2.parseVersionAware(reader) match {
@@ -168,6 +172,7 @@ with TraceableParsers {
168172
override val pathSubstitutions: List[(String, String)] = ps
169173
override val trace: Boolean = t
170174
override val skipFileChecks: Boolean = sfc
175+
override val maxCharsPerCell: Int = mcpc
171176
}
172177

173178
parser1_1.parseVersionAware(reader) match {
@@ -181,6 +186,7 @@ with TraceableParsers {
181186
override val pathSubstitutions: List[(String, String)] = ps
182187
override val trace: Boolean = t
183188
override val skipFileChecks: Boolean = sfc
189+
override val maxCharsPerCell: Int = mcpc
184190
}
185191

186192
parser1_0.parseVersionAware(reader) match {
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
col1,col2
2+
row1Col1,row1Col2LongCellLength

0 commit comments

Comments
 (0)