Skip to content

Commit 916dd43

Browse files
committed
Manually merged branch JessFlan:master
added a cache to store compiled regex and stopped storing success mes… Closes #135
2 parents b7ed078 + a4bbe81 commit 916dd43

File tree

5 files changed

+53
-14
lines changed

5 files changed

+53
-14
lines changed

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/AllErrorsMetaDataValidator.scala

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,15 @@ trait AllErrorsMetaDataValidator extends MetaDataValidator {
2828
} else {
2929
val row = rows.next()
3030
val result = validateRow(row, schema, Some(rows.hasNext))
31-
validateRows(result :: results)
31+
/*
32+
Only store the results if they contain a warning or a failure. This means the validator is not limited by the
33+
available memory when processing large files.
34+
*/
35+
if (containsErrors(result) || containsWarnings(result)) {
36+
validateRows(result :: results)
37+
} else {
38+
validateRows(results)
39+
}
3240
}
3341
}
3442

@@ -61,4 +69,4 @@ trait AllErrorsMetaDataValidator extends MetaDataValidator {
6169
if(isWarningDirective) toWarnings(ruleResult, row.lineNumber, columnIndex) else toErrors(ruleResult, row.lineNumber, columnIndex)
6270
}}.sequence[MetaDataValidation, Any]
6371
}
64-
}
72+
}

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/FailFastMetaDataValidator.scala

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,22 @@ trait FailFastMetaDataValidator extends MetaDataValidator {
2626

2727
override def validateRows(rows: Iterator[Row], schema: Schema): MetaDataValidation[Any] = {
2828

29-
def containsErrors(e: MetaDataValidation[Any]): Boolean = e.fold(_.list.collectFirst(FailMessage.isError).nonEmpty, _ => false)
30-
3129
@tailrec
3230
def validateRows(results: List[MetaDataValidation[Any]] = List.empty[MetaDataValidation[Any]]) : List[MetaDataValidation[Any]] = {
3331
if(results.headOption.map(containsErrors(_)).getOrElse(false) || !rows.hasNext) {
3432
results.reverse
3533
} else {
3634
val row = rows.next()
3735
val result = validateRow(row, schema, Some(rows.hasNext))
38-
validateRows(result :: results)
36+
/*
37+
Only store the results if they contain a warning or a failure. This means the validator is not limited by the
38+
available memory when processing large files.
39+
*/
40+
if (containsErrors(result) || containsWarnings(result)) {
41+
validateRows(result :: results)
42+
} else {
43+
validateRows(results)
44+
}
3945
}
4046
}
4147

@@ -90,4 +96,4 @@ trait FailFastMetaDataValidator extends MetaDataValidator {
9096
else if(isWarningDirective) validateAllRulesForCell(columnDefinition.rules)
9197
else validateRulesForCell(columnDefinition.rules)
9298
}
93-
}
99+
}

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidator.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,15 @@ object FailMessage {
4646
}
4747
}
4848

49+
50+
4951
case class ProgressFor(rowsToValidate: Int, progress: ProgressCallback)
5052

5153
trait MetaDataValidator {
54+
// Helper functions for checking if a result contains a warning or error.
55+
// True when `e` is a failure whose message list holds at least one entry on which the
// FailMessage.isError partial function is defined; always false when `e` is a success.
def containsErrors(e: MetaDataValidation[Any]): Boolean = e.fold(_.list.collectFirst(FailMessage.isError).nonEmpty, _ => false)
56+
// True when `e` is a failure whose message list holds at least one entry on which the
// FailMessage.isWarning partial function is defined; always false when `e` is a success.
def containsWarnings(e: MetaDataValidation[Any]): Boolean = e.fold(_.list.collectFirst(FailMessage.isWarning).nonEmpty, _ => false)
57+
5258
type MetaDataValidation[S] = ValidationNel[FailMessage, S]
5359

5460
def validate(csv: JReader, schema: Schema, progress: Option[ProgressCallback]): MetaDataValidation[Any] = {

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Rule.scala

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import scala.util.Try
1616
import scala.util.parsing.input.Positional
1717
import scalaz._
1818
import scalaz.Scalaz._
19+
import java.util.regex.{Pattern, Matcher}
1920

2021
abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positional {
2122

@@ -75,7 +76,6 @@ abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positi
7576

7677
def explicitName: Option[String] = explicitColumn.map("$" + _.ref + "/")
7778

78-
7979
def ruleName: String = explicitName.getOrElse("") + name
8080

8181
def columnIdentifierToIndex(schema: Schema, id: ColumnIdentifier): Int = {
@@ -89,7 +89,6 @@ abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positi
8989
}
9090

9191

92-
9392
def toValueError(row: Row, columnIndex:Int ) =
9493
s"""value: ${'"'}${row.cells(columnIndex).value}${'"'}"""
9594

@@ -99,8 +98,30 @@ abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positi
9998

10099
}
101100

101+
/**
 * Process-wide store of precompiled regular expressions, keyed by their pattern string.
 *
 * The object is a singleton shared by every rule, so it may be reached from several
 * threads at once when validations run concurrently.
 *
 * @author Jess Flanagan
 */
object RegexCache {
  // Backed by a TrieMap: it is still a mutable.Map (so the type callers see is unchanged)
  // but, unlike the default hash map, it tolerates concurrent reads and writes without
  // corrupting its internal state.
  // NOTE(review): entries are never evicted; this assumes the set of distinct patterns
  // is small and bounded by the schema -- confirm before feeding it user-generated patterns.
  val cache: collection.mutable.Map[String, Pattern] =
    scala.collection.concurrent.TrieMap.empty[String, Pattern]

  /**
   * Returns the compiled form of a regular expression.
   *
   * The cache is consulted first; on a miss the pattern is compiled, stored and returned,
   * so a given pattern string does not need to be recompiled on every lookup.
   *
   * @param pattern A regex pattern string.
   * @return A compiled representation of the regular expression.
   * @throws java.util.regex.PatternSyntaxException if the pattern's syntax is invalid
   *         (nothing is cached in that case).
   * @author Jess Flanagan
   */
  def getCompiledRegex(pattern: String): Pattern =
    cache.getOrElseUpdate(pattern, Pattern.compile(pattern))
}
119+
102120
abstract class PatternRule(name: String, pattern: String) extends Rule(name) {
103-
override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = cellValue matches pattern
121+
// Uses the cache to retrieve a compiled regex representation for the pattern string.
122+
override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = {
123+
RegexCache.getCompiledRegex(pattern).matcher(cellValue).matches()
124+
}
104125
}
105126

106127
trait DateParser {
@@ -114,4 +135,4 @@ abstract class DateRule(name: String, dateRegex: String, dateParser: DateParser)
114135
case _ => false
115136
}
116137
}
117-
}
138+
}

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_0/Rule.scala

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import uk.gov.nationalarchives.csv.validator.Util.{FileSystem, TypedPath}
1717
import uk.gov.nationalarchives.csv.validator.api.CsvValidator._
1818
import uk.gov.nationalarchives.csv.validator.metadata.Row
1919
import uk.gov.nationalarchives.csv.validator.schema._
20-
20+
import java.util.regex.{Pattern, Matcher}
2121
import scala.annotation.tailrec
2222
import scala.collection.mutable
2323
import scala.util.Try
@@ -92,12 +92,10 @@ case class IfRule(condition: Rule, rules: List[Rule], elseRules: Option[List[Rul
9292
}
9393
}
9494

95-
9695
case class RegExpRule(regex: String) extends Rule("regex") {
9796
override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = {
98-
9997
val regexp = if (columnDefinition.directives.contains(IgnoreCase())) "(?i)" + regex else regex
100-
cellValue matches regexp
98+
RegexCache.getCompiledRegex(regexp).matcher(cellValue).matches()
10199
}
102100

103101
override def toError = {

0 commit comments

Comments
 (0)