Skip to content

Commit bac71c1

Browse files
JessFlanRadek Hubner
authored andcommitted
added a cache to store compiled regex and stopped storing success messages to reduce memory usage
1 parent fd784d3 commit bac71c1

File tree

5 files changed

+68
-11
lines changed

5 files changed

+68
-11
lines changed

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/AllErrorsMetaDataValidator.scala

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,15 @@ trait AllErrorsMetaDataValidator extends MetaDataValidator {
2828
} else {
2929
val row = rows.next()
3030
val result = validateRow(row, schema, Some(rows.hasNext))
31-
validateRows(result :: results)
31+
/*
32+
** Only store the results if they contain a warning or a failure. This means the validator is not limited by the available memory
33+
** when processing large files.
34+
*/
35+
if (containsErrors(result) || containsWarnings(result) ) {
36+
validateRows(result :: results)
37+
} else {
38+
validateRows(results)
39+
}
3240
}
3341
}
3442

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/FailFastMetaDataValidator.scala

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,22 @@ trait FailFastMetaDataValidator extends MetaDataValidator {
2626

2727
override def validateRows(rows: Iterator[Row], schema: Schema): MetaDataValidation[Any] = {
2828

29-
def containsErrors(e: MetaDataValidation[Any]): Boolean = e.fold(_.list.collectFirst(FailMessage.isError).nonEmpty, _ => false)
30-
3129
@tailrec
3230
def validateRows(results: List[MetaDataValidation[Any]] = List.empty[MetaDataValidation[Any]]) : List[MetaDataValidation[Any]] = {
3331
if(results.headOption.map(containsErrors(_)).getOrElse(false) || !rows.hasNext) {
3432
results.reverse
3533
} else {
3634
val row = rows.next()
3735
val result = validateRow(row, schema, Some(rows.hasNext))
38-
validateRows(result :: results)
36+
/*
37+
** Only store the results if they contain a warning or a failure. This means the validator is not limited by the available memory
38+
** when processing large files.
39+
*/
40+
if (containsErrors(result) || containsWarnings(result) ) {
41+
validateRows(result :: results)
42+
} else {
43+
validateRows(results)
44+
}
3945
}
4046
}
4147

@@ -90,4 +96,4 @@ trait FailFastMetaDataValidator extends MetaDataValidator {
9096
else if(isWarningDirective) validateAllRulesForCell(columnDefinition.rules)
9197
else validateRulesForCell(columnDefinition.rules)
9298
}
93-
}
99+
}

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidator.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,15 @@ object FailMessage {
4646
}
4747
}
4848

49+
50+
4951
case class ProgressFor(rowsToValidate: Int, progress: ProgressCallback)
5052

5153
trait MetaDataValidator {
54+
// Helper functions for checking if a result contains a warning or error.
55+
def containsErrors(e: MetaDataValidation[Any]): Boolean = e.fold(_.list.collectFirst(FailMessage.isError).nonEmpty, _ => false)
56+
def containsWarnings(e: MetaDataValidation[Any]): Boolean = e.fold(_.list.collectFirst(FailMessage.isWarning).nonEmpty, _ => false)
57+
5258
type MetaDataValidation[S] = ValidationNel[FailMessage, S]
5359

5460
def validate(csv: JReader, schema: Schema, progress: Option[ProgressCallback]): MetaDataValidation[Any] = {

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Rule.scala

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import scala.util.Try
1616
import scala.util.parsing.input.Positional
1717
import scalaz._
1818
import scalaz.Scalaz._
19+
import java.util.regex.{Pattern, Matcher}
1920

2021
abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positional {
2122

@@ -75,7 +76,6 @@ abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positi
7576

7677
def explicitName: Option[String] = explicitColumn.map("$" + _.ref + "/")
7778

78-
7979
def ruleName: String = explicitName.getOrElse("") + name
8080

8181
def columnIdentifierToIndex(schema: Schema, id: ColumnIdentifier): Int = {
@@ -89,7 +89,6 @@ abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positi
8989
}
9090

9191

92-
9392
def toValueError(row: Row, columnIndex:Int ) =
9493
s"""value: ${'"'}${row.cells(columnIndex).value}${'"'}"""
9594

@@ -99,8 +98,36 @@ abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positi
9998

10099
}
101100

101+
/**
102+
* This object is a place to store the precompiled regexes
103+
* @author Jess Flanagan
104+
*/
105+
object RegexCache
106+
{
107+
val cache = collection.mutable.Map[String, Pattern]()
108+
109+
/**
110+
* This function returns compiled regexes.
111+
* First we check to see if it's already in the cache. Otherwise we compile it, add it to the cache and return the
112+
* compiled version. This results in a significant speed up for processing large files.
113+
* @param pattern A regex pattern string.
114+
* @return A compiled representation of a regular expression.
115+
* @author Jess Flanagan
116+
*/
117+
def getCompiledRegex(pattern: String): Pattern ={
118+
if (!cache.contains(pattern)){
119+
cache += (pattern -> Pattern.compile(pattern))
120+
}
121+
cache(pattern);
122+
}
123+
}
124+
102125
abstract class PatternRule(name: String, pattern: String) extends Rule(name) {
103-
override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = cellValue matches pattern
126+
// Uses the cache to retrieve a compiled regex representation for the pattern string.
127+
override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean ={
128+
RegexCache.getCompiledRegex(pattern).matcher(cellValue).matches()
129+
130+
}
104131
}
105132

106133
trait DateParser {
@@ -114,4 +141,4 @@ abstract class DateRule(name: String, dateRegex: String, dateParser: DateParser)
114141
case _ => false
115142
}
116143
}
117-
}
144+
}

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_0/Rule.scala

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import uk.gov.nationalarchives.csv.validator.Util.{FileSystem, TypedPath}
1717
import uk.gov.nationalarchives.csv.validator.api.CsvValidator._
1818
import uk.gov.nationalarchives.csv.validator.metadata.Row
1919
import uk.gov.nationalarchives.csv.validator.schema._
20-
20+
import java.util.regex.{Pattern, Matcher}
2121
import scala.annotation.tailrec
2222
import scala.collection.mutable
2323
import scala.util.Try
@@ -92,12 +92,22 @@ case class IfRule(condition: Rule, rules: List[Rule], elseRules: Option[List[Rul
9292
}
9393
}
9494

95+
abstract class PatternRule(name: String, pattern: String) extends Rule(name) {
96+
// Uses the cache to retrieve a compiled regex representation for the pattern string.
97+
override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean ={
98+
RegexCache.getCompiledRegex(pattern).matcher(cellValue).matches()
99+
100+
}
101+
102+
}
95103

96104
case class RegExpRule(regex: String) extends Rule("regex") {
97105
override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = {
98106

99107
val regexp = if (columnDefinition.directives.contains(IgnoreCase())) "(?i)" + regex else regex
100-
cellValue matches regexp
108+
109+
RegexCache.getCompiledRegex(regexp).matcher(cellValue).matches()
110+
101111
}
102112

103113
override def toError = {

0 commit comments

Comments
 (0)