Skip to content

Commit aa952fc

Browse files
author
Radek Hubner
committed
urlDecode -> uriDecode. Add compatibility test.
1 parent 8266aee commit aa952fc

File tree

12 files changed

+141
-14
lines changed

12 files changed

+141
-14
lines changed

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_2/Schema.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import uk.gov.nationalarchives.csv.validator.metadata.Row
1212
import uk.gov.nationalarchives.csv.validator.schema.{Schema, ArgProvider}
1313
import java.net.{URLDecoder => JURLDecoder}
1414

15-
case class UrlDecode(value: ArgProvider, charset: Option[ArgProvider]) extends ArgProvider {
15+
case class UriDecode(value: ArgProvider, charset: Option[ArgProvider]) extends ArgProvider {
1616

1717
val DefaultCharset = "UTF-8"
1818

@@ -24,5 +24,5 @@ case class UrlDecode(value: ArgProvider, charset: Option[ArgProvider]) extends A
2424

2525
})
2626

27-
override def toError: String = "urlDecode(" + value.toError + ")"
27+
override def toError: String = "uriDecode(" + value.toError + ")"
2828
}

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_2/SchemaParser.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ trait SchemaParser extends SchemaParser1_1 {
2121
s => Literal(Some(s))
2222
}
2323

24-
lazy val urlDecode: PackratParser[ArgProvider] = "UrlDecode" ::= "urlDecode(" ~> stringProvider ~ opt("," ~> stringProvider) <~ ")" ^^ {
25-
case value ~ charset => UrlDecode(value, charset)
24+
lazy val urlDecode: PackratParser[ArgProvider] = "UriDecode" ::= "uriDecode(" ~> stringProvider ~ opt("," ~> stringProvider) <~ ")" ^^ {
25+
case value ~ charset => UriDecode(value, charset)
2626
}
2727

2828
}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
version 1.2
22
@totalColumns 2 @noHeader
33
identifier:
4-
filename: in(urlDecode($identifier))
4+
filename: in(uriDecode($identifier))
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
version 1.2
22
@totalColumns 3 @noHeader
33
identifier:
4-
filename: in(urlDecode($identifier, $charset))
4+
filename: in(uriDecode($identifier, $charset))
55
charset:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
version 1.2
2+
@totalColumns 33
3+
/*---------------------------------------------------------------------------------------------------------------
4+
|This schema is for the validation of technical acquisition metadata |
5+
|csv files according to the specification given for digitised surrogates in |
6+
|http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf |
7+
|This version is for WO 95 digitisation in the period 2014-15 |
8+
| 20140818 Version 1.0 DHU First release version for this project |
9+
| 20140910 Version 1.1 DHU Updated date regex to fix issues, allowed items up to 14, disallow fullstops |
10+
|at end of description as this causes search issues in Discovery. |
11+
| 20141016 version 1.2 NW Updated regex to allow 20 items, 500 ordinals & addition of legal_status |
12+
|and held_by fields, changed date column to covering_date |
13+
| 20141110 version 1.3 NW fixed sub_sub_series rule |
14+
|from sub_sub_series: range(1,7) or is("115") or if($piece/is("5500"),is("")) |
15+
|to sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115"))) |
16+
| 20160511 - RH - Update schema version to test CSV validator backward compatibility |
17+
---------------------------------------------------------------------------------------------------------------*/
18+
/*The header of the schema file, ie the statements version 1.2 and @totalColumns 33, indicates that this schema
19+
is using version 1.2 of the schema language (NB, not that it is version 1.2 of this particular schema),
20+
and that there are 33 columns in total in the file.*/
21+
batch_code: length(1,11) regex("^WO95Y14B([0-9]{3}|smp)$")
22+
//1st part, batch_code must be between 1 and 11 characters long, and (implicitly multiple conditions are joined
23+
//by a logical AND unless another boolean is provided). 2nd part restricts to form similar to WO95Y14B000 (last
24+
//three digits are a running number for batches throughout the project).
25+
department: is("WO") and (in($file_path) and in($resource_uri))
26+
//Parentheses control evaluation order of booleans as might be expected
27+
//Department is fixed value of WO for this project.
28+
//The grouped "in" statements say that the value found in this field must also be found as part of the fields
29+
//"file_path" and "resource_uri"
30+
division: is("13")
31+
//this field must be precisely 13
32+
series: is("95") and (in($file_path) and in($resource_uri))
33+
//Fixed value of 95 for this project
34+
//The value must also be part of the fields "file_path" and "resource_uri"
35+
sub_series: is("1")
36+
//For the 2014-15 project all material to be digitised is in sub_series 1 (France and Flanders)
37+
sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115")))
38+
//As described in Appendix E of the ITT, the 2014-15 project is scanning material in sub_sub_series 1-7 and 115,
39+
//Piece 5500 is also included which is not in any sub_sub_series, so the value is blank for that piece only.
40+
piece: if($sub_sub_series/is("1"),range(1,85),if($sub_sub_series/is("2"),range(86,153),if($sub_sub_series/is("3"),range(154,267),if($sub_sub_series/is("4"),range(268,358),if($sub_sub_series/is("5"),range(359,430),if($sub_sub_series/is("6"),range(431,517),if($sub_sub_series/is("7"),range(518,571),if($sub_sub_series/is("115"),range(3949,4193),if($sub_sub_series/is(""),is("5500")))))))))) and (in($file_path) and in($resource_uri))
41+
//For this project there is a defined relationship between piece ranges as listed in Appendix E
42+
//This is encapsulated in this rather complex if,then,else statement
43+
//The value must also be part of the fields "file_path" and "resource_uri"
44+
item: (range(1,20) and in($file_path)) or is("")
45+
//Most pieces are subdivided into items, there are not expected to be more than 20 per piece
46+
//The value must also be part of the field "file_path"
47+
//In many cases the item level is not used, so this would be left blank.
48+
//as the sorting/cataloguing process advances this condition may be tightened
49+
ordinal: range(1,500) and in($file_path) unique($department,$division,$series,$sub_series,$sub_sub_series,$piece,$item,$ordinal)
50+
//the ordinal is a simple running count of the images within an item (or piece if not itemised).
51+
//No single item (or piece if not itemised) should contain more than 150 pages but rule changed to 500 to allow for exceptions
52+
//This (with leading zeroes) also forms the final part of the filepath, immediately before the .jp2 extension
53+
//the combination of fields indicated should be unique within the file
54+
description: not("") and regex("^.*[^\.]$")
55+
//description is a fairly free-form field, but must not be empty
56+
covering_date: regex("^19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?(-19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?)?$")
57+
//dates according to The National Archives' cataloguing standards, expected to be a range for this project, but may be relaxed
58+
legal_status: is("Public Record")
59+
held_by: is("The National Archives, Kew")
60+
file_uuid: uuid4 unique
61+
//must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case.
62+
file_path: uri starts("file:///WO_95/") unique fileExists integrityCheck("excludeFolder")
63+
//fileExists checks that there is actually a file of the given name at the specified location on the file system.
64+
//In practice, the validator will normally be run with the --path switch
65+
//(see http://digital-preservation.github.io/csv-validator/)
66+
//We also require that the path is a valid uri, and begins file:///WO_95/ as this is the top-level folder for each batch
67+
//(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
68+
//content of this field)
69+
//must be unique within the file
70+
file_checksum: unique checksum(file($file_path),"SHA-256")
71+
//Compare the value given in this field to the checksum calculated for the file found at the location given in
72+
//the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself).
73+
//Use the specified checksum algorithm (must use lowercase hex characters).
74+
//unique within the file - an identical checksum would imply identical images
75+
resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/WO/95/") unique
76+
//Must be a valid uri which starts with the specified string, the uri is constructed such that it must be unique in the file
77+
//(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
78+
//content of this field)
79+
scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
80+
//12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is
81+
//restricted to the scanning company to avoid personally identifying data being held in the file)
82+
scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
83+
//Like "scan_operator", but this code represents the actual scanner or camera used
84+
scan_location: regex("[-\w\s,.]+")
85+
//Address or other description of the location where scanning physically occurred. The regex allows any number
86+
//of characters, allows general word and whitespace characters plus hyphen, comma and full stop
87+
image_resolution: positiveInteger (is("300") or is("600"))
88+
//Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used.
89+
//Depending how this is populated (whether nominal or actual resolution), it might be better to use a range
90+
//eg range(298,302) to capture slight variances in resolution.
91+
image_width: positiveInteger
92+
//Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this
93+
image_height: positiveInteger
94+
//Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this
95+
image_tonal_resolution: is("24-bit colour")
96+
//must be string: 24-bit colour (precisely - case as shown). Occasionally a different value might be specified.
97+
image_format: is("x-fmt/392")
98+
//must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM
99+
//(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392)
100+
image_compression: positiveInteger is("6")
101+
//Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm
102+
//available in the JPEG2000 specification
103+
image_colour_space: is("sRGB")
104+
//must be string: sRGB (precisely - case as shown)
105+
image_split: is("yes") or is("no") or is("composite")
106+
//must be string: yes; or string: no or string: composite (precisely - case as shown). Used if eg an image of complete double page
107+
//subsequently split into two separate images of each page individually, or if an oversize document is imaged as a composite of several images
108+
image_split_ordinal: if($image_split/is("composite"),range(1,9),is(""))
109+
//describes the ordering of the individual "tiles" when an oversize document has to be imaged in sections as a composite.
110+
//9 is expected to be sufficient, but will be reviewed if required
111+
//if image_split is not composite it must be blank
112+
image_split_other_uuid: if($image_split/is("no"),is(""),regex("^[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}(,[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}){0,8}$"))
113+
//if "image_split" field is no, must be blank
114+
//else it must be a uuid4 or comma separated list of up to 9 uuid4s
115+
//due to the requirement to allow a comma separated list regex has had to be used, rather than the built in uuid4 datatype
116+
image_crop: is("auto") or is("manual") or is("none")
117+
//must be string: auto; or string: manual or string: none (precisely - case as shown)
118+
image_deskew: is("yes") or is("no")
119+
//must be string: yes; or string: no (precisely - case as shown)
120+
comments: regex("[\w\s,.]+") @optional

csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorAcceptanceSpec.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -518,16 +518,16 @@ class MetaDataValidatorAcceptanceSpec extends Specification with TestResources {
518518

519519
"Url decode string provider" should {
520520
"decode url string to normal string" in {
521-
validate(TextFile(Path.fromString(base) / "urlDecodePass.csv"), parse(base + "/urlDecode.csvs"), None).isSuccess mustEqual true
521+
validate(TextFile(Path.fromString(base) / "uriDecodePass.csv"), parse(base + "/uriDecode.csvs"), None).isSuccess mustEqual true
522522
}
523523

524524
"fail for wrong url" in {
525-
validate(TextFile(Path.fromString(base) / "urlDecodeFail.csv"), parse(base + "/urlDecode.csvs"), None).isFailure mustEqual true
525+
validate(TextFile(Path.fromString(base) / "uriDecodeFail.csv"), parse(base + "/uriDecode.csvs"), None).isFailure mustEqual true
526526
}
527527

528528
"decode URL with optional charset parameter" in {
529529

530-
validate(TextFile(Path.fromString(base) / "urlDecodeWithCharsetPass.csv"), parse(base + "/urlDecodeWithCharset.csvs"), None).isSuccess mustEqual true
530+
validate(TextFile(Path.fromString(base) / "uriDecodeWithCharsetPass.csv"), parse(base + "/uriDecodeWithCharset.csvs"), None).isSuccess mustEqual true
531531
}
532532
}
533533

csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorIntegrityCheckSpec.scala

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,13 @@ class MetaDataValidatorIntegrityCheckSpec extends Specification with TestResourc
108108
validator.validate(TextFile(Path.fromString(WO95Path) / "tech_acq_metadata_v1_WO95Y14B003.csv"), parse(WO95Path + "/tech_acq_metadata_v1_WO95Y14B000.csvs",validator), None).isSuccess mustEqual true
109109
}
110110

111+
"Validate WO 95 with 1.2 schema version to test backward compatibility" in {
112+
113+
val substitutionPaths = List(("file:///WO_95",WO95Path))
114+
val validator = buildValidator(substitutionPaths)
115+
validator.validate(TextFile(Path.fromString(WO95Path) / "tech_acq_metadata_v1_WO95Y14B003.csv"), parse(WO95Path + "/tech_acq_metadata_v1_WO95Y14B000_v1.2.csvs",validator), None).isSuccess mustEqual true
116+
}
117+
111118

112119
"succeed with alternative substitution paths - header" in {
113120

0 commit comments

Comments
 (0)