Skip to content

Commit aa952fc

Browse files
author
Radek Hubner
committed
urlDecode -> uriDecode. Add compatibility test.
1 parent 8266aee commit aa952fc

File tree

12 files changed

+141
-14
lines changed

12 files changed

+141
-14
lines changed

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_2/Schema.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import uk.gov.nationalarchives.csv.validator.metadata.Row
1212
import uk.gov.nationalarchives.csv.validator.schema.{Schema, ArgProvider}
1313
import java.net.{URLDecoder => JURLDecoder}
1414

15-
case class UrlDecode(value: ArgProvider, charset: Option[ArgProvider]) extends ArgProvider {
15+
case class UriDecode(value: ArgProvider, charset: Option[ArgProvider]) extends ArgProvider {
1616

1717
val DefaultCharset = "UTF-8"
1818

@@ -24,5 +24,5 @@ case class UrlDecode(value: ArgProvider, charset: Option[ArgProvider]) extends A
2424

2525
})
2626

27-
override def toError: String = "urlDecode(" + value.toError + ")"
27+
override def toError: String = "uriDecode(" + value.toError + ")"
2828
}

csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/v1_2/SchemaParser.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ trait SchemaParser extends SchemaParser1_1 {
2121
s => Literal(Some(s))
2222
}
2323

24-
lazy val urlDecode: PackratParser[ArgProvider] = "UrlDecode" ::= "urlDecode(" ~> stringProvider ~ opt("," ~> stringProvider) <~ ")" ^^ {
25-
case value ~ charset => UrlDecode(value, charset)
24+
lazy val urlDecode: PackratParser[ArgProvider] = "UriDecode" ::= "uriDecode(" ~> stringProvider ~ opt("," ~> stringProvider) <~ ")" ^^ {
25+
case value ~ charset => UriDecode(value, charset)
2626
}
2727

2828
}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
version 1.2
22
@totalColumns 2 @noHeader
33
identifier:
4-
filename: in(urlDecode($identifier))
4+
filename: in(uriDecode($identifier))
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
version 1.2
22
@totalColumns 3 @noHeader
33
identifier:
4-
filename: in(urlDecode($identifier, $charset))
4+
filename: in(uriDecode($identifier, $charset))
55
charset:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
version 1.2
2+
@totalColumns 33
3+
/*---------------------------------------------------------------------------------------------------------------
4+
|This schema is for the validation of technical acquisition metadata |
5+
|csv files according to the specification given for digitised surrogates in |
6+
|http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf |
7+
|This version is for WO 95 digitisation in the period 2014-15 |
8+
| 20140818 Version 1.0 DHU First release version for this project |
9+
| 20140910 Version 1.1 DHU Updated date regex to fix issues, allowed items up to 14, disallow fullstops |
10+
|at end of description as this causes search issues in Discovery. |
11+
| 20141016 version 1.2 NW Updated regex to allow 20 items, 500 ordinals & addition of legal_status |
12+
|and held_by fields, changed date column to covering_date |
13+
| 20141110 version 1.3 NW fixed sub_sub_series rule |
14+
|from sub_sub_series: range(1,7) or is("115") or if($piece/is("5500"),is("")) |
15+
|to sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115"))) |
16+
| 20160511 - RH - Update schema version to test CSV validator backward compatibility |
17+
---------------------------------------------------------------------------------------------------------------*/
18+
/*The header of the schema file, ie the statements version 1.2 and @totalColumns 33, indicates that this schema
19+
is using version 1.2 of the schema language (NB, not that it is version 1.2 of this particular schema),
20+
and that there are 33 columns in total in the file.*/
21+
batch_code: length(1,11) regex("^WO95Y14B([0-9]{3}|smp)$")
22+
//1st part, batch_code must be between 1 and 11 characters long, and (implicitly multiple conditions are joined
23+
//by a logical AND unless another boolean is provided). 2nd part restricts to form similar to WO95Y14B000 (last
24+
//three digits are a running number for batches throughout the project).
25+
department: is("WO") and (in($file_path) and in($resource_uri))
26+
//Parentheses control evaluation order of booleans as might be expected
27+
//Department is fixed value of WO for this project.
28+
//The grouped "in" statements say that the value found in this field must also be found as part of the fields
29+
//"file_path" and "resource_uri"
30+
division: is("13")
31+
//this field must be precisely 13
32+
series: is("95") and (in($file_path) and in($resource_uri))
33+
//Fixed value of 95 for this project
34+
//The value must also be part of the fields "file_path" and "resource_uri"
35+
sub_series: is("1")
36+
//For the 2014-15 project all material to be digitised is in sub_series 1 (France and Flanders)
37+
sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115")))
38+
//As described in Appendix E of the ITT, the 2014-15 project is scanning material in sub_sub_series 1-7 and 115,
39+
//Piece 5500 is also included which is not in any sub_sub_series, so the value is blank for that piece only.
40+
piece: if($sub_sub_series/is("1"),range(1,85),if($sub_sub_series/is("2"),range(86,153),if($sub_sub_series/is("3"),range(154,267),if($sub_sub_series/is("4"),range(268,358),if($sub_sub_series/is("5"),range(359,430),if($sub_sub_series/is("6"),range(431,517),if($sub_sub_series/is("7"),range(518,571),if($sub_sub_series/is("115"),range(3949,4193),if($sub_sub_series/is(""),is("5500")))))))))) and (in($file_path) and in($resource_uri))
41+
//For this project there is a defined relationship between piece ranges as listed in Appendix E
42+
//This is encapsulated in this rather complex if,then,else statement
43+
//The value must also be part of the fields "file_path" and "resource_uri"
44+
item: (range(1,20) and in($file_path)) or is("")
45+
//Most pieces are subdivided into items, there are not expected to be more than 20 per piece
46+
//The value must also be part of the field "file_path"
47+
//In many cases the item level is not used, so this would be left blank.
48+
//as the sorting/cataloguing process advances this condition may be tightened
49+
ordinal: range(1,500) and in($file_path) unique($department,$division,$series,$sub_series,$sub_sub_series,$piece,$item,$ordinal)
50+
//the ordinal is a simple running count of the images within an item (or piece if not itemised).
51+
//No single item (or piece if not itemised) should contain more than 150 pages but rule changed to 500 to allow for exceptions
52+
//This (with leading zeroes) also forms the final part of the filepath, immediately before the .jp2 extension
53+
//the combination of fields indicated should be unique within the file
54+
description: not("") and regex("^.*[^\.]$")
55+
//description is a fairly free-form field, but must not be empty
56+
covering_date: regex("^19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?(-19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec)( ([1-3][0-9]|[1-9]))?)?)?$")
57+
//dates according to The National Archives' cataloguing standards, expected to be a range for this project, but may be relaxed
58+
legal_status: is("Public Record")
59+
held_by: is("The National Archives, Kew")
60+
file_uuid: uuid4 unique
61+
//must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case.
62+
file_path: uri starts("file:///WO_95/") unique fileExists integrityCheck("excludeFolder")
63+
//fileExists checks that there is actually a file of the given name at the specified location on the file system.
64+
//In practice, the validator will normally be run with the --path switch
65+
//(see http://digital-preservation.github.io/csv-validator/)
66+
//We also require that the path is a valid uri, and begins file:///WO_95/ as this is the top-level folder for each batch
67+
//(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
68+
//content of this field)
69+
//must be unique within the file
70+
file_checksum: unique checksum(file($file_path),"SHA-256")
71+
//Compare the value given in this field to the checksum calculated for the file found at the location given in
72+
//the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself).
73+
//Use the specified checksum algorithm (must use lowercase hex characters).
74+
//unique within the file - an identical checksum would imply identical images
75+
resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/WO/95/") unique
76+
//Must be a valid uri which starts with the specified string, the uri is constructed such that it must be unique in the file
77+
//(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
78+
//content of this field)
79+
scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
80+
//12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is
81+
//restricted to the scanning company to avoid personally identifying data being held in the file)
82+
scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
83+
//Like "scan_operator", but this code represents the actual scanner or camera used
84+
scan_location: regex("[-\w\s,.]+")
85+
//Address or other description of the location where scanning physically occurred. The regex allows any number
86+
//of characters, allows general word and whitespace characters plus hyphen, comma and full stop
87+
image_resolution: positiveInteger (is("300") or is("600"))
88+
//Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used.
89+
//Depending how this is populated (whether nominal or actual resolution), it might be better to use a range
90+
//eg range(298,302) to capture slight variances in resolution.
91+
image_width: positiveInteger
92+
//Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this
93+
image_height: positiveInteger
94+
//Must be a positive (non-zero) integer. The material in this series is very varied in size, so no checking is attempted beyond this
95+
image_tonal_resolution: is("24-bit colour")
96+
//must be string: 24-bit colour (precisely - case as shown). Occasionally a different value might be specified.
97+
image_format: is("x-fmt/392")
98+
//must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM
99+
//(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392)
100+
image_compression: positiveInteger is("6")
101+
//Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm
102+
//available in the JPEG2000 specification
103+
image_colour_space: is("sRGB")
104+
//must be string: sRGB (precisely - case as shown)
105+
image_split: is("yes") or is("no") or is("composite")
106+
//must be string: yes; or string: no or string: composite (precisely - case as shown). Used if eg an image of complete double page
107+
//subsequently split into two separate images of each page individually, or if an oversize document is imaged as a composite of several images
108+
image_split_ordinal: if($image_split/is("composite"),range(1,9),is(""))
109+
//describes the ordering of the individual "tiles" when an oversize document has to be imaged in sections as a composite.
110+
//9 is expected to be sufficient, but will be reviewed if required
111+
//if image_split is not composite it must be blank
112+
image_split_other_uuid: if($image_split/is("no"),is(""),regex("^[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}(,[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-?[a-f0-9]{12}){0,8}$"))
113+
//if "image_split" field is no, must be blank
114+
//else it must be a uuid4 or comma separated list of up to 9 uuid4s
115+
//due to the requirement to allow a comma separated list regex has had to be used, rather than the built in uuid4 datatype
116+
image_crop: is("auto") or is("manual") or is("none")
117+
//must be string: auto; or string: manual or string: none (precisely - case as shown)
118+
image_deskew: is("yes") or is("no")
119+
//must be string: yes; or string: no (precisely - case as shown)
120+
comments: regex("[\w\s,.]+") @optional

csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorAcceptanceSpec.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -518,16 +518,16 @@ class MetaDataValidatorAcceptanceSpec extends Specification with TestResources {
518518

519519
"Url decode string provider" should {
520520
"decode url string to normal string" in {
521-
validate(TextFile(Path.fromString(base) / "urlDecodePass.csv"), parse(base + "/urlDecode.csvs"), None).isSuccess mustEqual true
521+
validate(TextFile(Path.fromString(base) / "uriDecodePass.csv"), parse(base + "/uriDecode.csvs"), None).isSuccess mustEqual true
522522
}
523523

524524
"fail for wrong url" in {
525-
validate(TextFile(Path.fromString(base) / "urlDecodeFail.csv"), parse(base + "/urlDecode.csvs"), None).isFailure mustEqual true
525+
validate(TextFile(Path.fromString(base) / "uriDecodeFail.csv"), parse(base + "/uriDecode.csvs"), None).isFailure mustEqual true
526526
}
527527

528528
"decode URL with optional charset parameter" in {
529529

530-
validate(TextFile(Path.fromString(base) / "urlDecodeWithCharsetPass.csv"), parse(base + "/urlDecodeWithCharset.csvs"), None).isSuccess mustEqual true
530+
validate(TextFile(Path.fromString(base) / "uriDecodeWithCharsetPass.csv"), parse(base + "/uriDecodeWithCharset.csvs"), None).isSuccess mustEqual true
531531
}
532532
}
533533

csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorIntegrityCheckSpec.scala

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,13 @@ class MetaDataValidatorIntegrityCheckSpec extends Specification with TestResourc
108108
validator.validate(TextFile(Path.fromString(WO95Path) / "tech_acq_metadata_v1_WO95Y14B003.csv"), parse(WO95Path + "/tech_acq_metadata_v1_WO95Y14B000.csvs",validator), None).isSuccess mustEqual true
109109
}
110110

111+
"Validate WO 95 with 1.2 schema version to test backward compatibility" in {
112+
113+
val substitutionPaths = List(("file:///WO_95",WO95Path))
114+
val validator = buildValidator(substitutionPaths)
115+
validator.validate(TextFile(Path.fromString(WO95Path) / "tech_acq_metadata_v1_WO95Y14B003.csv"), parse(WO95Path + "/tech_acq_metadata_v1_WO95Y14B000_v1.2.csvs",validator), None).isSuccess mustEqual true
116+
}
117+
111118

112119
"succeed with alternative substitution paths - header" in {
113120

0 commit comments

Comments
 (0)