1+ version 1.1
2+ @totalColumns 27
3+ /*--------------------------------------------------------------------------------------------------------------
4+ |This schema is for the validation of technical acquisition metadata |
5+ |csv files according to the specification given for digitised surrogates in |
6+ |http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf |
7+ |This version is generic; for a given digitisation project, specific values/ranges for department, division, |
8+ |series, sub_series, sub_sub_series, piece and item would be given, along with a specific format for batch_code|
9+ |(usually reflecting department and series) |
10+ --------------------------------------------------------------------------------------------------------------*/
11+ /*The header of the schema file, ie the statements version 1.1 and @totalColumns 27, indicates that this schema
12+ is using version 1.1 of the schema language (NB, not that it is version 1.1 of this particular schema),
13+ and that there are 27 columns in total in the file.*/
14+ batch_code: length(1,16) regex("^[0-9a-zA-Z]{1,16}$")
15+ //1st part: batch_code must be between 1 and 16 characters long (multiple conditions are implicitly joined
16+ //by a logical AND unless another boolean is provided). 2nd part restricts to alphanumeric characters as
17+ //specified in digitisation standards p 31. Would usually comprise project identifier (eg department and series),
18+ //plus running count of batch number within that.
19+ department: regex("[A-Z]{1,4}") and (in($file_path) and in($resource_uri))
20+ //Parentheses control evaluation order of booleans as might be expected
21+ //The regex statement says that this field must consist of between 1 and 4 upper case alphabetic characters.
22+ //The grouped "in" statements say that the value found in this field must also be found as part of the fields
23+ //"file_path" and "resource_uri"
24+ division: positiveInteger or is("")
25+ //this field must either be a positive integer or be blank (defined per project)
26+ series: positiveInteger and (in($file_path) and in($resource_uri))
27+ //in general we expect this field will be a positive (non-zero) integer. For a particular project, a specific
28+ //value will normally be given. The value must also be part of the fields "file_path" and "resource_uri"
29+ sub_series: positiveInteger or is("")
30+ //this field must either be a positive integer or be blank (defined per project)
31+ sub_sub_series: positiveInteger or is("")
32+ //this field must either be a positive integer or be blank (defined per project)
33+ piece: positiveInteger and (in($file_path) and in($resource_uri))
34+ //Generally this value will be a positive integer, rarely the piece reference may take a more complicated form
35+ //which would be defined on a per project basis.
36+ //Often the range of values for piece would be known, and so a statement such as range(1,578) might be used.
37+ //The value must also be part of the fields "file_path" and "resource_uri"
38+ item: (positiveInteger and (in($file_path) and in($resource_uri))) or is("")
39+ //Generally (if used) this value will be a positive integer, rarely the item reference may take a more
40+ //complicated form which would be defined on a per project basis.
41+ //The value must also be part of the fields "file_path" and "resource_uri"
42+ //In many cases the item level is not used, so this would be left blank.
43+ file_uuid: uuid4 unique
44+ //must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case.
45+ file_path: fileExists uri starts(concat("file:///",$department,"_",$series,"/",$piece,"/",$item,"/",$piece,"_",$item,"_")) regex(".*[12]_[12]_((00[1-9])|(010))\.xml$") integrityCheck("","","excludeFolder")
46+ //fileExists checks that there is actually a file of the given name at the specified location on the file system.
47+ //In practice, the validator will normally be run with the --path switch
48+ //(see http://digital-preservation.github.io/csv-validator/)
49+ //We also require that the path is a valid uri, and begins file:///<department>_<series>/<piece>/<item>/<piece>_<item>_
50+ //and that the filename at the end of the path is of the desired form specified by regex (the "." before xml is
51+ //escaped so that only a literal full stop matches). (Conditions specified on earlier columns say that the values
52+ //of those columns must also appear as part of the content of this field)
53+ file_checksum: checksum(file($file_path),"SHA-256")
54+ //Compare the value given in this field to the checksum calculated for the file found at the location given in
55+ //the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself).
56+ //Use the specified checksum algorithm (must use lowercase hex characters).
57+ resource_uri: uri is(concat("http://datagov.nationalarchives.gov.uk/66/",$department,"/",$series,"/",$piece,"/",$item,"/",$file_uuid))
58+ //Must be a valid uri which is exactly the specified string (prefix, department, series, piece, item and file_uuid)
59+ //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
60+ //content of this field)
61+ scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
62+ //Between 1 and 12 alphanumeric characters representing the identity of the scanning operator (the ability to decode
63+ //this is restricted to the scanning company to avoid personally identifying data being held in the file)
64+ scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
65+ //Like "scan_operator", but this code represents the actual scanner or camera used
66+ scan_location: regex("[-\w\s,.]+")
67+ //Address or other description of the location where scanning physically occurred. The regex requires at least one
68+ //character, and allows general word and whitespace characters plus hyphen, comma and full stop
69+ image_resolution: positiveInteger is("300")
70+ //Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used.
71+ //Depending how this is populated (whether nominal or actual resolution), it might be better to use a range
72+ //eg range(298,302) to capture slight variances in resolution.
73+ image_width: positiveInteger
74+ //Must be a positive (non-zero) integer. If the size of the material being digitised is well understood could use
75+ //a range check to ensure values are within a "sensible" range eg range(2400,2600) for A4 material - just over
76+ //8" wide (portrait), plus border, and assuming 300 ppi
77+ image_height: positiveInteger
78+ //Must be a positive (non-zero) integer. If the size of the material being digitised is well understood could use
79+ //a range check to ensure values are within a "sensible" range eg range(3450,3650) for A4 material - just over
80+ //11.5" high (portrait), plus border, and assuming 300 ppi
81+ image_tonal_resolution: is("24-bit colour")
82+ //must be string: 24-bit colour (precisely - case as shown). Occasionally a different value might be specified.
83+ image_format: is("x-fmt/392")
84+ //must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM
85+ //(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392)
86+ image_compression: positiveInteger is("6")
87+ //Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm
88+ //available in the JPEG2000 specification
89+ image_colour_space: is("sRGB")
90+ //must be string: sRGB (precisely - case as shown). Other colour spaces might be used for specific projects
91+ image_split: is("yes") or is("no")
92+ //must be string: yes; or string: no (precisely - case as shown). Used if eg an image of complete double page
93+ //subsequently split into two separate images of each page individually
94+ image_split_other_uuid: if($image_split/is("yes"),uuid4,is(""))
95+ //if "image_split" field is yes, must be a uuid4, else must be blank (in certain circumstances it would be
96+ //possible that this could be a list of uuids, in which case the conditions would have to be reworked)
97+ image_crop: is("auto") or is("manual") or is("none")
98+ //must be string: auto; or string: manual; or string: none (precisely - case as shown)
99+ image_deskew: is("yes") or is("no")
100+ //must be string: yes; or string: no (precisely - case as shown)
101+ comments: regex("[\w\s,.]+") @optional
0 commit comments