Skip to content

Commit 98d5336

Browse files
committed
Reinstate example CSV Schemas provided by TNA
1 parent 4e5736d commit 98d5336

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+777
-2
lines changed

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ is then published as HTML. The Schema language is formally expressed in EBNF.
99
You can find the documentation and latest published specification here:
1010
http://digital-preservation.github.io/csv-schema.
1111

12+
* Examples of CSV Schemas can be found in the [`example-schemas`](https://github.com/digital-preservation/csv-schema/tree/master/example-schemas) folder.
13+
1214

1315
Repository Organisation
1416
-----------------------
@@ -19,5 +21,4 @@ Repository Organisation
1921
* There is one tag from master each time a version of the specification is published. The tag name reflects
2022
the specification version number.
2123

22-
23-
Released under the [Mozilla Public Licence version 2.0](http://www.mozilla.org/MPL/2.0/).
24+
Released under the [Mozilla Public Licence version 2.0](http://www.mozilla.org/MPL/2.0/).
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
version 1.0
2+
@totalColumns 42
3+
/********************************************************************************
4+
*Schema: ADM_362-technical-acquisition-with-minimal-transcription.csvs *
5+
*Authors: Nicki Welch *
6+
* David Underdown *
7+
*Purpose: To capture metadata about the digitisation of the ADM 362 series *
8+
* Primarily technical metadata, but with a minimal amount of *
9+
* transcription to verify that the records may be publicly released *
10+
* after receipt by The National Archives *
11+
*Revision: 1.0 first release *
12+
* 1.1 update as some official numbers only single digit *
13+
* 1.2 allow M as official number prefix too *
14+
* 1.3 further additions to prefixes, L, S, SS, SSX *
15+
* 1.4 allow for * and ? in official number *
16+
********************************************************************************/
17+
batch_code: length(10) regex("^ADM362B([0-9]{3})$")
18+
department: (is("ADM") if($file_path/notEmpty,in($file_path) and in($resource_uri)))
19+
series: is("362") and if($file_path/notEmpty,in($file_path) and in($resource_uri))
20+
piece: range(1,69720) if($file_path/notEmpty,in($file_path) and in($resource_uri))
21+
item: ((positiveInteger unique($piece,$item,$ordinal)) or empty) if($file_path/notEmpty,in($file_path) and in($resource_uri))
22+
ordinal: if($item/empty,empty,unique($item,$ordinal))
23+
file_uuid: if($ordinal/empty,empty,uuid4 unique)
24+
file_path: uri if($ordinal/empty,empty,unique fileExists regex("^file:\/\/\/ADM_362\/[0-9]{1,5}\/[0-9]{1,5}\/[0-9]{1,4}_.+\.jp2$"))
25+
file_checksum: if($ordinal/empty,empty,checksum(file($file_path),"SHA-256"))
26+
resource_uri: if($ordinal/notEmpty,uri starts("http://datagov.nationalarchives.gov.uk/66/"))
27+
scan_operator: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z]{1,12}$"))
28+
scan_id: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z_]{1,12}$"))
29+
scan_location: if($ordinal/empty,empty,regex("[-\w\s,]+"))
30+
scan_native_format: if($ordinal/empty,empty,regex("[0-9\w\s,.:]+"))
31+
scan_timestamp: if($ordinal/empty,empty,xDateTime)
32+
image_resolution: if($ordinal/empty,empty,is("300"))
33+
image_width: if($ordinal/empty,empty,positiveInteger)
34+
image_height: if($ordinal/empty,empty,positiveInteger)
35+
image_tonal_resolution: if($ordinal/empty,empty,is("24-bit colour"))
36+
image_format: if($ordinal/empty,empty,is("x-fmt/392"))
37+
image_colour_space: if($ordinal/empty,empty,is("sRGB"))
38+
process_location: if($ordinal/empty,empty,regex("[-\w\s,]+"))
39+
jp2_creation_timestamp: if($ordinal/empty,empty,xDateTime)
40+
uuid_timestamp: if($ordinal/empty,empty,xDateTime)
41+
embed_timestamp: if($ordinal/empty,empty,xDateTime)
42+
image_split: if($ordinal/empty,empty,is("yes") or is("no"))
43+
image_split_other_uuid: if($ordinal/empty,empty,if($image_split/is("yes"),uuid4,is("")))
44+
image_split_operator: if($ordinal/empty,empty,if($image_split/is("yes"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")))
45+
image_split_timestamp: if($ordinal/empty,empty,if($image_split/is("yes"),xDateTime,is("")))
46+
image_crop: if($ordinal/empty,empty,is("auto") or is("manual") or is("none"))
47+
//An operator id is required only when image_crop is "manual"; otherwise the field must be blank.
//(The condition must test image_crop: image_split only ever holds "yes" or "no", so it could never equal "manual".)
image_crop_operator: if($ordinal/empty,empty,if($image_crop/is("manual"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")))
48+
image_crop_timestamp: if($ordinal/empty,empty,if($image_crop/is("none"),empty,xDateTime))
49+
image_deskew: if($ordinal/empty,empty,is("yes") or is("no"))
50+
image_deskew_operator: if($ordinal/empty,empty,if($image_deskew/is("yes"),regex("^[0-9a-zA-Z]{1,12}$"),is("")))
51+
image_deskew_timestamp: if($ordinal/empty,empty,if($image_deskew/is("yes"),xDateTime,is("")))
52+
QA-code: regex("^[0-9/,]{1,2}$") @optional
53+
comments: regex("[\w\s,\.]+") @optional
54+
transcribed_volume_number: if($item/empty,regex("[0-9A-Z\-\s]{1,15}"),is(""))
55+
transcribed_birth_date_day: if(($ordinal/empty and $item/notEmpty),regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$"),is(""))
56+
transcribed_birth_date_month: if(($ordinal/empty and $item/notEmpty),is("*") or is("?") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December"), is(""))
57+
transcribed_birth_date_year: if(($ordinal/empty and $item/notEmpty),regex("^1[7-9][0-9\?]{2}|\*$"),is(""))
58+
//Official number: a known prefix followed by 1 to 6 characters, each a digit, * or ? (see revision 1.4).
//Inside a character class * and ? are literal and need no escaping; the stray "/" previously also admitted slashes.
transcribed_official_number: if(($ordinal/empty and $item/notEmpty),regex("^([FJKLMS]|SS|SSX)[*?0-9]{1,6}$"),is(""))
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
version 1.0
2+
@totalColumns 42
3+
/********************************************************************************
4+
*Schema: ADM_363-technical-acquisition-with-minimal-transcription.csvs *
5+
*Authors: Nicki Welch *
6+
* David Underdown *
7+
*Purpose: To capture metadata about the digitisation of the ADM 363 series *
8+
* Primarily technical metadata, but with a minimal amount of *
9+
* transcription to verify that the records may be publicly released *
10+
* after receipt by The National Archives *
11+
*Revision: 1.0 first release *
12+
* 1.1 update as some official numbers only single digit *
13+
* 1.2 allow M as official number prefix too *
14+
* 1.3 further additions to prefixes, L, S, SS, SSX *
15+
* 1.4 allow for * and ? in official number *
16+
********************************************************************************/
17+
batch_code: length(10) regex("^ADM36[23]B([0-9]{3})$")
18+
department: (is("ADM") if($file_path/notEmpty,in($file_path) and in($resource_uri)))
19+
series: is("363") and if($file_path/notEmpty,in($file_path) and in($resource_uri))
20+
piece: range(1,69720) if($file_path/notEmpty,in($file_path) and in($resource_uri))
21+
item: ((positiveInteger unique($piece,$item,$ordinal)) or empty) if($file_path/notEmpty,in($file_path) and in($resource_uri))
22+
ordinal: if($item/empty,empty,unique($item,$ordinal))
23+
file_uuid: if($ordinal/empty,empty,uuid4 unique)
24+
file_path: uri if($ordinal/empty,empty,unique fileExists regex("^file:\/\/\/ADM_363\/[0-9]{1,5}\/[0-9]{1,5}\/[0-9]{1,4}_.+\.jp2$"))
25+
file_checksum: if($ordinal/empty,empty,checksum(file($file_path),"SHA-256"))
26+
resource_uri: if($ordinal/notEmpty,uri starts("http://datagov.nationalarchives.gov.uk/66/"))
27+
scan_operator: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z]{1,12}$"))
28+
scan_id: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z_]{1,12}$"))
29+
scan_location: if($ordinal/empty,empty,regex("[-\w\s,]+"))
30+
scan_native_format: if($ordinal/empty,empty,regex("[0-9\w\s,.:]+"))
31+
scan_timestamp: if($ordinal/empty,empty,xDateTime)
32+
image_resolution: if($ordinal/empty,empty,is("300"))
33+
image_width: if($ordinal/empty,empty,positiveInteger)
34+
image_height: if($ordinal/empty,empty,positiveInteger)
35+
image_tonal_resolution: if($ordinal/empty,empty,is("24-bit colour"))
36+
image_format: if($ordinal/empty,empty,is("x-fmt/392"))
37+
image_colour_space: if($ordinal/empty,empty,is("sRGB"))
38+
process_location: if($ordinal/empty,empty,regex("[-\w\s,]+"))
39+
jp2_creation_timestamp: if($ordinal/empty,empty,xDateTime)
40+
uuid_timestamp: if($ordinal/empty,empty,xDateTime)
41+
embed_timestamp: if($ordinal/empty,empty,xDateTime)
42+
image_split: if($ordinal/empty,empty,is("yes") or is("no"))
43+
image_split_other_uuid: if($ordinal/empty,empty,if($image_split/is("yes"),uuid4,is("")))
44+
image_split_operator: if($ordinal/empty,empty,if($image_split/is("yes"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")))
45+
image_split_timestamp: if($ordinal/empty,empty,if($image_split/is("yes"),xDateTime,is("")))
46+
image_crop: if($ordinal/empty,empty,is("auto") or is("manual") or is("none"))
47+
//An operator id is required only when image_crop is "manual"; otherwise the field must be blank.
//(The condition must test image_crop: image_split only ever holds "yes" or "no", so it could never equal "manual".)
image_crop_operator: if($ordinal/empty,empty,if($image_crop/is("manual"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")))
48+
image_crop_timestamp: if($ordinal/empty,empty,if($image_crop/is("none"),empty,xDateTime))
49+
image_deskew: if($ordinal/empty,empty,is("yes") or is("no"))
50+
image_deskew_operator: if($ordinal/empty,empty,if($image_deskew/is("yes"),regex("^[0-9a-zA-Z]{1,12}$"),is("")))
51+
image_deskew_timestamp: if($ordinal/empty,empty,if($image_deskew/is("yes"),xDateTime,is("")))
52+
QA-code: regex("^[0-9/,]{1,2}$") @optional
53+
comments: regex("[\w\s,\.]+") @optional
54+
transcribed_volume_number: if($item/empty,regex("[0-9A-Z\-\s]{1,15}"),is(""))
55+
transcribed_birth_date_day: if(($ordinal/empty and $item/notEmpty),regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$"),is(""))
56+
transcribed_birth_date_month: if(($ordinal/empty and $item/notEmpty),is("*") or is("?") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December"), is(""))
57+
transcribed_birth_date_year: if(($ordinal/empty and $item/notEmpty),regex("^1[7-9][0-9\?]{2}|\*$"),is(""))
58+
//Official number: a known prefix followed by 1 to 6 characters, each a digit, * or ? (see revision 1.4).
//Inside a character class * and ? are literal and need no escaping; the stray "/" previously also admitted slashes.
transcribed_official_number: if(($ordinal/empty and $item/notEmpty),regex("^([FJKLMS]|SS|SSX)[*?0-9]{1,6}$"),is(""))

example-schemas/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
CSV Schemas
2+
===========
3+
4+
CSV Schemas expressed in the [CSV Schema Language](http://digital-preservation.github.io/csv-schema/csv-schema-1.0.html).
5+
6+
CSV Schemas created by the Digital Preservation and Digital Repository Infrastructure teams at The National Archives will be added to this folder to make them available to digitisation partners and to serve as examples of the use of the CSV Schema Language.
7+
8+
An initial example CSV can be found in the [`example-data`](https://github.com/digital-preservation/csv-schema/tree/master/example-schemas/example-data) folder, which relates to the XML files to be found in its subfolder TEST_1 and further subfolders. This is designed to be validated against the schema [`digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csvs`](https://github.com/digital-preservation/csv-schema/blob/master/example-schemas/digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csvs). In a genuine digitisation project, the files described by the metadata CSV would be JPEG2000s, but these would tend to be quite large, so to make downloading more practical for demonstration purposes, we have supplied only the XML which would normally be embedded within the JPEG2000 file.
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
version 1.0
2+
@totalColumns 27
3+
/***************************************************************************************************************
4+
*This schema is for the validation of technical acquisition metadata *
5+
*csv files according to the specification given for digitised surrogates in *
6+
*http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf *
7+
*This version is an example only, using "fake" values/ranges for department, division, series, sub_series, *
8+
*sub_sub_series, piece and item. A specific format for batch_code is given though this reflects only the "fake"*
9+
*department code, rather than also reflecting the series reference as would usually be the case *
10+
***************************************************************************************************************/
11+
/*The header of the schema file, ie the statements version 1.0 and @totalColumns 27, indicates that this schema
12+
is using version 1.0 of the schema language (NB, not that it is version 1.0 of this particular schema),
13+
and that there are 27 columns in total in the file.*/
14+
batch_code: starts("TESTBATCH") length(1,16) regex("^[0-9a-zA-Z]{1,16}$")
15+
//1st part, batch_code must be between 1 and 16 characters long, and (implicitly multiple conditions are joined
16+
//by a logical AND unless another boolean is provided). 2nd part restricts to alphanumeric characters as
17+
//specified in digitisation standards p 31. Would usually comprise project identifier (eg department and series),
18+
//plus running count of batch number within that - in this case TESTBATCH followed by count (zero padded).
19+
department: is("TEST") regex("[A-Z]{1,4}") and (in($file_path) and in($resource_uri))
20+
//Parentheses control evaluation order of booleans as might be expected
21+
//The regex statement says that this field must consist of between 1 and 4 upper case alphabetic characters.
22+
//The grouped "in" statements say that the value found in this field must also be found as part of the fields
23+
//"file_path" and "resource_uri"
24+
division: is("")
25+
//this field must be blank for this example
26+
series: is("1") positiveInteger and (in($file_path) and in($resource_uri))
27+
//in general we expect this field will be a positive (non-zero) integer. For this example, a specific
28+
//value of 1 is given. The value must also be part of the fields "file_path" and "resource_uri"
29+
sub_series: positiveInteger or is("")
30+
//this field must either be a positive integer or be blank (defined per project). For this example, a mixture
31+
//of values will be used for demo purposes (this would not be the case in a real project)
32+
sub_sub_series: is("")
33+
//this field must be blank (defined per project)
34+
piece: range(1,3) positiveInteger and (in($file_path) and in($resource_uri))
35+
//Generally this value will be a positive integer, rarely the piece reference may take a more complicated form
36+
//which would be defined on a per project basis.
37+
//Often the range of values for piece would be known, and so a statement such as range(1,3) etc might be used as
38+
//in this example.
39+
//The value must also be part of the fields "file_path" and "resource_uri"
40+
item: (positiveInteger and (in($file_path) and in($resource_uri))) or is("")
41+
//Generally (if used) this value will be a positive integer, rarely the item reference may take a more
42+
//complicated form which would be defined on a per project basis.
43+
//The value must also be part of the fields "file_path" and "resource_uri"
44+
//In many cases the item level is not used, so this would be left blank.
45+
//for this example a mixture of blanks and integers is used (this is unlikely to be the case in a real project)
46+
file_uuid: uuid4 unique
47+
//must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case.
48+
file_path: fileExists uri starts("file:///")
49+
//fileExists checks that there is actually a file of the given name at the specified location on the file system.
50+
//In practice, the validator will normally be run with the --path switch
51+
//(see http://digital-preservation.github.io/csv-validator/)
52+
//We also require that the path is a valid uri, and begins file:///
53+
//(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
54+
//content of this field)
55+
file_checksum: checksum(file($file_path),"SHA-256")
56+
//Compare the value given in this field to the checksum calculated for the file found at the location given in
57+
//the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself).
58+
//Use the specified checksum algorithm (must use lowercase hex characters).
59+
resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/")
60+
//Must be a valid uri which starts with the specified string
61+
//(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
62+
//content of this field)
63+
scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
64+
//12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is
65+
//restricted to the scanning company to avoid personally identifying data being held in the file)
66+
scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
67+
//Like "scan_operator", but this code represents the actual scanner or camera used
68+
scan_location: regex("[-\w\s,.]+")
69+
//Address or other description of the location where scanning physically occurred. The regex allows any number
70+
//of characters, allows general word and whitespace characters plus hyphen, comma and full stop
71+
image_resolution: positiveInteger is("300")
72+
//Always a positive (non-zero) integer, and in general explicitly 300. Occasionally a higher resolution used.
73+
//Depending how this is populated (whether nominal or actual resolution), it might be better to use a range
74+
//eg range(298,302) to capture slight variances in resolution.
75+
image_width: positiveInteger
76+
//Must be a positive (non-zero) integer. If the size of the material being digitised is well understood could use
77+
//a range check to ensure values are within a "sensible" range eg range(2400,2600) for A4 material - just over
78+
//8" wide (portrait), plus border, and assuming 300 ppi
79+
image_height: positiveInteger
80+
//Must be a positive (non-zero) integer. If the size of the material being digitised is well understood could use
81+
//a range check to ensure values are within a "sensible" range eg range(3450,3650) for A4 material - just over
82+
//11.5" high (portrait), plus border, and assuming 300 ppi
83+
image_tonal_resolution: is("24-bit colour")
84+
//must be string: 24-bit colour (precisely - case as shown). Occasionally a different value might be specified.
85+
image_format: is("x-fmt/392")
86+
//must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM
87+
//(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392)
88+
image_compression: positiveInteger is("6")
89+
//Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm
90+
//available in the JPEG2000 specification
91+
image_colour_space: is("sRGB")
92+
//must be string: sRGB (precisely - case as shown). Other colour spaces might be used for specific projects
93+
image_split: is("yes") or is("no")
94+
//must be string: yes; or string: no (precisely - case as shown). Used if eg an image of complete double page
95+
//subsequently split into two separate images of each page individually
96+
image_split_other_uuid: if($image_split/is("yes"),uuid4,is(""))
97+
//if "image_split" field is yes, must be a uuid4, else must be blank (in certain circumstances it would be
98+
//possible that this could be a list of uuids, in which case the conditions would have to be reworked)
99+
image_crop: is("auto") or is("manual") or is("none")
100+
//must be string: auto; or string: manual or string: none (precisely - case as shown)
101+
image_deskew: is("yes") or is("no")
102+
//must be string: yes; or string: no (precisely - case as shown)
103+
comments: regex("[\w\s,.]+") @optional
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<DigitalFile xmlns="http://nationalarchives.gov.uk/2012/dri/artifact/embedded/metadata" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
3+
<UUID>5fe890e9-6650-46db-bc74-81985a4a9580</UUID>
4+
<URI>http://datagov.nationalarchives.gov.uk/66/TEST/1/1/1/5fe890e9-6650-46db-bc74-81985a4a9580</URI>
5+
<Copyright>&#169; Crown copyright: The National Archives of the UK</Copyright>
6+
</DigitalFile>

0 commit comments

Comments
 (0)