digital-preservation
diff --git a/‎README.md‎
Lines changed: 3 additions & 2 deletions b/‎README.md‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎example-schemas/ADM_362-technical-acquisition-with-minimal-transcription.csvs‎
Lines changed: 58 additions & 0 deletions b/‎example-schemas/ADM_362-technical-acquisition-with-minimal-transcription.csvs‎
Lines changed: 58 additions & 0 deletions
diff --git a/‎example-schemas/ADM_363-technical-acquisition-with-minimal-transcription.csvs‎
Lines changed: 58 additions & 0 deletions b/‎example-schemas/ADM_363-technical-acquisition-with-minimal-transcription.csvs‎
Lines changed: 58 additions & 0 deletions
diff --git a/‎example-schemas/README.md‎
Lines changed: 8 additions & 0 deletions b/‎example-schemas/README.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎example-schemas/digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csvs‎
Lines changed: 103 additions & 0 deletions b/‎example-schemas/digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csvs‎
Lines changed: 103 additions & 0 deletions
diff --git a/‎example-schemas/example-data/TEST_1/1/1/1_1_001.xml‎
Lines changed: 6 additions & 0 deletions b/‎example-schemas/example-data/TEST_1/1/1/1_1_001.xml‎
Lines changed: 6 additions & 0 deletions
@@ -9,6 +9,8 @@ is then published as HTML. The Schema language is formally expressed in EBNF.
 You can find the documentation and latest published specification here:
 	http://digital-preservation.github.io/csv-schema.
 
+* Examples of CSV Schemas can be found in the [`example-schemas`](https://github.com/digital-preservation/csv-schema/tree/master/example-schemas) folder.
+
 
 Repository Organisation
 -----------------------
@@ -19,5 +21,4 @@ Repository Organisation
 * There is one tag from master each time a version of the specification is published. The tag name reflects
 the specification version number.
 
-
-Released under the [Mozilla Public Licence version 2.0](http://www.mozilla.org/MPL/2.0/).
+Released under the [Mozilla Public Licence version 2.0](http://www.mozilla.org/MPL/2.0/).
@@ -0,0 +1,58 @@
+version 1.0
+@totalColumns 42
+/********************************************************************************
+*Schema:   ADM_362-technical-acquisition-with-minimal-transcription.csvs        *
+*Authors:  Nicki Welch                                                          *
+*          David Underdown                                                      *
+*Purpose:  To capture metadata about the digitisation of the ADM 362 series     *
+*          Primarily technical metadata, but with a minimal amount of           *
+*          transcription to verify that the records may be publicly released    *
+*          after receipt by The National Archives                               *
+*Revision: 1.0 first release                                                    *
+*          1.1 update as some official numbers only single digit                *
+*          1.2 allow M as official number prefix too                            *
+*          1.3 further additions to prefixes, L, S, SS, SSX                     *
+*          1.4 allow for * and ? in official number                             *
+********************************************************************************/
+batch_code: length(10) regex("^ADM362B([0-9]{3})$")
+department: (is("ADM") if($file_path/notEmpty,in($file_path) and in($resource_uri)))
+series: is("362") and if($file_path/notEmpty,in($file_path) and in($resource_uri))
+piece: range(1,69720) if($file_path/notEmpty,in($file_path) and in($resource_uri))
+item: ((positiveInteger unique($piece,$item,$ordinal)) or empty) if($file_path/notEmpty,in($file_path) and in($resource_uri))
+ordinal: if($item/empty,empty,unique($item,$ordinal))
+file_uuid: if($ordinal/empty,empty,uuid4 unique)
+file_path: uri if($ordinal/empty,empty,unique fileExists regex("^file:\/\/\/ADM_362\/[0-9]{1,5}\/[0-9]{1,5}\/[0-9]{1,4}_.+\.jp2$"))
+file_checksum: if($ordinal/empty,empty,checksum(file($file_path),"SHA-256"))
+resource_uri: if($ordinal/notEmpty,uri starts("http://datagov.nationalarchives.gov.uk/66/"))
+scan_operator: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z]{1,12}$"))
+scan_id: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z_]{1,12}$"))
+scan_location: if($ordinal/empty,empty,regex("[-\w\s,]+"))
+scan_native_format: if($ordinal/empty,empty,regex("[0-9\w\s,.:]+"))
+scan_timestamp: if($ordinal/empty,empty,xDateTime)
+image_resolution: if($ordinal/empty,empty,is("300"))
+image_width: if($ordinal/empty,empty,positiveInteger)
+image_height: if($ordinal/empty,empty,positiveInteger)
+image_tonal_resolution: if($ordinal/empty,empty,is("24-bit colour"))
+image_format: if($ordinal/empty,empty,is("x-fmt/392"))
+image_colour_space: if($ordinal/empty,empty,is("sRGB"))
+process_location: if($ordinal/empty,empty,regex("[-\w\s,]+"))
+jp2_creation_timestamp: if($ordinal/empty,empty,xDateTime)
+uuid_timestamp: if($ordinal/empty,empty,xDateTime)
+embed_timestamp: if($ordinal/empty,empty,xDateTime)
+image_split: if($ordinal/empty,empty,is("yes") or is("no"))
+image_split_other_uuid: if($ordinal/empty,empty,if($image_split/is("yes"),uuid4,is("")))
+image_split_operator: if($ordinal/empty,empty,if($image_split/is("yes"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")))
+image_split_timestamp: if($ordinal/empty,empty,if($image_split/is("yes"),xDateTime,is("")))
+image_crop: if($ordinal/empty,empty,is("auto") or is("manual") or is("none"))
+image_crop_operator: if($ordinal/empty,empty,if($image_crop/is("manual"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")))
+image_crop_timestamp: if($ordinal/empty,empty,if($image_crop/is("none"),empty,xDateTime))
+image_deskew: if($ordinal/empty,empty,is("yes") or is("no"))
+image_deskew_operator: if($ordinal/empty,empty,if($image_deskew/is("yes"),regex("^[0-9a-zA-Z]{1,12}$"),is("")))
+image_deskew_timestamp: if($ordinal/empty,empty,if($image_deskew/is("yes"),xDateTime,is("")))
+QA-code: regex("^[0-9/,]{1,2}$") @optional
+comments: regex("[\w\s,\.]+") @optional
+transcribed_volume_number: if($item/empty,regex("[0-9A-Z\-\s]{1,15}"),is(""))
+transcribed_birth_date_day:  if(($ordinal/empty and $item/notEmpty),regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$"),is("")) 
+transcribed_birth_date_month: if(($ordinal/empty and $item/notEmpty),is("*") or is("?") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December"), is(""))
+transcribed_birth_date_year: if(($ordinal/empty and $item/notEmpty),regex("^1[7-9][0-9\?]{2}|\*$"),is(""))
+transcribed_official_number: if(($ordinal/empty and $item/notEmpty),regex("^([FJKLMS]|SS|SSX)[/*/?0-9]{1,6}$"),is(""))
@@ -0,0 +1,58 @@
+version 1.0
+@totalColumns 42
+/********************************************************************************
+*Schema:   ADM_363-technical-acquisition-with-minimal-transcription.csvs        *
+*Authors:  Nicki Welch                                                          *
+*          David Underdown                                                      *
+*Purpose:  To capture metadata about the digitisation of the ADM 363 series     *
+*          Primarily technical metadata, but with a minimal amount of           *
+*          transcription to verify that the records may be publicly released    *
+*          after receipt by The National Archives                               *
+*Revision: 1.0 first release                                                    *
+*          1.1 update as some official numbers only single digit                *
+*          1.2 allow M as official number prefix too                            *
+*          1.3 further additions to prefixes, L, S, SS, SSX                     *
+*          1.4 allow for * and ? in official number                             *
+********************************************************************************/
+batch_code: length(10) regex("^ADM36[23]B([0-9]{3})$")
+department: (is("ADM") if($file_path/notEmpty,in($file_path) and in($resource_uri)))
+series: is("363") and if($file_path/notEmpty,in($file_path) and in($resource_uri))
+piece: range(1,69720) if($file_path/notEmpty,in($file_path) and in($resource_uri))
+item: ((positiveInteger unique($piece,$item,$ordinal)) or empty) if($file_path/notEmpty,in($file_path) and in($resource_uri))
+ordinal: if($item/empty,empty,unique($item,$ordinal))
+file_uuid: if($ordinal/empty,empty,uuid4 unique)
+file_path: uri if($ordinal/empty,empty,unique fileExists regex("^file:\/\/\/ADM_363\/[0-9]{1,5}\/[0-9]{1,5}\/[0-9]{1,4}_.+\.jp2$"))
+file_checksum: if($ordinal/empty,empty,checksum(file($file_path),"SHA-256"))
+resource_uri: if($ordinal/notEmpty,uri starts("http://datagov.nationalarchives.gov.uk/66/"))
+scan_operator: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z]{1,12}$"))
+scan_id: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z_]{1,12}$"))
+scan_location: if($ordinal/empty,empty,regex("[-\w\s,]+"))
+scan_native_format: if($ordinal/empty,empty,regex("[0-9\w\s,.:]+"))
+scan_timestamp: if($ordinal/empty,empty,xDateTime)
+image_resolution: if($ordinal/empty,empty,is("300"))
+image_width: if($ordinal/empty,empty,positiveInteger)
+image_height: if($ordinal/empty,empty,positiveInteger)
+image_tonal_resolution: if($ordinal/empty,empty,is("24-bit colour"))
+image_format: if($ordinal/empty,empty,is("x-fmt/392"))
+image_colour_space: if($ordinal/empty,empty,is("sRGB"))
+process_location: if($ordinal/empty,empty,regex("[-\w\s,]+"))
+jp2_creation_timestamp: if($ordinal/empty,empty,xDateTime)
+uuid_timestamp: if($ordinal/empty,empty,xDateTime)
+embed_timestamp: if($ordinal/empty,empty,xDateTime)
+image_split: if($ordinal/empty,empty,is("yes") or is("no"))
+image_split_other_uuid: if($ordinal/empty,empty,if($image_split/is("yes"),uuid4,is("")))
+image_split_operator: if($ordinal/empty,empty,if($image_split/is("yes"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")))
+image_split_timestamp: if($ordinal/empty,empty,if($image_split/is("yes"),xDateTime,is("")))
+image_crop: if($ordinal/empty,empty,is("auto") or is("manual") or is("none"))
+image_crop_operator: if($ordinal/empty,empty,if($image_crop/is("manual"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")))
+image_crop_timestamp: if($ordinal/empty,empty,if($image_crop/is("none"),empty,xDateTime))
+image_deskew: if($ordinal/empty,empty,is("yes") or is("no"))
+image_deskew_operator: if($ordinal/empty,empty,if($image_deskew/is("yes"),regex("^[0-9a-zA-Z]{1,12}$"),is("")))
+image_deskew_timestamp: if($ordinal/empty,empty,if($image_deskew/is("yes"),xDateTime,is("")))
+QA-code: regex("^[0-9/,]{1,2}$") @optional
+comments: regex("[\w\s,\.]+") @optional
+transcribed_volume_number: if($item/empty,regex("[0-9A-Z\-\s]{1,15}"),is(""))
+transcribed_birth_date_day:  if(($ordinal/empty and $item/notEmpty),regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$"),is("")) 
+transcribed_birth_date_month: if(($ordinal/empty and $item/notEmpty),is("*") or is("?") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December"), is(""))
+transcribed_birth_date_year: if(($ordinal/empty and $item/notEmpty),regex("^1[7-9][0-9\?]{2}|\*$"),is(""))
+transcribed_official_number: if(($ordinal/empty and $item/notEmpty),regex("^([FJKLMS]|SS|SSX)[/*/?0-9]{1,6}$"),is(""))
@@ -0,0 +1,8 @@
+CSV Schemas
+===========
+
+CSV Schemas expressed in the [CSV Schema Language](http://digital-preservation.github.io/csv-schema/csv-schema-1.0.html).
+
+CSV Schemas created by the Digital Preservation and Digital Repository Infrastructure teams at The National Archives will be added to this folder to make them available to digitisation partners and to serve as examples of the use of the CSV Schema Language.
+
+An initial example CSV can be found in the [`example-data`](http://github.com/digital-preservation/csv-schema/tree/master/example-schemas/example-data) folder, which relates to the xml files to be found in its subfolder TEST_1 and further subfolders.  This is designed to be validated against the schema [`digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csvs`](https://github.com/digital-preservation/csv-schema/blob/master/example-schemas/digitised_surrogate_tech_acq_metadata_v1_TESTBATCH000.csvs).  In a genuine digitisation project, the files described by the metadata CSV would be JPEG2000s, but these would tend to be quite large, so to make downloading more practical for demonstration purposes, we have supplied only the XML which would normally be embedded within the JPEG2000 file.
@@ -0,0 +1,103 @@
+version 1.0
+@totalColumns 27
+/***************************************************************************************************************
+*This schema is for the validation of technical acquisition metadata                                           *
+*csv files according to the specification given for digitised surrogates in                                    *
+*http://www.nationalarchives.gov.uk/documents/information-management/digitisation-at-the-national-archives.pdf *
+*This version is an example only, using "fake" values/ranges for department, division, series, sub_series,     *
+*sub_sub_series, piece and item. A specific format for batch_code is given though this reflects only the "fake"*
+*department code, rather than also reflecting the series reference as would usually be the case                *
+***************************************************************************************************************/
+/*The header of the schema file, ie the statements version 1.0 and @totalColumns 27, indicates that this schema
+  is using version 1.0 of the schema language (NB, not that it is version 1.0 of this particular schema),
+  and that there are 27 columns in total in the file.*/
+batch_code: starts("TESTBATCH") length(1,16) regex("^[0-9a-zA-Z]{1,16}$")
+  //batch_code must start with TESTBATCH and be between 1 and 16 characters long (multiple conditions are joined
+  //implicitly by a logical AND unless another boolean is provided). The regex restricts it to alphanumeric characters
+  //as specified in digitisation standards p 31. Would usually comprise project identifier (eg department and series),
+  //plus running count of batch number within that - in this case TESTBATCH followed by count (zero padded).
+department: is("TEST") regex("[A-Z]{1,4}") and (in($file_path) and in($resource_uri))
+  //Parentheses control evaluation order of booleans as might be expected
+  //The regex statement says that this field must consist of between 1 and 4 upper case alphabetic characters.
+  //The grouped "in" statements say that the value found in this field must also be found as part of the fields
+  //"file_path" and "resource_uri"
+division: is("")
+  //this field must be blank for this example
+series: is("1") positiveInteger and (in($file_path) and in($resource_uri))
+  //in general we expect this field will be a positive (non-zero) integer.  For this example, a specific
+  //value of 1 is given. The value must also be part of the fields "file_path" and "resource_uri"
+sub_series: positiveInteger or is("")
+  //this field must either be a positive integer or be blank (defined per project).  For this example, a mixture
+  //of values will be used for demo purposes (this would not be the case in a real project)
+sub_sub_series: is("")
+  //this field must be blank (defined per project)
+piece: range(1,3) positiveInteger and (in($file_path) and in($resource_uri))
+  //Generally this value will be a positive integer, rarely the piece reference may take a more complicated form
+  //which would be defined on a per project basis.
+  //Often the range of values for piece would be known, and so a statement such as range(1,3) etc might be used as
+  //in this example.
+  //The value must also be part of the fields "file_path" and "resource_uri"
+item: (positiveInteger and (in($file_path) and in($resource_uri))) or is("")
+  //Generally (if used) this value will be a positive integer, rarely the item reference may take a more
+  //complicated form which would be defined on a per project basis.
+  //The value must also be part of the fields "file_path" and "resource_uri"
+  //In many cases the item level is not used, so this would be left blank.
+  //for this example a mixture of blanks and integers is used (this is unlikely to be the case in a real project)
+file_uuid: uuid4 unique
+  //must be a version 4 uuid, and the value must be unique within the file.  uuids must be lower case.
+file_path: fileExists uri starts("file:///")
+  //fileExists checks that there is actually a file of the given name at the specified location on the file system.
+  //In practice, the validator will normally be run with the --path switch
+  //(see http://digital-preservation.github.io/csv-validator/)
+  //We also require that the path is a valid uri, and begins file:///
+  //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
+  //content of this field)
+file_checksum: checksum(file($file_path),"SHA-256")
+  //Compare the value given in this field to the checksum calculated for the file found at the location given in
+  //the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself).
+  //Use the specified checksum algorithm (must use lowercase hex characters).
+resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/")
+  //Must be a valid uri which starts with the specified string
+  //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the
+  //content of this field)
+scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
+  //Between 1 and 12 alphanumeric characters representing the identity of the scanning operator (the ability to decode
+  //this is restricted to the scanning company to avoid personally identifying data being held in the file)
+scan_id: length(1,12) regex("^[0-9a-zA-Z]{1,12}$")
+  //Like "scan_operator", but this code represents the actual scanner or camera used
+scan_location: regex("[-\w\s,.]+")
+  //Address or other description of the location where scanning physically occurred. The regex allows any number
+  //of characters, allows general word and whitespace characters plus hyphen, comma and full stop
+image_resolution: positiveInteger is("300")
+  //Always a positive (non-zero) integer, and in general explicitly 300.  Occasionally a higher resolution used.
+  //Depending how this is populated (whether nominal or actual resolution), it might be better to use a range
+  //eg range(298,302) to capture slight variances in resolution.
+image_width: positiveInteger
+  //Must be a positive (non-zero) integer.  If the size of the material being digitised is well understood could use
+  //a range check to ensure values are within a "sensible" range eg range(2400,2600) for A4 material - just over
+  //8" wide (portrait), plus border, and assuming 300 ppi
+image_height: positiveInteger
+  //Must be a positive (non-zero) integer.  If the size of the material being digitised is well understood could use
+  //a range check to ensure values are within a "sensible" range eg range(3450,3650) for A4 material - just over
+  //11.5" high (portrait), plus border, and assuming 300 ppi
+image_tonal_resolution: is("24-bit colour")
+  //must be string: 24-bit colour (precisely - case as shown).  Occasionally a different value might be specified.
+image_format: is("x-fmt/392")
+  //must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM
+  //(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392)
+image_compression: positiveInteger is("6")
+  //Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm
+  //available in the JPEG2000 specification
+image_colour_space: is("sRGB")
+  //must be string: sRGB (precisely - case as shown). Other colour spaces might be used for specific projects
+image_split: is("yes") or is("no")
+  //must be string: yes; or string: no (precisely - case as shown).  Used if eg an image of complete double page
+  //subsequently split into two separate images of each page individually
+image_split_other_uuid: if($image_split/is("yes"),uuid4,is(""))
+  //if "image_split" field is yes, must be a uuid4, else must be blank  (in certain circumstances it would be
+  //possible that this could be a list of uuids, in which case the conditions would have to be reworked)
+image_crop: is("auto") or is("manual") or is("none")
+  //must be string: auto; or string: manual or string: none (precisely - case as shown)
+image_deskew: is("yes") or is("no")
+  //must be string: yes; or string: no (precisely - case as shown)
+comments: regex("[\w\s,.]+") @optional
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="utf-8"?>
+<DigitalFile xmlns="http://nationalarchives.gov.uk/2012/dri/artifact/embedded/metadata" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+<UUID>5fe890e9-6650-46db-bc74-81985a4a9580</UUID>
+<URI>http://datagov.nationalarchives.gov.uk/66/TEST/1/1/1/5fe890e9-6650-46db-bc74-81985a4a9580</URI>
+<Copyright>&#169; Crown copyright: The National Archives of the UK</Copyright>
+</DigitalFile>