Skip to content

Commit 91a2b76

Browse files
author
DavidUnderdown
committed
updated schemas various tweaks based on live data
1 parent cd6cd49 commit 91a2b76

File tree

3 files changed

+23
-18
lines changed

3 files changed

+23
-18
lines changed

example-schemas/transcription_metadata_v1.3_RG101B0000 - names, ages only.csvs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ last_date_day:
1919
last_date_month:
2020
last_date_year:
2121
county:
22-
ed_letter_code:
22+
ED_letter_code:
2323
borough:
2424
registration_district:
2525
sub_district:
@@ -29,19 +29,20 @@ street_name:
2929
schedule_no:
3030
sub_schedule_no:
3131
surname: (regex("^((((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e( ?))))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([- ](((([dDL][\?aeiou]([- ])?)|([dDAL]\')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?([Dd]e( ?))))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,1})$") and regex("^(.)*\S$")) or is("???") or is("*") @warning @optional
32-
surname_other: (regex("^((((([dDL][\?aeiou]([- ]?))|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e( ?))))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([-, ](((([dDL][\?aeiou]([- ])?)|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?([Dd]e( ?))))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,3})$") and regex("^(.)*\S$")) or is("???") or is("*") @warning @optional
32+
//ORIGINAL CODE surname_other: (regex("^((((([dDL][\?aeiou]([- ]?))|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e( ?))))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([-, ](((([dDL][\?aeiou]([- ])?)|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?([Dd]e( ?))))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,3})$") and regex("^(.)*\S$")) or is("???") or is("*") @warning @optional
33+
surname_other: (regex("^((((([dDL][\?aeiou]([- ]?))|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e( ?))))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([-, ,\,](((([dDL][\?aeiou]([- ])?)|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?([Dd]e( ?))))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,3})$") and regex("^(.)*\S$")) or is("???") or is("*") @warning @optional //added acceptance of , between surnames : [-, ,\,]
3334
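//The regex used for both surname fields is essentially the same (surname_other additionally accepts a comma as a separator between names and allows up to three additional parts rather than one), both are set as optional, so the check only applies if something is in the field, the other schema checks that the field is only filled when it should be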
3435
//The surname is divided into an optional prefix, eg Mc, Mac, De, De La, with varied capitalisation, with or without space or hyphen to separate it from the mandatory part of the name which must begin with a capital letter, and then be lowercase only, then we allow for a single additional "barrel" (with identical formatting rules), separated by a space or hyphen. Names with more barrels are sufficiently uncommon that it's probably worth double checking that one part isn't actually a middle name, or an amendment that's not been properly captured
3536
forenames: regex("^(Rev\: )?(M[\?a]?[\?c]|M\'|O\')?[\?A-Z][\?a-z]{0,15}([- ]((M[\?a]?[\?c]|M\'|O\')?[\?A-Zdv][\?a-z]{0,15}))*( M\.A\.| B\.A\.)?$") or is("???") or is("*") @optional @warning
3637
forenames_other: regex("^(Rev\: )?(M[\?a]?[\?c]|M\'|O\')?[\?A-Z][\?a-z]{0,15}([-, ]((M[\?a]?[\?c]|M\'|O\')?[\?A-Zdv][\?a-z]{0,15}))*( M\.A\.| B\.A\.)?$") or is("???") or is("*") @optional @warning
37-
ovspi:
38+
OVSPI:
3839
gender:
3940
birth_date_day:
4041
birth_date_month:
41-
birth_date_year: if(positiveInteger,if($marital_status/is("*") or $marital_status/is("?") or $marital_status/is("single"),range(1845,1939),if($marital_status/is("married") or $marital_status/is("divorced") or $marital_status/is("widowed"),range(1845,1923))))
42+
birth_date_year: if(positiveInteger,if($marital_status/is("*") or $marital_status/is("?") or $marital_status/is("single"),range(1845,1939),if($marital_status/is("married") or $marital_status/is("divorced") or $marital_status/is("widowed"),range(1840,1923)))) //changed from range(1845,1923)
4243
marital_status:
4344
occupation:
44-
instructions:
45+
refers_to:
4546
sensitive_annotation:
4647
legal_status:
4748
held_by:

example-schemas/transcription_metadata_v1.3_RG101B0000 - with file exists.csvs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ sub_item: if($metadata_type/starts("SUBITEM_"), range(1,44) unique($piece, $item
2525
metadata_type: is("PIECE") or is("ITEM") or is("ITEM_TITLEPAGE") or is("ITEM_MAINPAGE") or is("ITEM_CONPAGE") or is("SUBITEM_NAME") or is("SUBITEM_CONNAME") or is("SUBITEM_VACANT") or is("SUBITEM_BLANK") or is("SUBITEM_REMOVED") or is("SUBITEM_QADDRESS")
2626
//This check prescribes the acceptable values for metadata_type (this value is then used by most of the rest of the checks to define acceptable values for that type of metadata row)
2727
file_path: if($metadata_type/starts("ITEM_"), uri and regex("^file:\/\/\/RG_101\/[0-9]{1,5}[A-Z]\/[0-9]{1,3}\/[0-9]{1,5}[A-Z]_[0-9]{1,3}_[0-9]{1,4}.+\.jp2$") and unique fileExists, is(""))
28-
//For item rows we expect to have the JP2 image, this should be given in the form of a URI, which is of the form file:///RG_101/{piece}/{item}/{piece}_{item}_0001.jp2 (values in {} are those for the appropriate field of the same name), the file_path must also be unique, and we check that the file actually exists (so a path substitution will be required)
28+
//For item rows we expect to have the JP2 image, this should be given in the form of a URI, which is of the form file:///RG_101/{piece}/{item}/{piece}_{item}_0001.jp2 (values in {} are those for the appropriate field of the same name), the file_path must also be unique
2929
page_number: if($metadata_type/regex("(ITEM_MAINPAGE)|(ITEM_CONPAGE)"), regex("^([0-9a-zA-Z]{1,4})$") or is("missing"), is(""))
3030
//For actual register pages with entries on them (as opposed to cover pages), we should have a page_number, transcribed from the top right hand corner of the page (lost in redaction), 1-4 alphanumeric characters are allowed (as often additional pages were added in later and so on), or the value "missing" if the page number was not present for some reason
3131
volume_number: if($metadata_type/is("PIECE"), regex("^[A-Z]{4}\s\-\s[A-Z]{4}$") or regex("^[A-Z]{4}\s\-\s[A-Z]{3}$") or regex("^[A-Z]{2}\s\([A-Z]{2}\)$") or regex("^[0-9]{0,5}(\s){0,1}[A-Z]{2}(\s){0,1}\([A-Z]{2}(\s){0,1}\-(\s){0,1}[A-Z]{2,3}\)$") or regex("^[0-9]{0,5}(\s)?[A-Z]{2}\([A-Z]{2}\)(\s)?\-(\s)?[A-Z]{2}\([A-Z]{2}\)$"), is(""))
@@ -43,7 +43,7 @@ last_date_year: if($metadata_type/is("PIECE"), is("1939"), is(""))
4343
//The various date fields allow us to construct a covering date for this information - we are using just the date on which the original register was compiled, and not making any attempt to capture when the data was subsequently updated when the register was in use
4444
county: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_VACANT)|(SUBITEM_CONNAME)|(SUBITEM_REMOVED)|(SUBITEM_QADDRESS)|(PIECE)"), regex("^[a-zA-Z ]{3,}$"), is(""))
4545
//County is data derived by DCTFH based on the enumeration district. We are merely checking that there are at least 3 characters here (letters and spaces only), we are not attempting to introduce a controlled vocab
46-
ed_letter_code: if($metadata_type/is("PIECE"), regex("^[A-Z]{4,5}$"), is(""))
46+
ED_letter_code: if($metadata_type/is("PIECE"), regex("^[A-Z]{4,5}$"), is(""))
4747
//each enumeration district (ed) was assigned a four or five letter code
4848
borough: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_VACANT)|(SUBITEM_CONNAME)|(SUBITEM_REMOVED)|(SUBITEM_QADDRESS)|(PIECE)"), regex("^[a-zA-Z \.]{3,}$"), is(""))
4949
//The borough should be recorded at the start of each register booklet (PIECE), and for search purposes is then copied down into the SUBITEMs
@@ -56,11 +56,11 @@ house_number: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_VACANT)|(SUBITEM_
5656
house_name: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_VACANT)|(SUBITEM_CONNAME)|(SUBITEM_REMOVED)|(SUBITEM_QADDRESS)"), if(not(""), length(2, *), $house_number/not("")), is(""))
5757
//Like house_number, for sub_item rows other than SUBITEM_BLANK, at least one of house_number and house_name must be populated (having both is permitted). The only real restriction on a name is that it must have at least two characters
5858
street_name: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_VACANT)|(SUBITEM_CONNAME)|(SUBITEM_REMOVED)|(SUBITEM_QADDRESS)"), regex("^[0-9A-Za-z, \-\(\)\?'\.\\]{3,}$"), is("")) @optional
59-
//Again for sub_items other than SUBITEM_BLANK this field must have at least three chracters of any sort (though in fact it's optional, as not everywhere has named streets, so only a field with only 1 or 2 characters would actually produce an error!)
59+
//Again for sub_items other than SUBITEM_BLANK this field must have at least three characters of any sort (though in fact it's optional, as not everywhere has named streets, so only a field with only 1 or 2 characters would actually produce an error!)
6060
schedule_no: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_VACANT)|(SUBITEM_CONNAME)|(SUBITEM_REMOVED)|(SUBITEM_QADDRESS)"), if(positiveInteger,range(1,619),regex("^[1-9][0-9]{0,2}(([a-fA-F])|(DUP[0-9]{1,2}))$")),is(""))
6161
//Within each enumeration district the enumerator numbered the schedules issued to each individual household beginning at 1. Occasionally duplicate schedule numbers were accidentally created, these were corrected at the time by adding an alphabetic suffix, or some have only just been found where we distinguish by adding DUPnn (where nn is a numeric with 1 or 2 digits)
6262
sub_schedule_no: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_QADDRESS)"), range(1,613) and unique($piece,$schedule_no,$sub_schedule_no), if($metadata_type/is("SUBITEM_CONNAME"),range(1,613),is("")))
63-
//Within each individual household, each individual person was given a sub_schedule_no - this only applied to SUBITEM types where individuals actaully lived there, so if the property was vacant etc, this field will be blank. Normally we expect that the combination of piece, schedule_no and sub_schedule_no should be unique, but if it's a continuation sub_item then by definition that combination exists elsewhere
63+
//Within each individual household, each individual person was given a sub_schedule_no - this only applied to SUBITEM types where individuals actually lived there, so if the property was vacant etc, this field will be blank. Normally we expect that the combination of piece, schedule_no and sub_schedule_no should be unique, but if it's a continuation sub_item then by definition that combination exists elsewhere
6464
surname: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or is("???") or length(1,*), is(""))
6565
//Here we just check that something is filled in for surname for relevant SUBITEM types - see separate schema for some more detailed checking
6666
surname_other: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or is("???") or length(1,*), is("")) @optional
@@ -69,7 +69,7 @@ forenames: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QA
6969
//Here we just check that something is filled in for forenames for relevant SUBITEM types - see separate schema for some more detailed checking
7070
forenames_other: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or is("???") or length(1,*), is("")) @optional
7171
//Here if forenames have been amended at some point this field should be populated
72-
ovspi: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("Officer") or is("Visitor") or is("Servant") or is("Patient") or is("Inmate") or is("?") or is("*"), is(""))
72+
OVSPI: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("Officer") or is("Visitor") or is("Servant") or is("Patient") or is("Inmate") or is("?") or is("*"), is(""))
7373
//Records the data recorded in one of the original form columns, expanded from the single character originally used according to the detailed instructions to enumerators, again only required for appropriate SUBITEM types
7474
gender: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("male") or is("female") or is("*"))
7575
birth_date_day: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$"), is("")) //validator will fail if you open csv template as expecting single digit days to have a leading 0 and Excel removes it
@@ -80,10 +80,12 @@ marital_status: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBIT
8080
//Like OVSPI, expanded from the single character used in the original register to the full word the character represents
8181
occupation: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or length(3,*), is(""))
8282
//a fairly freeform field as no sort of controlled vocab was used in the register
83-
instructions: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or length(3,*) or is("continuation") or regex("^([0-9a-zA-Z]{1,4})$") , is("")) @optional
83+
refers_to: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("continuation") or regex("^([0-9a-zA-Z]{1,4})$") , is("")) @optional
84+
//range(1,200) replaced by the regex to allow refs to pages in other registers, this field also records membership of Home Guard, Auxiliary Fire Service, ARP etc - added by AG 8th Oct 2015
85+
//code for instructions: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or length(3,*) or is("continuation") or regex("^([0-9a-zA-Z]{1,4})$") , is("")) @optional
8486
//range(1,200) replaced by the regex to allow refs to pages in other registers, this field also records membership of Home Guard, Auxiliary Fire Service, ARP etc
8587
sensitive_annotation: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("Adopted"), is("")) @optional
86-
//It was believed that there was potential for some records to show that a person had been adopted, if this does happen, it should be recorded in this field and the relevant line redacted in perpertuity
88+
//It was believed that there was potential for some records to show that a person had been adopted, if this does happen, it should be recorded in this field and the relevant line redacted in perpetuity
8789
legal_status: is("Public Record")
8890
//Fixed value
8991
held_by: is("The National Archives, Kew")

0 commit comments

Comments
 (0)