You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: example-schemas/transcription_metadata_v1.3_RG101B0000 - names, ages only.csvs
+3-1Lines changed: 3 additions & 1 deletion
Original file line number
Diff line number
Diff line change
@@ -28,8 +28,10 @@ house_name:
28
28
street_name:
29
29
schedule_no:
30
30
sub_schedule_no:
31
-
surname: (regex("^((((([dDL][\?aeiou]([- ]?))|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e( ?))))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([- ](((([dDL][\?aeiou]([- ])?)|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?([Dd]e( ?))))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,1})$") and regex("^(.)*\S$")) or is("???") or is("*") @warning @optional
31
+
surname: (regex("^((((([dDL][\?aeiou]([- ]?))|([dDAL](e)?\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e( ?))))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([- ](((([dDL][\?aeiou]([- ])?)|([dDAL]\')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?([Dd]e( ?))))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,1})$") and regex("^(.)*\S$")) or is("???") or is("*") @warning @optional
32
32
surname_other: (regex("^((((([dDL][\?aeiou]([- ]?))|([dDAL](e?)\')|([dD]e([- ]?)[lL]a([- ]?))|(St(e?[- ]?))|([Vv][\?ao]n( ?)([Dd]e( ?))))|(M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15})([-, ](((([dDL][\?aeiou]([- ])?)|([dDAL]')|([dD]e([- ])?[lL]a([- ])?)|(St(e?[- ]?))|([Vv][\?ao]n( )?([Dd]e( ?))))|((M[\?a]?[\?c]|M\'|O\'))?[\?A-Z][\?a-z]{2,15}))){0,3})$") and regex("^(.)*\S$")) or is("???") or is("*") @warning @optional
33
+
//The regex used for both surname fields is identical, both are set as optional, so the check only applies if something is in the field, the other schema checks that the field is only filled when it should be
34
+
//The surname is divided into an optional prefix, eg Mc, Mac, De, De La, with varied capitalisation, with or without space or hyphen to separate it from the mandatory part of the name which must begin with a capital letter, and then be lowercase only, then we allow for a single additional "barrel" (with identical formatting rules), separated by a space or hyphen. Names with more barrels are sufficiently uncommon that it's probably worth double checking that one part isn't actually a middle name, or an amendment that's not been properly captured
33
35
forenames: regex("^(Rev\: )?(M[\?a]?[\?c]|M\'|O\')?[\?A-Z][\?a-z]{0,15}([- ]((M[\?a]?[\?c]|M\'|O\')?[\?A-Zdv][\?a-z]{0,15}))*( M\.A\.| B\.A\.)?$") or is("???") or is("*") @optional @warning
34
36
forenames_other: regex("^(Rev\: )?(M[\?a]?[\?c]|M\'|O\')?[\?A-Z][\?a-z]{0,15}([-, ]((M[\?a]?[\?c]|M\'|O\')?[\?A-Zdv][\?a-z]{0,15}))*( M\.A\.| B\.A\.)?$") or is("???") or is("*") @optional @warning
//Schema updates: brackets, hyphen in street name, extend house number to allow eg 112-111, question marks in house number
4
+
//for continuations - by definition schedule/subsched should already have appeared.
5
+
/*Updates following meeting 24/7. Allow for non-numeric schedule refs where duplicates are distinguished in original by addition of a-f or A-F as a suffix,
6
+
or where we have had to distinguish duplicates ourselves which will add suffix of form DUPnn (if the value is an integer the previous range will still be checked.)
7
+
Allow additional punctuation in street_name - instructions should already be OK as the length check should allow any string characters */
8
+
/*For general understanding of the data, a piece is an individual booklet of registration data (these were usually bound into a volume after the register was completed), compiled by an
9
+
enumerator from the household returns for a specific area. An item is an individual page from the registers, this may be a cover page (ITEM_TITLEPAGE) which contains various summary
10
+
information about the area covered, or a register page (ITEM_MAINPAGE or ITEM_CONPAGE) which has header info relating to the place, then around 40 or so lines filled to some degree with
11
+
individual rows of data about each person in a household (or a brief entry for vacant houses and similar) these are the various SUBITEM types*/
12
+
//In the pure transcription fields we allow the use of ? to represent a single unreadable character, ??? for a completely unreadable word, * for a field left blank that would be expected to be filled
//batch_code is not on sub_item rows, but is on all other row types, it comprises the string RG101 to identify the project, followed by B for batch, then a running number (starting from one, padded to 4 digits with leading zeroes as required)
//All types of row have the piece value, which has between 1 and 5 numeric characters, followed by an upper case alphabetic value. For item level rows, we also check that the piece value appears in the file_path field
//For item and sub_item rows (ie anything that's not a piece level row), item should have a numeric value in the range 1-500 (inclusive) and the combination of piece reference, item reference and sub_item reference should be unique. For piece level rows this field should be blank
//For sub_item rows (ie the transcription of an individual row on a register page) this field should have a numeric value in the range 1-44 (inclusive), and we reiterate the requirement for the combination of piece reference, item reference and sub_item reference to be unique. For piece and item rows, this field should be blank
25
+
metadata_type: is("PIECE") or is("ITEM") or is("ITEM_TITLEPAGE") or is("ITEM_MAINPAGE") or is("ITEM_CONPAGE") or is("SUBITEM_NAME") or is("SUBITEM_CONNAME") or is("SUBITEM_VACANT") or is("SUBITEM_BLANK") or is("SUBITEM_REMOVED") or is("SUBITEM_QADDRESS")
26
+
//This check prescribes the acceptable values for metadata_type (this value is then used by most of the rest of the checks to define acceptable values for that type of metadata row)
27
+
file_path: if($metadata_type/starts("ITEM_"), uri and regex("^file:\/\/\/RG_101\/[0-9]{1,5}[A-Z]\/[0-9]{1,3}\/[0-9]{1,5}[A-Z]_[0-9]{1,3}_[0-9]{1,4}.+\.jp2$") and unique fileExists, is(""))
28
+
//For item rows we expect to have the JP2 image, this should be given in the form of a URI, which is of the form file:///RG_101/{piece}/{item}/{piece}_{item}_0001.jp2 (values in {} are those for the appropriate field of the same name), the file_path must also be unique, and we check that the file actually exists (so a path substitution will be required)
29
+
page_number: if($metadata_type/regex("(ITEM_MAINPAGE)|(ITEM_CONPAGE)"), regex("^([0-9a-zA-Z]{1,4})$") or is("missing"), is(""))
30
+
//For actual register pages with entries on them (as opposed to cover pages), we should have a page_number, transcribed from the top right hand corner of the page (lost in redaction), 1-4 alphanumeric characters are allowed (as often additional pages were added in later and so on), or the value "missing" if the page number was not present for some reason
31
+
volume_number: if($metadata_type/is("PIECE"), regex("^[A-Z]{4}\s\-\s[A-Z]{4}$") or regex("^[A-Z]{4}\s\-\s[A-Z]{3}$") or regex("^[A-Z]{2}\s\([A-Z]{2}\)$") or regex("^[0-9]{0,5}(\s){0,1}[A-Z]{2}(\s){0,1}\([A-Z]{2}(\s){0,1}\-(\s){0,1}[A-Z]{2,3}\)$") or regex("^[0-9]{0,5}(\s)?[A-Z]{2}\([A-Z]{2}\)(\s)?\-(\s)?[A-Z]{2}\([A-Z]{2}\)$"), is(""))
32
+
//Piece level rows should have a volume_number derived from the original label on the spine (in tech_acq the name of this field seems to have changed to transcribed_volume_code ??) - keep an eye out for how this is given in the metadata files, may need to change name here too.
33
+
file_uuid: if($metadata_type/starts("ITEM_"), uuid4 and unique, is(""))
34
+
//For item level rows (which describe the actual image file) we should have the file_uuid, a version 4 UUID
//This field should record the UUID of the image on which there is a continuation entry for an individual sub_item (ie individual row on a register page)
//The various date fields allow us to construct a covering date for this information - we are using just the date on which the original register was compiled, and not making any attempt to capture when the data was subsequently updated when the register was in use
//County is data derived by DCTFH based on the enumeration district. We are merely checking that there are at least 3 alphanumeric characters here, we are not attempting to introduce a controlled vocab
//house_number should be blank except at sub-item level. Where an address is populated, which is every type of sub_item except SUBITEM_BLANK which represents an entirely blank row, at least one of house_number and house_name must be populated. House number is not strictly numeric, obviously things like 1A are not uncommon, and we've also found various instances of punctuation etc, so these are allowed
//Like house_number, for sub_item rows other than SUBITEM_BLANK, at least one of house_number and house_name must be populated (having both is permitted). The only real restriction on a name is that it must have at least two characters
//Again for sub_items other than SUBITEM_BLANK this field must have at least three characters of any sort (though in fact it's optional, as not everywhere has named streets, so only a field with only 1 or 2 characters would actually produce an error!)
//Within each enumeration district the enumerator numbered the schedules issued to each individual household beginning at 1. Occasionally duplicate schedule numbers were accidentally created, these were corrected at the time by adding an alphabetic suffix, or some have only just been found where we distinguish by adding DUPnn (where nn is a numeric with 1 or 2 digits)
62
+
sub_schedule_no: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_QADDRESS)"), range(1,613) and unique($piece,$schedule_no,$sub_schedule_no), if($metadata_type/is("SUBITEM_CONNAME"),range(1,613),is("")))
63
+
//Within each individual household, each individual person was given a sub_schedule_no - this only applied to SUBITEM types where individuals actually lived there, so if the property was vacant etc, this field will be blank. Normally we expect that the combination of piece, schedule_no and sub_schedule_no should be unique, but if it's a continuation sub_item then by definition that combination exists elsewhere
64
+
surname: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or is("???") or length(1,*), is(""))
65
+
//Here we just check that something is filled in for surname for relevant SUBITEM types - see separate schema for some more detailed checking
66
+
surname_other: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or is("???") or length(1,*), is("")) @optional
67
+
//Here if a surname has been amended at some point this field should be populated
68
+
forenames: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or is("???") or length(1,*), is(""))
69
+
//Here we just check that something is filled in for forenames for relevant SUBITEM types - see separate schema for some more detailed checking
70
+
forenames_other: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or is("???") or length(1,*), is("")) @optional
71
+
//Here if forenames have been amended at some point this field should be populated
72
+
ovspi: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("Officer") or is("Visitor") or is("Servant") or is("Patient") or is("Inmate") or is("?") or is("*"), is(""))
73
+
//Records the data recorded in one of the original form columns, expanded from the single character originally used according to the detailed instructions to enumerators, again only required for appropriate surname types
74
+
gender: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("male") or is("female") or is("*"), is("")) //For SUBITEM types that describe a person, gender must be male, female or * (left blank in the original); for every other row type the field must be empty, as for the other per-person fields
75
+
birth_date_day: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), regex("^(\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?]))$"), is("")) //Day of birth as two characters (01-31, with ? standing in for an unreadable digit) or * if left blank. Single digit days need a leading 0, so validation will fail if the csv template has been opened in Excel, which removes it. The alternatives are wrapped in one group so that ^ and $ anchor every alternative, not just the first and last
76
+
birth_date_month: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or is("?") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December"), is(""))
//dates are broken down into 3 separate fields for ease of parsing
79
+
marital_status: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or is("?") or is("single") or is("married") or is("widowed") or is("divorced"), is(""))
80
+
//Like OVSPI, expanded from the single character used in the original register to the full word the character represents
81
+
occupation: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or length(3,*), is(""))
82
+
//a fairly freeform field as no sort of controlled vocab was used in the register
83
+
instructions: if($metadata_type/regex("(SUBITEM_NAME)|(SUBITEM_CONNAME)|(SUBITEM_QADDRESS)"), is("*") or length(3,*) or is("continuation") or regex("^([0-9a-zA-Z]{1,4})$") , is("")) @optional
84
+
//range(1,200) replaced by the regex to allow refs to pages in other registers, this field also records membership of Home Guard, Auxiliary Fire Service, ARP etc
//It was believed that there was potential for some records to show that a person had been adopted, if this does happen, it should be recorded in this field and the relevant line redacted in perpetuity
87
+
legal_status: is("Public Record")
88
+
//Fixed value
89
+
held_by: is("The National Archives, Kew")
90
+
//fixed value
91
+
comments:
92
+
//Free entry field to record anything else of use or note
0 commit comments