1- # Convert a internal TIND CaltechDATA record into a DataCite 4 standard schema json record
1+ # Convert an internal TIND CaltechDATA record into a DataCite 4 or 4.3 standard schema JSON record
22import json
33import argparse
44
55
def decustomize_schema(
    json_record, pass_emails=False, pass_media=False, pass_owner=False,
    schema='4'
):
    """Convert a customized TIND CaltechDATA record to a DataCite record.

    This is the public entry point; it only selects the converter that
    matches the requested DataCite schema version and forwards all other
    arguments to it unchanged.

    Args:
        json_record: The customized record (a dict) to convert.
        pass_emails: Forwarded to the version-specific converter.
        pass_media: Forwarded to the version-specific converter.
        pass_owner: Forwarded to the version-specific converter.
        schema: Target schema version, '4' (default) or '43'.

    Returns:
        Whatever the selected version-specific converter returns.

    Raises:
        ValueError: If ``schema`` is neither '4' nor '43'.
    """
    if schema == '4':
        return decustomize_schema_4(
            json_record, pass_emails, pass_media, pass_owner
        )
    if schema == '43':
        # The 4.3 request must reach the 4.3 converter; previously this
        # branch called the schema 4 converter as well.
        return decustomize_schema_43(
            json_record, pass_emails, pass_media, pass_owner
        )
    raise ValueError(f'Error: schema {schema} not defined')
918
19+ def decustomize_standard (json_record , pass_emails , pass_media , pass_owner ):
1020 # Extract subjects to single string
1121 if "subjects" in json_record :
1222 if isinstance (json_record ["subjects" ], str ):
@@ -21,18 +31,6 @@ def decustomize_schema(
2131 array .append ({"subject" : s })
2232 json_record ["subjects" ] = array
2333
24- # Extract identifier and label as DOI
25- if "doi" in json_record :
26- doi = json_record ["doi" ]
27- json_record ["identifier" ] = {
28- "identifier" : json_record ["doi" ],
29- "identifierType" : "DOI" ,
30- }
31- del json_record ["doi" ]
32- #Fail out if a DOI is not present
33- #else:
34- # raise ValueError(f'Error: Record does not have a DOI {json_record}')
35-
3634 # Extract title
3735 if "title" in json_record :
3836 json_record ["titles" ] = [{"title" : json_record ["title" ]}]
@@ -64,6 +62,151 @@ def decustomize_schema(
6462 json_record ["relatedIdentifiers" ] = [relation ]
6563 del json_record ["publications" ]
6664
65+ # format
66+ if "format" in json_record :
67+ if isinstance (json_record ["format" ], list ):
68+ json_record ["formats" ] = json_record .pop ("format" )
69+ else :
70+ json_record ["formats" ] = [json_record .pop ("format" )]
71+
72+ # dates
73+ datetypes = set ()
74+ # Save set of types for handling publicationDate
75+ if "relevantDates" in json_record :
76+ dates = json_record ["relevantDates" ]
77+ for d in dates :
78+ d ["date" ] = d .pop ("relevantDateValue" )
79+ d ["dateType" ] = d .pop ("relevantDateType" )
80+ datetypes .add (d ["dateType" ])
81+ json_record ["dates" ] = json_record .pop ("relevantDates" )
82+
83+ # Set publicationYear and save publicationDate
84+ if "publicationDate" in json_record :
85+ # If 'Issued' date type was not manually set in metadata
86+ # the system created publicationDate is correct
87+ if "Issued" not in datetypes :
88+ if "dates" in json_record :
89+ json_record ["dates" ].append (
90+ {"date" : json_record ["publicationDate" ], "dateType" : "Issued" }
91+ )
92+ else :
93+ json_record ["dates" ] = [
94+ {"date" : json_record ["publicationDate" ], "dateType" : "Issued" }
95+ ]
96+ year = json_record ["publicationDate" ].split ("-" )[0 ]
97+ json_record ["publicationYear" ] = year
98+ # Otherwise pick 'Issued' date for publicationYear
99+ else :
100+ for d in json_record ["dates" ]:
101+ if d ["dateType" ] == "Issued" :
102+ year = d ["date" ].split ("-" )[0 ]
103+ json_record ["publicationYear" ] = year
104+
105+ del json_record ["publicationDate" ]
106+
107+ else :
108+ print ("No publication date set - something is odd with the record " , json_record )
109+
110+
def _affiliations_43(person, affiliation_key):
    """Build the ``affiliation`` list for one author or contributor entry.

    ``person[affiliation_key]`` may hold a single value or a list of values;
    each value becomes one ``{"name": value}`` entry.
    """
    raw = person[affiliation_key]
    if not isinstance(raw, list):
        raw = [raw]
    affiliations = []
    for aff in raw:
        entry = {"name": aff}
        # NOTE(review): the ROR is looked up on the person entry (as the
        # original code did) and copied to every affiliation - confirm that
        # this is where the customized record stores it.
        if "ROR" in person:
            entry["ROR"] = person["ROR"]
        affiliations.append(entry)
    return affiliations


def _name_identifiers_43(raw_ids, id_key, scheme_key):
    """Rename customized identifier dicts to ``nameIdentifier`` entries.

    ``id_key`` is required in every element of ``raw_ids``; ``scheme_key`` is
    copied to ``nameIdentifierScheme`` only when it is present.
    """
    converted = []
    for raw in raw_ids:
        entry = {"nameIdentifier": raw[id_key]}
        if scheme_key in raw:
            entry["nameIdentifierScheme"] = raw[scheme_key]
        converted.append(entry)
    return converted


def decustomize_schema_43(json_record, pass_emails, pass_media, pass_owner):
    """Convert a customized record into the DataCite 4.3 layout.

    Runs the shared transformations via ``decustomize_standard`` and then
    applies the 4.3-specific ones: the DOI is moved into an ``identifiers``
    list, ``authors`` become ``creators`` and ``contributors`` are rebuilt
    with ``name``/``nameIdentifiers``/``affiliation`` keys.

    Args:
        json_record: The customized record (a dict); it is modified in place.
        pass_emails: When true, ``contributorEmail`` is copied to the
            converted contributor entries.
        pass_media: Only forwarded to ``decustomize_standard``.
        pass_owner: Only forwarded to ``decustomize_standard``.

    Returns:
        The converted record.

    Raises:
        KeyError: If an author lacks ``authorName``, a contributor lacks a
            name, or an identifier entry lacks its identifier value.
    """
    # Do standard transformations. The helper works on the dict in place, so
    # keep using the original object if it does not hand the record back.
    standardized = decustomize_standard(
        json_record, pass_emails, pass_media, pass_owner
    )
    if standardized is not None:
        json_record = standardized

    # Extract identifier and label as DOI. `doi` is also used in the warning
    # messages below, so it must be bound even when the record has no DOI.
    doi = None
    if "doi" in json_record:
        doi = json_record["doi"]
        json_record["identifiers"] = [
            {"identifier": doi, "identifierType": "DOI"}
        ]
        del json_record["doi"]

    # change author formatting
    if "authors" in json_record:
        creators = []
        for author in json_record["authors"]:
            creator = {}
            if "authorAffiliation" in author:
                creator["affiliation"] = _affiliations_43(
                    author, "authorAffiliation"
                )
            if "authorIdentifiers" in author:
                if isinstance(author["authorIdentifiers"], list):
                    creator["nameIdentifiers"] = _name_identifiers_43(
                        author["authorIdentifiers"],
                        "authorIdentifier",
                        "authorIdentifierScheme",
                    )
                else:
                    print("Author identifiers not an array - please check", doi)
            creator["name"] = author["authorName"]
            creators.append(creator)
        json_record["creators"] = creators
        del json_record["authors"]

    # contributors
    if "contributors" in json_record:
        converted_contributors = []
        for contributor in json_record["contributors"]:
            converted = {}
            if "contributorAffiliation" in contributor:
                converted["affiliation"] = _affiliations_43(
                    contributor, "contributorAffiliation"
                )
            if "contributorIdentifiers" in contributor:
                if isinstance(contributor["contributorIdentifiers"], list):
                    converted["nameIdentifiers"] = _name_identifiers_43(
                        contributor["contributorIdentifiers"],
                        "contributorIdentifier",
                        "contributorIdentifierScheme",
                    )
                else:
                    print(
                        "Contributor identifier not an array - please check",
                        doi,
                    )
            # NOTE(review): the original read only "creatorName" here; the
            # sibling keys all use the "contributor" prefix, so
            # "contributorName" is tried first - confirm the real key name.
            if "contributorName" in contributor:
                converted["name"] = contributor["contributorName"]
            else:
                converted["name"] = contributor["creatorName"]
            # DataCite requires a contributorType; keep it when supplied.
            if "contributorType" in contributor:
                converted["contributorType"] = contributor["contributorType"]
            if pass_emails and "contributorEmail" in contributor:
                converted["contributorEmail"] = contributor["contributorEmail"]
            converted_contributors.append(converted)
        json_record["contributors"] = converted_contributors

    return json_record
195+
196+
197+ def decustomize_schema_4 (json_record , pass_emails , pass_media , pass_owner ):
198+ #Do standard transformations
199+ json_record = decustomize_standard (json_record , pass_emails , pass_media , pass_owner )
200+
201+ # Extract identifier and label as DOI
202+ if "doi" in json_record :
203+ doi = json_record ["doi" ]
204+ json_record ["identifier" ] = {
205+ "identifier" : json_record ["doi" ],
206+ "identifierType" : "DOI" ,
207+ }
208+ del json_record ["doi" ]
209+
67210 # change author formatting
68211 # Could do better with multiple affiliations
69212 if "authors" in json_record :
@@ -119,58 +262,6 @@ def decustomize_schema(
119262 if pass_emails == False :
120263 if "contributorEmail" in c :
121264 del c ["contributorEmail" ]
122- # format
123- if "format" in json_record :
124- if isinstance (json_record ["format" ], list ):
125- json_record ["formats" ] = json_record .pop ("format" )
126- else :
127- json_record ["formats" ] = [json_record .pop ("format" )]
128-
129- # dates
130- datetypes = set ()
131- # Save set of types for handling publicationDate
132- if "relevantDates" in json_record :
133- dates = json_record ["relevantDates" ]
134- for d in dates :
135- d ["date" ] = d .pop ("relevantDateValue" )
136- d ["dateType" ] = d .pop ("relevantDateType" )
137- datetypes .add (d ["dateType" ])
138- json_record ["dates" ] = json_record .pop ("relevantDates" )
139-
140- # Set publicationYear and save publicationDate
141- if "publicationDate" in json_record :
142- # If 'Issued' date type was not manually set in metadata
143- # the system created publicationDate is correct
144- if "Issued" not in datetypes :
145- if "dates" in json_record :
146- json_record ["dates" ].append (
147- {"date" : json_record ["publicationDate" ], "dateType" : "Issued" }
148- )
149- else :
150- json_record ["dates" ] = [
151- {"date" : json_record ["publicationDate" ], "dateType" : "Issued" }
152- ]
153- year = json_record ["publicationDate" ].split ("-" )[0 ]
154- json_record ["publicationYear" ] = year
155- # Otherwise pick 'Issued' date for publicationYear
156- else :
157- for d in json_record ["dates" ]:
158- if d ["dateType" ] == "Issued" :
159- year = d ["date" ].split ("-" )[0 ]
160- json_record ["publicationYear" ] = year
161-
162- del json_record ["publicationDate" ]
163-
164- else :
165- print ("No publication date set - something is odd with the record " , json_record )
166-
167- # license - no url available
168- if "rightsList" not in json_record :
169- if "license" in json_record :
170- json_record ["rightsList" ] = [{"rights" : json_record .pop ("license" )}]
171- if "rightsList" in json_record :
172- if not isinstance (json_record ["rightsList" ], list ):
173- json_record ["rightsList" ] = [json_record ["rightsList" ]]
174265
175266 # Funding
176267 if "fundings" in json_record :
0 commit comments