Skip to content

Commit 59bf835

Browse files
committed
Start 4 and 4.3 metadata version support
1 parent 2661174 commit 59bf835

File tree

1 file changed

+157
-66
lines changed

1 file changed

+157
-66
lines changed

caltechdata_api/decustomize_schema.py

Lines changed: 157 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,22 @@
1-
# Convert a internal TIND CaltechDATA record into a DataCite 4 standard schema json record
1+
# Convert an internal TIND CaltechDATA record into a DataCite 4 or 4.3 standard schema json record
22
import json
33
import argparse
44

55

66
def decustomize_schema(
    json_record, pass_emails=False, pass_media=False, pass_owner=False, schema="4"
):
    """Convert an internal TIND CaltechDATA record to a DataCite schema record.

    Dispatches to the converter that matches the requested schema version.

    Args:
        json_record: Record dictionary to convert.
        pass_emails: Forwarded unchanged to the version-specific converter.
        pass_media: Forwarded unchanged to the version-specific converter.
        pass_owner: Forwarded unchanged to the version-specific converter.
        schema: Target schema version, ``"4"`` (default) or ``"43"``.

    Returns:
        Whatever the selected version-specific converter returns.

    Raises:
        ValueError: If ``schema`` is not one of the supported versions.
    """
    if schema == "4":
        return decustomize_schema_4(json_record, pass_emails, pass_media, pass_owner)
    if schema == "43":
        # The 4.3 request must reach the 4.3 converter; it previously fell
        # through to the schema 4 converter.
        return decustomize_schema_43(json_record, pass_emails, pass_media, pass_owner)
    raise ValueError(f"Error: schema {schema} not defined")
918

19+
def decustomize_standard(json_record, pass_emails, pass_media, pass_owner):
1020
# Extract subjects to single string
1121
if "subjects" in json_record:
1222
if isinstance(json_record["subjects"], str):
@@ -21,18 +31,6 @@ def decustomize_schema(
2131
array.append({"subject": s})
2232
json_record["subjects"] = array
2333

24-
# Extract identifier and label as DOI
25-
if "doi" in json_record:
26-
doi = json_record["doi"]
27-
json_record["identifier"] = {
28-
"identifier": json_record["doi"],
29-
"identifierType": "DOI",
30-
}
31-
del json_record["doi"]
32-
#Fail out if a DOI is not present
33-
#else:
34-
# raise ValueError(f'Error: Record does not have a DOI {json_record}')
35-
3634
# Extract title
3735
if "title" in json_record:
3836
json_record["titles"] = [{"title": json_record["title"]}]
@@ -64,6 +62,151 @@ def decustomize_schema(
6462
json_record["relatedIdentifiers"] = [relation]
6563
del json_record["publications"]
6664

65+
# format
66+
if "format" in json_record:
67+
if isinstance(json_record["format"], list):
68+
json_record["formats"] = json_record.pop("format")
69+
else:
70+
json_record["formats"] = [json_record.pop("format")]
71+
72+
# dates
73+
datetypes = set()
74+
# Save set of types for handling publicationDate
75+
if "relevantDates" in json_record:
76+
dates = json_record["relevantDates"]
77+
for d in dates:
78+
d["date"] = d.pop("relevantDateValue")
79+
d["dateType"] = d.pop("relevantDateType")
80+
datetypes.add(d["dateType"])
81+
json_record["dates"] = json_record.pop("relevantDates")
82+
83+
# Set publicationYear and save publicationDate
84+
if "publicationDate" in json_record:
85+
# If 'Issued' date type was not manually set in metadata
86+
# the system created publicationDate is correct
87+
if "Issued" not in datetypes:
88+
if "dates" in json_record:
89+
json_record["dates"].append(
90+
{"date": json_record["publicationDate"], "dateType": "Issued"}
91+
)
92+
else:
93+
json_record["dates"] = [
94+
{"date": json_record["publicationDate"], "dateType": "Issued"}
95+
]
96+
year = json_record["publicationDate"].split("-")[0]
97+
json_record["publicationYear"] = year
98+
# Otherwise pick 'Issued' date for publicationYear
99+
else:
100+
for d in json_record["dates"]:
101+
if d["dateType"] == "Issued":
102+
year = d["date"].split("-")[0]
103+
json_record["publicationYear"] = year
104+
105+
del json_record["publicationDate"]
106+
107+
else:
108+
print("No publication date set - something is odd with the record ", json_record)
109+
110+
111+
def decustomize_schema_43(json_record, pass_emails, pass_media, pass_owner):
    """Convert an internal TIND CaltechDATA record to DataCite 4.3 style JSON.

    Runs the shared transformations through ``decustomize_standard`` and then
    rewrites the DOI, authors and contributors into the ``identifiers``,
    ``creators`` and ``contributors`` entries of the record.

    Args:
        json_record: Record dictionary; it is modified in place.
        pass_emails: When true, ``contributorEmail`` is copied onto the
            converted contributor entries.
        pass_media: Forwarded to ``decustomize_standard``.
        pass_owner: Forwarded to ``decustomize_standard``.

    Returns:
        The converted record dictionary.

    Raises:
        KeyError: If an author lacks ``authorName``, a contributor lacks a
            name field, or an identifier entry lacks its identifier value.
    """
    # decustomize_standard edits the record in place. Only rebind when it
    # actually hands a record back, so a None result cannot discard ours.
    standardized = decustomize_standard(
        json_record, pass_emails, pass_media, pass_owner
    )
    if standardized is not None:
        json_record = standardized

    # Keep the DOI around for the diagnostics below; it stays None when the
    # record has no DOI instead of being an unbound name.
    doi = json_record.get("doi")

    # Extract identifier and label as DOI
    if "doi" in json_record:
        json_record["identifiers"] = [
            {"identifier": json_record.pop("doi"), "identifierType": "DOI"}
        ]

    # change author formatting
    if "authors" in json_record:
        creators = []
        for author in json_record["authors"]:
            creator = {}
            if "authorAffiliation" in author:
                creator["affiliation"] = _affiliations_43(
                    author["authorAffiliation"], author
                )
            if "authorIdentifiers" in author:
                if isinstance(author["authorIdentifiers"], list):
                    creator["nameIdentifiers"] = _name_identifiers_43(
                        author["authorIdentifiers"],
                        "authorIdentifier",
                        "authorIdentifierScheme",
                    )
                else:
                    print("Author identifiers not an array - please check", doi)
            creator["name"] = author["authorName"]
            creators.append(creator)
        json_record["creators"] = creators
        del json_record["authors"]

    # contributors
    if "contributors" in json_record:
        converted = []
        for contributor in json_record["contributors"]:
            entry = {}
            if "contributorAffiliation" in contributor:
                entry["affiliation"] = _affiliations_43(
                    contributor["contributorAffiliation"], contributor
                )
            if "contributorIdentifiers" in contributor:
                if isinstance(contributor["contributorIdentifiers"], list):
                    entry["nameIdentifiers"] = _name_identifiers_43(
                        contributor["contributorIdentifiers"],
                        "contributorIdentifier",
                        "contributorIdentifierScheme",
                    )
                else:
                    print(
                        "Contributor identifier not an array - please check", doi
                    )
            # NOTE(review): the original read "creatorName" here, while every
            # other contributor field in this function uses the "contributor"
            # prefix. Both spellings are accepted - confirm against the TIND
            # record schema.
            if "contributorName" in contributor:
                entry["name"] = contributor["contributorName"]
            else:
                entry["name"] = contributor["creatorName"]
            if pass_emails and "contributorEmail" in contributor:
                entry["contributorEmail"] = contributor["contributorEmail"]
            converted.append(entry)
        json_record["contributors"] = converted

    return json_record


def _affiliations_43(affiliations, person):
    """Build the affiliation list for one author or contributor entry.

    A single affiliation value is treated as a one-element list. When the
    person dictionary carries a ``ROR`` key, its value is attached to every
    affiliation entry.
    """
    if not isinstance(affiliations, list):
        affiliations = [affiliations]
    result = []
    for affiliation in affiliations:
        item = {"name": affiliation}
        if "ROR" in person:
            item["ROR"] = person["ROR"]
        result.append(item)
    return result


def _name_identifiers_43(identifiers, value_key, scheme_key):
    """Rename custom identifier keys to nameIdentifier/nameIdentifierScheme.

    The scheme is optional; the identifier value itself is required.
    """
    result = []
    for identifier in identifiers:
        item = {"nameIdentifier": identifier[value_key]}
        if scheme_key in identifier:
            item["nameIdentifierScheme"] = identifier[scheme_key]
        result.append(item)
    return result
195+
196+
197+
def decustomize_schema_4(json_record, pass_emails, pass_media, pass_owner):
198+
#Do standard transformations
199+
json_record = decustomize_standard(json_record, pass_emails, pass_media, pass_owner)
200+
201+
# Extract identifier and label as DOI
202+
if "doi" in json_record:
203+
doi = json_record["doi"]
204+
json_record["identifier"] = {
205+
"identifier": json_record["doi"],
206+
"identifierType": "DOI",
207+
}
208+
del json_record["doi"]
209+
67210
# change author formatting
68211
# Could do better with multiple affiliations
69212
if "authors" in json_record:
@@ -119,58 +262,6 @@ def decustomize_schema(
119262
if pass_emails == False:
120263
if "contributorEmail" in c:
121264
del c["contributorEmail"]
122-
# format
123-
if "format" in json_record:
124-
if isinstance(json_record["format"], list):
125-
json_record["formats"] = json_record.pop("format")
126-
else:
127-
json_record["formats"] = [json_record.pop("format")]
128-
129-
# dates
130-
datetypes = set()
131-
# Save set of types for handling publicationDate
132-
if "relevantDates" in json_record:
133-
dates = json_record["relevantDates"]
134-
for d in dates:
135-
d["date"] = d.pop("relevantDateValue")
136-
d["dateType"] = d.pop("relevantDateType")
137-
datetypes.add(d["dateType"])
138-
json_record["dates"] = json_record.pop("relevantDates")
139-
140-
# Set publicationYear and save publicationDate
141-
if "publicationDate" in json_record:
142-
# If 'Issued' date type was not manually set in metadata
143-
# the system created publicationDate is correct
144-
if "Issued" not in datetypes:
145-
if "dates" in json_record:
146-
json_record["dates"].append(
147-
{"date": json_record["publicationDate"], "dateType": "Issued"}
148-
)
149-
else:
150-
json_record["dates"] = [
151-
{"date": json_record["publicationDate"], "dateType": "Issued"}
152-
]
153-
year = json_record["publicationDate"].split("-")[0]
154-
json_record["publicationYear"] = year
155-
# Otherwise pick 'Issued' date for publicationYear
156-
else:
157-
for d in json_record["dates"]:
158-
if d["dateType"] == "Issued":
159-
year = d["date"].split("-")[0]
160-
json_record["publicationYear"] = year
161-
162-
del json_record["publicationDate"]
163-
164-
else:
165-
print("No publication date set - something is odd with the record ", json_record)
166-
167-
# license - no url available
168-
if "rightsList" not in json_record:
169-
if "license" in json_record:
170-
json_record["rightsList"] = [{"rights": json_record.pop("license")}]
171-
if "rightsList" in json_record:
172-
if not isinstance(json_record["rightsList"], list):
173-
json_record["rightsList"] = [json_record["rightsList"]]
174265

175266
# Funding
176267
if "fundings" in json_record:

0 commit comments

Comments
 (0)