Skip to content

Commit 8ec02c9

Browse files
committed
New name and metadata function
1 parent 78f78e4 commit 8ec02c9

File tree

12 files changed

+234
-19
lines changed

12 files changed

+234
-19
lines changed

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1-
# caltechdata_write
1+
# caltechdata_api
22

3-
Write files and a DataCite 4 standard json record to CaltechDATA repository
3+
Python library for using the CaltechDATA API
4+
caltechdata_write writes files and a DataCite 4 standard JSON record to the CaltechDATA repository
5+
caltechdata_edit edits records in CaltechDATA
6+
get_metadata gets metadata for CaltechDATA records
47

58
In development.
69

caltechdata_api/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from .caltechdata_write import caltechdata_write
2+
from .caltechdata_write import send_s3
3+
from .caltechdata_edit import caltechdata_add
4+
from .caltechdata_edit import caltechdata_edit
5+
from .customize_schema import customize_schema
6+
from .decustomize_schema import decustomize_schema
7+
from .get_metadata import get_metadata

caltechdata_write/caltechdata_edit.py renamed to caltechdata_api/caltechdata_edit.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
from requests import session
22
import json
3-
from caltechdata_write import customize_schema
4-
from caltechdata_write import send_s3
3+
from caltechdata_api import customize_schema
4+
from caltechdata_api import send_s3
55

6-
def Caltechdata_edit(token,ids,metadata={},files={},delete={},production=False):
6+
def caltechdata_edit(token,ids,metadata={},files={},delete={},production=False):
77

88
#Including files will only replace files if they have the same name
99
#The delete option will delete any existing files with a given file
@@ -69,7 +69,7 @@ def Caltechdata_edit(token,ids,metadata={},files={},delete={},production=False):
6969
response = c.post(url, headers=headers, data=dat)
7070
return response.text
7171

72-
def Caltechdata_add(token,ids,metadata={},files={},production=False):
72+
def caltechdata_add(token,ids,metadata={},files={},production=False):
7373

7474
#Adds file
7575

@@ -113,5 +113,5 @@ def Caltechdata_add(token,ids,metadata={},files={},production=False):
113113

114114
c = session()
115115
response = c.post(url, headers=headers, data=dat)
116-
print(response.text)
116+
return response.text
117117

caltechdata_write/caltechdata_write.py renamed to caltechdata_api/caltechdata_write.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from requests import session
2-
from caltechdata_write import customize_schema
2+
from caltechdata_api import customize_schema
33
import json
44
import os
55

@@ -58,7 +58,7 @@ def send_s3(filepath,token,production=False):
5858

5959
return(fileinfo)
6060

61-
def Caltechdata_write(metadata,token,files=[],production=False):
61+
def caltechdata_write(metadata,token,files=[],production=False):
6262

6363
#If files is a string - change to single value array
6464
if isinstance(files, str) == True:
File renamed without changes.
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
# Convert an internal TIND CaltechDATA record into a DataCite 4 standard schema json record
2+
import json
3+
import argparse
4+
5+
def decustomize_schema(json_record):
    """Convert an internal TIND CaltechDATA record to DataCite 4 field names.

    The record is rewritten in place: TIND-specific keys are renamed or
    restructured into their DataCite 4 counterparts and repository-internal
    keys are removed.

    Args:
        json_record: dict holding the TIND metadata of a single record.

    Returns:
        The same dict object that was passed in, after modification.

    Raises:
        KeyError: if an entry lacks a key that is read unconditionally
            (for example 'authorName' in an author or 'relevantDateValue'
            in a relevant date).
    """

    #Expand subjects (comma separated string or list) into subject objects
    if "subjects" in json_record:
        subjects = json_record['subjects']
        if isinstance(subjects, str):
            subjects = subjects.split(',')
        json_record['subjects'] = [{'subject': s} for s in subjects]

    #Extract identifier and label as DOI
    if "doi" in json_record:
        json_record['identifier'] = {'identifier': json_record.pop('doi'),
                                     'identifierType': "DOI"}

    #Extract title
    if "title" in json_record:
        json_record['titles'] = [{"title": json_record.pop('title')}]

    #Change related identifier labels
    if "relatedIdentifiers" in json_record:
        for listing in json_record['relatedIdentifiers']:
            listing['relationType'] = listing.pop('relatedIdentifierRelation')
            listing['relatedIdentifierType'] = \
                listing.pop('relatedIdentifierScheme')

    #change author formatting
    #Could do better with multiple affiliations
    if "authors" in json_record:
        creators = []
        for author in json_record['authors']:
            creator = {}
            if 'authorAffiliation' in author:
                creator['affiliations'] = [author['authorAffiliation']]
            creator['creatorName'] = author['authorName']
            creators.append(creator)
        json_record['creators'] = creators
        del json_record['authors']

    #contributors
    if "contributors" in json_record:
        for c in json_record['contributors']:
            if 'contributorAffiliation' in c:
                c['affiliations'] = [c.pop('contributorAffiliation')]
            if 'contributorIdentifiers' in c:
                identifiers = c.pop('contributorIdentifiers')
                identifiers['nameIdentifier'] = \
                    identifiers.pop('contributorIdentifier')
                identifiers['nameIdentifierScheme'] = \
                    identifiers.pop('contributorIdentifierScheme')
                c['nameIdentifiers'] = [identifiers]
            if 'contributorEmail' in c:
                del c['contributorEmail']

    #format
    if "format" in json_record:
        formats = json_record.pop('format')
        if not isinstance(formats, list):
            formats = [formats]
        json_record['formats'] = formats

    #dates
    #Save set of types for handling publicationDate
    datetypes = set()
    if "relevantDates" in json_record:
        for d in json_record['relevantDates']:
            d['date'] = d.pop('relevantDateValue')
            d['dateType'] = d.pop('relevantDateType')
            datetypes.add(d['dateType'])
        json_record['dates'] = json_record.pop('relevantDates')

    #set publicationYear; a record without a publicationDate is left alone
    #instead of failing with a KeyError
    if 'publicationDate' in json_record:
        publication_date = json_record.pop('publicationDate')
        json_record['publicationYear'] = publication_date.split('-')[0]
        #If "Issued" date type was not manually set in metadata
        #We want to save the entire publicationDate
        if 'Issued' not in datetypes:
            json_record.setdefault('dates', []).append(
                {"date": publication_date, "dateType": "Issued"})

    #license - no url available
    if 'license' in json_record:
        json_record['rightsList'] = [{"rights": json_record.pop('license')}]

    #Funding
    if 'fundings' in json_record:
        references = []
        for f in json_record['fundings']:
            frec = {}
            if 'fundingName' in f:
                frec['funderName'] = f['fundingName']
            if 'fundingAwardNumber' in f:
                frec['awardNumber'] = {'awardNumber': f['fundingAwardNumber']}
            references.append(frec)
        json_record['fundingReferences'] = references
        del json_record['fundings']

    #Geo
    if 'geographicCoverage' in json_record:
        geo = json_record['geographicCoverage']
        newgeo = {}
        if 'geoLocationPlace' in geo:
            newgeo['geoLocationPlace'] = geo['geoLocationPlace']
        if 'geoLocationPoint' in geo:
            #Only the first point is carried over
            pt = geo['geoLocationPoint'][0]
            newgeo['geoLocationPoint'] = {
                'pointLatitude': float(pt['pointLatitude']),
                'pointLongitude': float(pt['pointLongitude']),
            }
        json_record['geoLocations'] = [newgeo]
        del json_record['geographicCoverage']

    #Publisher
    if "publishers" in json_record:
        publishers = json_record['publishers']
        #The type check must look at 'publishers' (the key that was just
        #tested for), which may be a list of publisher objects or one object
        if isinstance(publishers, list):
            json_record['publisher'] = publishers[0]['publisherName']
        else:
            json_record['publisher'] = publishers['publisherName']
        del json_record['publishers']

    #description
    if "descriptions" in json_record:
        for d in json_record["descriptions"]:
            if 'descriptionValue' in d:
                d["description"] = d.pop("descriptionValue")

    #Drop keys that are removed from the converted record
    others = ['files', 'id', 'owners', 'pid_value', 'control_number', '_oai',
              '_form_uuid', 'electronic_location_and_access', 'access_right']
    for v in others:
        json_record.pop(v, None)

    return json_record
153+
154+
if __name__ == "__main__":
    #Read in from file for demo purposes: each JSON file named on the
    #command line is converted and written out as formatted.json

    parser = argparse.ArgumentParser(description=\
    "decustomize_schema converts a internal TIND CaltechDATA record\
into a DataCite 4 standard schema json record")
    parser.add_argument('json_files', nargs='+', help='json file name')
    args = parser.parse_args()

    for jfile in args.json_files:
        #Context manager closes the input file once it has been parsed
        with open(jfile,'r') as infile:
            data = json.load(infile)
        #This module defines decustomize_schema; customize_schema is not
        #available here
        new = decustomize_schema(data)
        #NOTE(review): every input is written to the same output name, so
        #only the last converted record remains on disk
        with open('formatted.json','w') as outfile:
            json.dump(new,outfile)

caltechdata_api/get_metadata.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import os,json,csv,argparse
2+
import requests
3+
from datacite import DataCiteMDSClient, schema40
4+
from caltechdata_api import decustomize_schema
5+
6+
def get_metadata(ids,production=True):
    """Fetch one CaltechDATA record and return it as DataCite 4 metadata.

    Args:
        ids: identifier of the record; it is converted with str() and
            appended to the API URL.
        production: when True the data.caltech.edu API is queried,
            otherwise the cd-sandbox.tind.io API.

    Returns:
        The 'metadata' member of the API response after conversion by
        decustomize_schema.

    If the converted metadata does not validate against schema40, the
    validation messages are printed and the interpreter exits.
    """

    if production==True:
        api_url = "https://data.caltech.edu/api/record/"
    else:
        #Trailing slash is required because the id is appended directly
        api_url = "https://cd-sandbox.tind.io/api/record/"

    #Use the function argument rather than a script-level global
    r = requests.get(api_url+str(ids))
    metadata = r.json()['metadata']
    metadata = decustomize_schema(metadata)

    #Explicit check instead of assert, which is stripped under python -O
    if not schema40.validate(metadata):
        #Sort on the stringified path so mixed key/index paths compare
        errors = sorted(schema40.validator.iter_errors(metadata),
                        key=lambda e: [str(p) for p in e.path])
        for error in errors:
            print(error.message)
        exit()

    return metadata
27+
28+
if __name__ == "__main__":
    #Command line use: fetch each requested record and save it as <ID>.json
    parser = argparse.ArgumentParser(description=\
    "get_metadata queries the caltechDATA (Invenio 3) API\
and returns DataCite-compatable metadata")
    parser.add_argument('ids', metavar='ID', type=int, nargs='+',\
        help='The CaltechDATA ID for each record of interest')

    args = parser.parse_args()

    for idv in args.ids:
        metadata = get_metadata(idv)
        #Context manager closes the file even if serialisation fails
        with open(str(idv)+'.json','w') as outfile:
            outfile.write(json.dumps(metadata))

caltechdata_write/__init__.py

Lines changed: 0 additions & 5 deletions
This file was deleted.

edit.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import argparse, os, json
2-
from caltechdata_write import Caltechdata_edit
2+
from caltechdata_api import caltechdata_edit
33

44
parser = argparse.ArgumentParser(description=\
55
"Write files and a DataCite 4 standard json record\
@@ -16,5 +16,5 @@
1616
metaf = open(args.json_file[0], 'r')
1717
metadata = json.load(metaf)
1818

19-
response = Caltechdata_edit(token, args.ids, metadata, args.fnames, {}, False)
19+
response = caltechdata_edit(token, args.ids, metadata, args.fnames, {}, False)
2020
print(response)

example.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import argparse
22
import os,json
3-
from caltechdata_write import Caltechdata_write
3+
from caltechdata_api import caltechdata_write
44

55
parser = argparse.ArgumentParser(description=\
66
"Write files and a DataCite 4 standard json record\
@@ -20,5 +20,5 @@
2020
if files == None:
2121
files={}
2222

23-
response = Caltechdata_write(metadata,token,files,False)
23+
response = caltechdata_write(metadata,token,files,False)
2424
print(response)

0 commit comments

Comments
 (0)