New name and metadata function

tmorrell · tmorrell · commit 8ec02c985e82 · 2017-11-22T09:13:38.000-08:00
diff --git a/README.md b/README.md
@@ -1,6 +1,9 @@
-# caltechdata_write
+# caltechdata_api
 
-Write files and a DataCite 4 standard json record to CaltechDATA repository
+Python library for using the CaltechDATA API
+caltechdata_write writes files and a DataCite 4 standard json record to the CaltechDATA repository
+caltechdata_edit edits records in CaltechDATA
+get_metadata gets metadata for CaltechDATA records
 
 In development. 
 
diff --git a/caltechdata_api/__init__.py b/caltechdata_api/__init__.py
@@ -0,0 +1,7 @@
+from .caltechdata_write import caltechdata_write
+from .caltechdata_write import send_s3
+from .caltechdata_edit import caltechdata_add
+from .caltechdata_edit import caltechdata_edit
+from .customize_schema import customize_schema
+from .decustomize_schema import decustomize_schema
+from .get_metadata import get_metadata
diff --git a/caltechdata_api/caltechdata_edit.py b/caltechdata_api/caltechdata_edit.py
@@ -1,9 +1,9 @@
 from requests import session
 import json
-from caltechdata_write import customize_schema
-from caltechdata_write import send_s3
+from caltechdata_api import customize_schema
+from caltechdata_api import send_s3
 
-def Caltechdata_edit(token,ids,metadata={},files={},delete={},production=False):
+def caltechdata_edit(token,ids,metadata={},files={},delete={},production=False):
 
     #Including files will only replaces files if they have the same name
     #The delete option will delete any existing files with a given file
@@ -69,7 +69,7 @@ def Caltechdata_edit(token,ids,metadata={},files={},delete={},production=False):
         response = c.post(url, headers=headers, data=dat)
         return response.text
 
-def Caltechdata_add(token,ids,metadata={},files={},production=False):
+def caltechdata_add(token,ids,metadata={},files={},production=False):
 
     #Adds file
 
@@ -113,5 +113,5 @@ def Caltechdata_add(token,ids,metadata={},files={},production=False):
 
         c = session()
         response = c.post(url, headers=headers, data=dat)
-        print(response.text)
+        return response.text
 
diff --git a/caltechdata_api/caltechdata_write.py b/caltechdata_api/caltechdata_write.py
@@ -1,5 +1,5 @@
 from requests import session
-from caltechdata_write import customize_schema
+from caltechdata_api import customize_schema
 import json
 import os
 
@@ -58,7 +58,7 @@ def send_s3(filepath,token,production=False):
 
     return(fileinfo)
 
-def Caltechdata_write(metadata,token,files=[],production=False):
+def caltechdata_write(metadata,token,files=[],production=False):
 
     #If files is a string - change to single value array
     if isinstance(files, str) == True:
diff --git a/caltechdata_api/customize_schema.py b/caltechdata_api/customize_schema.py
diff --git a/caltechdata_api/decustomize_schema.py b/caltechdata_api/decustomize_schema.py
@@ -0,0 +1,169 @@
+# Convert an internal TIND CaltechDATA record into a DataCite 4 standard schema json record
+import json
+import argparse
+
+def decustomize_schema(json_record):
+    """Convert a TIND CaltechDATA record dict to DataCite 4 form in place and return it."""
+    #Convert subjects (comma-separated string or list) to a list of subject dicts
+    if "subjects" in json_record:
+        if isinstance(json_record['subjects'],str):
+            subjects = json_record['subjects'].split(',')
+            array = []
+            for s in subjects:
+                array.append({'subject':s})
+            json_record['subjects']=array
+        else:
+            array = []
+            for s in json_record['subjects']:
+                array.append({"subject":s})
+            json_record['subjects']=array
+
+    #Extract identifier and label as DOI
+    if "doi" in json_record:
+        json_record['identifier'] = {'identifier':json_record['doi'],
+                'identifierType':"DOI"}
+        del json_record['doi']
+
+    #Extract title
+    if "title" in json_record:
+        json_record['titles'] = [{"title":json_record['title']}]
+        del json_record['title']
+
+    #Change related identifier labels
+    if "relatedIdentifiers" in json_record:
+        for listing in json_record['relatedIdentifiers']:
+            listing['relationType'] = listing.pop('relatedIdentifierRelation')
+            listing['relatedIdentifierType'] = listing.pop('relatedIdentifierScheme')
+
+    #change author formatting
+    #Could do better with multiple affiliations
+    if "authors" in json_record:
+        authors = json_record['authors']
+        newa = []
+        for a in authors:
+            new = {}
+            if 'authorAffiliation' in a:
+                new['affiliations'] = [a['authorAffiliation']]
+            new['creatorName'] = a['authorName']
+            newa.append(new)
+        json_record['creators']=newa
+        del json_record['authors']
+
+    #contributors
+    if "contributors" in json_record:
+        for c in json_record['contributors']:
+            if 'contributorAffiliation' in c:
+                c['affiliations'] = [c.pop('contributorAffiliation')]
+            if 'contributorIdentifiers' in c:
+                c['contributorIdentifiers']['nameIdentifier'] =\
+                c['contributorIdentifiers'].pop('contributorIdentifier')
+                c['contributorIdentifiers']['nameIdentifierScheme'] =\
+                c['contributorIdentifiers'].pop('contributorIdentifierScheme')
+                c['nameIdentifiers'] = [c.pop('contributorIdentifiers')]
+            if 'contributorEmail' in c:
+                del c['contributorEmail']
+    #format
+    if "format" in json_record:
+        if isinstance(json_record['format'],list):
+            json_record['formats']=json_record.pop('format')
+        else:
+            json_record['formats']=[json_record.pop('format')]
+
+    #dates
+    datetypes = set()
+    #Save set of types for handling publicationDate
+    if "relevantDates" in json_record:
+        dates = json_record['relevantDates']
+        for d in dates:
+            d['date']=d.pop('relevantDateValue')
+            d['dateType']=d.pop('relevantDateType')
+            datetypes.add(d['dateType'])
+        json_record['dates']=json_record.pop('relevantDates')
+
+    #set publicationYear
+    year = json_record['publicationDate'].split('-')[0]
+    json_record['publicationYear'] = year
+    #If "Issued" date type was not manually set in metadata
+    #We want to save the entire publicationDate
+    if 'Issued' not in datetypes:
+        if 'dates' in json_record:
+            json_record['dates'].append({"date":json_record['publicationDate'],\
+                "dateType": "Issued"})
+        else:
+            json_record['dates']=[{"date":json_record['publicationDate'],\
+                "dateType": "Issued"}]
+    del json_record['publicationDate']
+
+    #license - no url available
+    if 'license' in json_record:
+        json_record['rightsList']=[{"rights":json_record.pop('license')}]
+
+    #Funding
+    if 'fundings' in json_record:
+        funding = json_record['fundings']
+        newf = []
+        for f in funding:
+            frec = {}
+            if 'fundingName' in f:
+                frec['funderName'] = f['fundingName']
+            #The award number is nested inside its own awardNumber dict
+            if 'fundingAwardNumber' in f:
+                frec['awardNumber']={'awardNumber':f['fundingAwardNumber']}
+            newf.append(frec)
+        json_record['fundingReferences']=newf
+        del json_record['fundings']
+
+    #Geo
+    if 'geographicCoverage' in json_record:
+        geo = json_record['geographicCoverage']
+        newgeo = {}
+        if 'geoLocationPlace' in geo:
+            newgeo['geoLocationPlace'] = geo['geoLocationPlace']
+        if 'geoLocationPoint' in geo:
+            pt = geo['geoLocationPoint'][0]
+            newpt = {}
+            newpt['pointLatitude'] = float(pt['pointLatitude'])
+            newpt['pointLongitude'] = float(pt['pointLongitude'])
+            newgeo['geoLocationPoint'] = newpt
+        json_record['geoLocations'] = [newgeo]
+        del json_record['geographicCoverage']
+
+    #Publisher - 'publishers' may be a list of dicts or a single dict; keep one name
+    if "publishers" in json_record:
+        if isinstance(json_record['publishers'],list):
+            json_record['publisher'] = json_record['publishers'][0]['publisherName']
+        else:
+            json_record['publisher'] = json_record['publishers']['publisherName']
+        del json_record['publishers']
+
+    #description
+    if "descriptions" in json_record:
+        for d in json_record["descriptions"]:
+            if 'descriptionValue' in d:
+                d["description"] = d.pop("descriptionValue")
+
+    others = ['files', 'id', 'owners', 'pid_value', 'control_number', '_oai',
+            '_form_uuid', 'electronic_location_and_access', 'access_right']
+    for v in others:
+        if v in json_record:
+            del json_record[v]
+
+    #print(json.dumps(json_record))
+    return json_record
+
+if __name__ == "__main__":
+    #Read in from file for demo purposes
+    #NOTE: every input is written to formatted.json, so only the last result is kept
+    parser = argparse.ArgumentParser(description=\
+                "decustomize_schema converts a internal TIND CaltechDATA record\
+       into a  DataCite 4 standard schema json record")
+    parser.add_argument('json_files', nargs='+', help='json file name')
+    args = parser.parse_args()
+
+    for jfile in args.json_files:
+        with open(jfile,'r') as infile:
+            data = json.load(infile)
+        new = decustomize_schema(data)
+        with open('formatted.json','w') as outfile:
+            json.dump(new,outfile)
+        #print(json.dumps(new))
diff --git a/caltechdata_api/get_metadata.py b/caltechdata_api/get_metadata.py
@@ -0,0 +1,41 @@
+import os,json,csv,argparse
+import requests
+from datacite import DataCiteMDSClient, schema40
+from caltechdata_api import decustomize_schema
+
+def get_metadata(ids,production=True):
+    """Fetch record `ids` from the CaltechDATA API; return its schema40-checked metadata."""
+    # Both base URLs must end in '/' because the record id is appended directly
+    if production==True:
+        api_url = "https://data.caltech.edu/api/record/"
+    else:
+        api_url = "https://cd-sandbox.tind.io/api/record/"
+
+    r = requests.get(api_url+str(ids))
+    metadata = r.json()['metadata']
+    metadata = decustomize_schema(metadata)
+
+    # Explicit check rather than assert, which is stripped when run with python -O
+    if not schema40.validate(metadata):
+        # Print every schema violation before exiting
+        errors = sorted(schema40.validator.iter_errors(metadata), key=lambda e:e.path)
+        for error in errors:
+            print(error.message)
+        exit()
+
+    return metadata
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=\
+    "get_metadata queries the caltechDATA (Invenio 3) API\
+    and returns DataCite-compatable metadata")
+    parser.add_argument('ids', metavar='ID', type=int, nargs='+',\
+    help='The CaltechDATA ID for each record of interest')
+
+    args = parser.parse_args()
+
+    for idv in args.ids:
+        metadata = get_metadata(idv)
+        # Write each record to <ID>.json; the with-block closes the file on error too
+        with open(str(idv)+'.json','w') as outfile:
+            outfile.write(json.dumps(metadata))
diff --git a/caltechdata_write/__init__.py b/caltechdata_write/__init__.py
diff --git a/edit.py b/edit.py
@@ -1,5 +1,5 @@
 import argparse, os, json
-from caltechdata_write import Caltechdata_edit
+from caltechdata_api import caltechdata_edit
 
 parser = argparse.ArgumentParser(description=\
         "Write files and a DataCite 4 standard json record\
@@ -16,5 +16,5 @@
 metaf = open(args.json_file[0], 'r')
 metadata = json.load(metaf)
 
-response = Caltechdata_edit(token, args.ids, metadata, args.fnames, {}, False)
+response = caltechdata_edit(token, args.ids, metadata, args.fnames, {}, False)
 print(response)
diff --git a/example.py b/example.py
@@ -1,6 +1,6 @@
 import argparse
 import os,json
-from caltechdata_write import Caltechdata_write
+from caltechdata_api import caltechdata_write
 
 parser = argparse.ArgumentParser(description=\
         "Write files and a DataCite 4 standard json record\
@@ -20,5 +20,5 @@
 if files == None:
     files={}
 
-response = Caltechdata_write(metadata,token,files,False)
+response = caltechdata_write(metadata,token,files,False)
 print(response)
diff --git a/setup.py b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup, find_packages
 setup(
-        name = 'caltechdata_write',
+        name = 'caltechdata_api',
         version ='0.0.1',
         packages = find_packages(),
         install_requires=[
diff --git a/token.bash b/token.bash