Skip to content

Commit a278cb9

Browse files
committed
Start of geo date update script
1 parent 9750c1c commit a278cb9

File tree

2 files changed

+73
-0
lines changed

2 files changed

+73
-0
lines changed

edit_all_geo.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
import argparse, os, json, requests, csv
from caltechdata_api import caltechdata_edit, decustomize_schema

# Script to update geo (gps/thesis) records in CaltechDATA with dates/links
# taken from the thesis resolver mapping built by update_thesis_file.py.
#
# Get access token from TIND, set as environment variable with: source token.bash
token = os.environ['TINDTOK']

production = True

# Select API endpoint: live repository vs. sandbox.
if production:
    url = 'https://data.caltech.edu/api/records'
else:
    url = 'https://cd-sandbox.tind.io/api/records'

# Fetch all records tagged with both 'gps' and 'thesis' subjects.
response = requests.get(url + '/?size=1000&q=subjects:gps,thesis')
hits = response.json()

# Set up dictionary of links between resolver URLs and thesis IDs.
# The mapping file is produced by update_thesis_file.py; bail out if missing.
if not os.path.isfile('data/record_list.csv'):
    print("You need to run update_thesis_file.py")
    exit()

record_list = {}
with open('data/record_list.csv') as csvfile:
    # Each row is (thesis_id, official_url); index by URL for reverse lookup.
    for row in csv.reader(csvfile):
        record_list[row[1]] = row[0]

for h in hits['hits']['hits']:
    rid = str(h['id'])
    print(rid)
    # Convert TIND's customized metadata back to standard DataCite form.
    record = decustomize_schema(h['metadata'], True)
    if 'relatedIdentifiers' in record:
        for r in record['relatedIdentifiers']:
            # BUG FIX: original compared the literal string
            # 'relatedIdentifierType' to 'URL' (always False), so this
            # branch never executed; compare the record field instead.
            if r['relationType'] == 'IsSupplementTo' and r['relatedIdentifierType'] == 'URL':
                # NOTE(review): raises KeyError if the URL is not in the
                # resolver mapping — confirm whether that should be skipped.
                idv = record_list[r['relatedIdentifier']]
                print(idv)
                # TODO: actual edit call is still stubbed out (WIP commit).
                #metadata =\
                #{'descriptions':[{'description':description,'descriptionType':'Abstract'}]}
                #response = caltechdata_edit(token, rid, metadata, {}, {}, production)
                #print(response)

update_thesis_file.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
import os, subprocess, json, csv
import dataset
from ames.harvesters import get_caltechfeed

# Build data/record_list.csv mapping thesis IDs to their official resolver
# URLs, harvested from the CaltechTHESIS feed. Consumed by edit_all_geo.py.

if not os.path.isdir('data'):
    os.mkdir('data')
# NOTE(review): extraction lost the original indentation — chdir is assumed
# to be top-level (unconditional), since record_list.csv is written relative
# and the companion script reads it from data/; confirm against the repo.
os.chdir('data')

# Download/refresh the CaltechTHESIS dataset collection into the cwd.
get_caltechfeed('thesis')

record_list = {}
collection = 'CaltechTHESIS.ds'
keys = dataset.keys(collection)
for count, k in enumerate(keys, start=1):
    # Progress indicator every 100 records.
    if count % 100 == 0:
        print(count)
    metadata, err = dataset.read(collection, k)
    if err != '':
        print("Error on read ", err)
        exit()
    if not metadata:
        print("Bad Record: " + k)
        print(metadata)
    elif 'official_url' in metadata:
        record_list[k] = metadata['official_url']
    else:
        print("Missing URL", metadata)

# newline='' is required for csv writers on text files; without it the csv
# module emits an extra blank line between rows on Windows.
with open('record_list.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(record_list.items())

0 commit comments

Comments
 (0)