Skip to content

Commit a278cb9

Browse files
committed
Start of geo date update script
1 parent 9750c1c commit a278cb9

File tree

2 files changed

+73
-0
lines changed

2 files changed

+73
-0
lines changed

edit_all_geo.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
import argparse, os, json, requests, csv
from caltechdata_api import caltechdata_edit, decustomize_schema

# Script to update geo (gps/thesis) records in CaltechDATA with dates/links
# taken from the thesis resolver mapping built by update_thesis_file.py.
#
# Get access token from TIND, set as environment variable with: source token.bash
token = os.environ['TINDTOK']

production = True

# Select API endpoint: live repository vs. sandbox.
if production:
    url = 'https://data.caltech.edu/api/records'
else:
    url = 'https://cd-sandbox.tind.io/api/records'

# Fetch all records tagged with both 'gps' and 'thesis' subjects.
response = requests.get(url + '/?size=1000&q=subjects:gps,thesis')
hits = response.json()

# Set up dictionary of links between resolver URLs and thesis IDs.
# The mapping file is produced by update_thesis_file.py; bail out if missing.
if not os.path.isfile('data/record_list.csv'):
    print("You need to run update_thesis_file.py")
    exit()

record_list = {}
with open('data/record_list.csv') as csvfile:
    # Each row is (thesis_id, official_url); index by URL for reverse lookup.
    for row in csv.reader(csvfile):
        record_list[row[1]] = row[0]

for h in hits['hits']['hits']:
    rid = str(h['id'])
    print(rid)
    # Convert TIND's customized metadata back to standard DataCite form.
    record = decustomize_schema(h['metadata'], True)
    if 'relatedIdentifiers' in record:
        for r in record['relatedIdentifiers']:
            # BUG FIX: original compared the literal string
            # 'relatedIdentifierType' to 'URL' (always False), so this
            # branch never executed; compare the record field instead.
            if r['relationType'] == 'IsSupplementTo' and r['relatedIdentifierType'] == 'URL':
                # NOTE(review): raises KeyError if the URL is not in the
                # resolver mapping — confirm whether that should be skipped.
                idv = record_list[r['relatedIdentifier']]
                print(idv)
                # TODO: actual edit call is still stubbed out (WIP commit).
                #metadata =\
                #{'descriptions':[{'description':description,'descriptionType':'Abstract'}]}
                #response = caltechdata_edit(token, rid, metadata, {}, {}, production)
                #print(response)

update_thesis_file.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
import os, subprocess, json, csv
import dataset
from ames.harvesters import get_caltechfeed

# Build data/record_list.csv mapping thesis IDs to their official resolver
# URLs, harvested from the CaltechTHESIS feed. Consumed by edit_all_geo.py.

if not os.path.isdir('data'):
    os.mkdir('data')
# NOTE(review): extraction lost the original indentation — chdir is assumed
# to be top-level (unconditional), since record_list.csv is written relative
# and the companion script reads it from data/; confirm against the repo.
os.chdir('data')

# Download/refresh the CaltechTHESIS dataset collection into the cwd.
get_caltechfeed('thesis')

record_list = {}
collection = 'CaltechTHESIS.ds'
keys = dataset.keys(collection)
for count, k in enumerate(keys, start=1):
    # Progress indicator every 100 records.
    if count % 100 == 0:
        print(count)
    metadata, err = dataset.read(collection, k)
    if err != '':
        print("Error on read ", err)
        exit()
    if not metadata:
        print("Bad Record: " + k)
        print(metadata)
    elif 'official_url' in metadata:
        record_list[k] = metadata['official_url']
    else:
        print("Missing URL", metadata)

# newline='' is required for csv writers on text files; without it the csv
# module emits an extra blank line between rows on Windows.
with open('record_list.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerows(record_list.items())

0 commit comments

Comments
 (0)