Skip to content

Commit 2ff5fc2

Browse files
committed
updates
1 parent a49f345 commit 2ff5fc2

File tree

2 files changed

+244
-315
lines changed

2 files changed

+244
-315
lines changed

expansion/gen3_expansion.py

Lines changed: 244 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,37 @@
11
## Gen3 SDK Expansion pack
22

3+
# Install gen3sdk via pip
4+
!pip install --force --upgrade gen3 --ignore-installed certifi
5+
6+
# Download and configure gen3-client
7+
!wget https://github.com/uc-cdis/cdis-data-client/releases/download/0.4.1/dataclient_linux.zip
8+
!unzip dataclient_linux.zip
9+
!mkdir /home/jovyan/.gen3
10+
!mv gen3-client /home/jovyan/.gen3
11+
!rm dataclient_linux.zip
12+
!/home/jovyan/.gen3/gen3-client configure --profile=bpa --apiendpoint=https://data.bloodpac.org --cred=/home/jovyan/pd/bpa-credentials.json
13+
314
import requests, json, fnmatch, os, os.path, sys, subprocess, glob
415
import pandas as pd
516
from pandas.io.json import json_normalize
17+
from collections import Counter
618

719
import gen3
820
from gen3.auth import Gen3Auth
921
from gen3.submission import Gen3Submission
1022
from gen3.file import Gen3File
1123

12-
endpoint = 'https://my.datacommons.org'
13-
auth = Gen3Auth(endpoint, refresh_file='my-credentials.json')
14-
sub = Gen3Submission(endpoint, auth)
15-
file = Gen3File(endpoint, auth)
24+
#plotting
25+
import matplotlib.pyplot as plt
26+
import numpy as np
27+
import seaborn as sns
28+
29+
30+
31+
api = 'https://my.datacommons.org'
32+
auth = Gen3Auth(api, refresh_file='my-credentials.json')
33+
sub = Gen3Submission(api, auth)
34+
file = Gen3File(api, auth)
1635

1736
### AWS S3 Tools:
1837
def s3_ls(path, bucket, profile, pattern='*'):
@@ -142,7 +161,7 @@ def get_node_tsvs(node,projects=None):
142161
if os.path.isfile(filename):
143162
print("File previously downloaded.")
144163
else:
145-
prog,proj = project.split('-')
164+
prog,proj = project.split('-',1)
146165
sub.export_node(prog,proj,node,'tsv',filename)
147166
df1 = pd.read_csv(filename, sep='\t', header=0)
148167
dfs.append(df1)
@@ -159,16 +178,16 @@ def get_node_tsvs(node,projects=None):
159178
def get_project_tsvs(projects):
160179
# Get a TSV for every node in a project
161180
all_nodes = list(set(json_normalize(sub.query("""{_node_type (first:-1) {id}}""")['data']['_node_type'])['id'])) #get all the 'node_id's in the data model
181+
162182
if isinstance(projects,str):
163183
projects = [projects]
184+
164185
for project_id in projects:
165-
#create the directory to store TSVs
166-
mydir = str('project_tsvs/'+project_id+'_tsvs')
186+
mydir = str('project_tsvs/'+project_id+'_tsvs') #create the directory to store TSVs
167187
if not os.path.exists(mydir):
168188
os.makedirs(mydir)
169189
for node in all_nodes:
170-
#check if the project has records in the node
171-
res = sub.query("""{node (of_type:"%s", project_id:"%s"){project_id}}""" % (node,project_id))
190+
res = sub.query("""{node (of_type:"%s", project_id:"%s"){project_id}}""" % (node,project_id)) #check if the project has records in the node
172191
df = json_normalize(res['data']['node'])
173192
if not df.empty:
174193
filename = str(mydir+'/'+project_id+'_'+node+'.tsv')
@@ -178,7 +197,8 @@ def get_project_tsvs(projects):
178197
prog,proj = project_id.split('-',1)
179198
sub.export_node(prog,proj,node,'tsv',filename)
180199
print(filename+' exported to '+mydir)
181-
cmd = ['ls',mydir]
200+
201+
cmd = ['ls',mydir] #look in the download directory
182202
try:
183203
output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode('UTF-8')
184204
except Exception as e:
@@ -213,17 +233,37 @@ def delete_node(node,project_id):
213233
results['other'] = other
214234
return results
215235

216-
217-
218-
236+
def delete_records(uuids, project_id):
    """Delete a list of records from a project by UUID.

    Args:
        uuids: a single UUID string or a list of UUID strings to delete.
        project_id: '<program>-<project>' identifier; split on the first '-'.

    Returns:
        dict with keys 'failure' (HTTP 400), 'success' (HTTP 200), and
        'other' (any other code), each mapping to a list of UUIDs.
    """
    program, project = project_id.split('-', 1)
    failure, success, other = [], [], []
    results = {}
    # Accept a bare string as a one-element list.
    if isinstance(uuids, str):
        uuids = [uuids]
    if isinstance(uuids, list):
        for uuid in uuids:
            # sub.delete_record returns a JSON string; bucket by status code.
            resp = json.loads(sub.delete_record(program, project, uuid))
            code = resp['code']
            if code == 400:
                failure.append(uuid)
            elif code == 200:
                success.append(uuid)
            else:
                other.append(uuid)
    results['failure'] = failure
    results['success'] = success
    results['other'] = other
    return results
219258

220259
def get_urls(guids,api):
260+
# Get URLs for a list of GUIDs
221261
if isinstance(guids, str):
222262
guids = [guids]
223263
if isinstance(guids, list):
224264
urls = {}
225265
for guid in guids:
226-
index_url = "{}index/{}".format(api, guid)
266+
index_url = "{}/index/{}".format(api, guid)
227267
output = requests.get(index_url, auth=auth).text
228268
guid_index = json.loads(output)
229269
url = guid_index['urls'][0]
@@ -232,6 +272,21 @@ def get_urls(guids,api):
232272
print("Please provide one or a list of data file GUIDs: get_urls\(guids=guid_list\)")
233273
return urls
234274

275+
def get_guid_for_filename(file_names, api):
    """Look up the indexd GUID ('did') for each of a list of file names.

    Args:
        file_names: a single file name or a list of file names.
        api: base URL of the data commons (no trailing slash).

    Returns:
        dict mapping file_name -> GUID for names that had at least one
        index record; names with no records are omitted.
    """
    if isinstance(file_names, str):
        file_names = [file_names]
    if not isinstance(file_names, list):
        # Fix: previously this printed the warning but fell through to the
        # loop below and crashed on the non-iterable input. Also removed the
        # stray '\(' escape sequences from the message.
        print("Please provide one or a list of data file file_names: get_guid_for_filename(file_names=file_name_list)")
        return {}
    guids = {}
    for file_name in file_names:
        index_url = api + '/index/index/?file_name=' + file_name
        output = requests.get(index_url, auth=auth).text
        index_record = json.loads(output)
        # assumes indexd response shape {'records': [{'did': ...}, ...]} — TODO confirm
        if len(index_record['records']) > 0:
            guid = index_record['records'][0]['did']
            guids[file_name] = guid
    return guids
235290

236291
def delete_uploaded_files(guids,api):
237292
# DELETE http://petstore.swagger.io/?url=https://raw.githubusercontent.com/uc-cdis/fence/master/openapis/swagger.yaml#/data/delete_data__file_id_
@@ -253,27 +308,180 @@ def delete_uploaded_files(guids,api):
253308
print("Error deleting GUID {}:".format(guid))
254309
print(response.reason)
255310

311+
def plot_categorical_property(property, df):
    """Plot a bar graph of counts per category for one column of a DataFrame.

    Args:
        property: column name in df holding the categorical variable.
        df: pandas DataFrame containing that column.

    Displays a matplotlib bar chart; returns nothing.
    """
    df = df[df[property].notnull()]
    N = len(df)
    if N == 0:
        # Robustness: zip(*Counter(...).items()) raises on an empty sequence.
        print('No non-null values for ' + property + '.')
        return
    categories, counts = zip(*Counter(df[property]).items())
    y_pos = np.arange(len(categories))
    plt.bar(y_pos, counts, align='center', alpha=0.5)
    #plt.figtext(.8, .8, 'N = '+str(N))
    plt.xticks(y_pos, categories)
    plt.ylabel('Counts')
    # Bug fix: original referenced undefined name 'category' (NameError);
    # the plotted column is 'property'.
    plt.title(str('Counts by '+property+' (N = '+str(N)+')'))
    plt.xticks(rotation=90, horizontalalignment='center')
    #add N for each bar
    plt.show()
325+
326+
def plot_numeric_property(property, df):
    """Plot KDE histograms of a numeric column: once over all projects,
    then once per project_id found in the DataFrame.

    Args:
        property: column name in df holding the numeric variable.
        df: pandas DataFrame with that column and a 'project_id' column.

    Displays seaborn/matplotlib figures; returns nothing.
    """
    def _kde_plot(values, title):
        # One KDE figure with the sample size annotated in the corner.
        n = len(values)
        fig = sns.distplot(values, hist=False, kde=True,
                 bins=int(180/5), color = 'darkblue',
                 kde_kws={'linewidth': 2})
        plt.figtext(.8, .8, 'N = '+str(n))
        plt.xlabel(property)
        plt.ylabel("Probability")
        plt.title(title) # You can comment this line out if you don't need title
        plt.show(fig)

    df = df[df[property].notnull()]
    # Overall distribution across every project.
    _kde_plot(list(df[property]), "PDF for all projects "+property)
    # One distribution per project.
    for project in list(set(df['project_id'])):
        proj_rows = df[df['project_id']==project]
        _kde_plot(list(proj_rows[property]), "PDF for "+property+' in ' + project)
353+
354+
def node_record_counts(project_id):
    """Count how many records each node type has in a project.

    Args:
        project_id: '<program>-<project>' identifier.

    Returns:
        pandas DataFrame with columns 'node' and 'count'.
    """
    query_txt = """{node (first:-1, project_id:"%s"){type}}""" % (project_id)
    res = sub.query(query_txt)
    # Tally the 'type' field of every returned record.
    type_counts = Counter(json_normalize(res['data']['node'])['type'])
    counts_df = pd.DataFrame.from_dict(type_counts, orient='index').reset_index()
    counts_df = counts_df.rename(columns={'index':'node', 0:'count'})
    return counts_df
362+
363+
def list_project_files(project_id):
    """List every data-file record (across all data_file nodes) in a project.

    Args:
        project_id: '<program>-<project>' identifier.

    Returns:
        pandas DataFrame with columns type/file_name/id/object_id, or None
        (after printing a message) when the project has no data-file records.
    """
    query_txt = """{datanode(first:-1,project_id: "%s") {type file_name id object_id}}""" % (project_id)
    res = sub.query(query_txt)
    if len(res['data']['datanode']) == 0:
        print('Project ' + project_id + ' has no records in any data_file node.')
        return None
    else:
        df = json_normalize(res['data']['datanode'])
        # Removed a no-op: json_normalize(Counter(df['type'])) computed a
        # value that was immediately discarded (leftover debugging code).
        #guids = df.loc[(df['type'] == node)]['object_id']
        return df
374+
375+
def get_data_file_tsvs(projects=None,remove_empty=True):
    """Download TSVs for all data_file/metadata_file/index_file nodes in the
    given projects, write one 'master_<node>.tsv' per node combining every
    project's records, and return the last node's combined DataFrame.

    Args:
        projects: None (all projects), a single project_id string, or a list.
        remove_empty: when True, delete downloaded TSVs that contain no rows.

    Returns:
        pandas DataFrame for the last node processed that had any records,
        or None if no node had records.
    """
    #if no projects specified, get node for all projects
    if projects is None:
        projects = list(json_normalize(sub.query("""{project (first:0){project_id}}""")['data']['project'])['project_id'])
    elif isinstance(projects, str):
        projects = [projects]
    # Make a directory for files
    mydir = 'downloaded_data_file_tsvs'
    if not os.path.exists(mydir):
        os.makedirs(mydir)
    # list all data_file 'node_id's in the data model
    dnodes = list(set(json_normalize(sub.query("""{_node_type (first:-1,category:"data_file") {id}}""")['data']['_node_type'])['id']))
    mnodes = list(set(json_normalize(sub.query("""{_node_type (first:-1,category:"metadata_file") {id}}""")['data']['_node_type'])['id']))
    inodes = list(set(json_normalize(sub.query("""{_node_type (first:-1,category:"index_file") {id}}""")['data']['_node_type'])['id']))
    nodes = list(set(dnodes + mnodes + inodes))
    all_data = None
    for node in nodes:
        # Fix: reset the accumulators per node. With a single initialization
        # before this loop, records from earlier nodes leaked into the
        # 'master_<node>.tsv' of every later node.
        dfs = []
        df_len = 0
        for project in projects:
            filename = str(mydir+'/'+project+'_'+node+'.tsv')
            if os.path.isfile(filename):
                print('\n'+filename + " previously downloaded.")
            else:
                prog,proj = project.split('-',1)
                sub.export_node(prog,proj,node,'tsv',filename) # use the gen3sdk to download a tsv for the node
            df1 = pd.read_csv(filename, sep='\t', header=0) # read in the downloaded TSV to append to the master (all projects) TSV
            dfs.append(df1)
            df_len+=len(df1) # Counting the total number of records in the node
            print(filename +' has '+str(len(df1))+' records.')
            if remove_empty is True and df1.empty:
                print('Removing empty file: ' + filename)
                cmd = ['rm',filename] #look in the download directory
                try:
                    output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode('UTF-8')
                except Exception as e:
                    output = e.output.decode('UTF-8')
                    print("ERROR:" + output)
        if not dfs:
            # Robustness: pd.concat([]) raises ValueError when no project
            # had this node.
            continue
        all_data = pd.concat(dfs, ignore_index=True, sort=False)
        print('\nlength of all dfs: ' +str(df_len)) # this should match len(all_data) below
        nodefile = str('master_'+node+'.tsv')
        all_data.to_csv(str(mydir+'/'+nodefile),sep='\t')
        print('Master node TSV with '+str(len(all_data))+' total records written to '+nodefile+'.') # this should match df_len above
    return all_data
256421

257-
258-
def delete_records(uuids,project_id):
259-
## Delete a list of records in 'uuids' from a project
260-
program,project = project_id.split('-',1)
261-
failure = []
262-
success = []
263-
other = []
264-
results = {}
265-
if isinstance(uuids, str):
266-
uuids = [uuids]
267-
if isinstance(uuids, list):
268-
for uuid in uuids:
269-
r = json.loads(sub.delete_record(program,project,uuid))
270-
if r['code'] == 400:
271-
failure.append(uuid)
272-
elif r['code'] == 200:
273-
success.append(uuid)
422+
def list_guids_in_nodes(nodes=None,projects=None):
    """Collect the object_id GUIDs for the given nodes in the given projects.

    Args:
        nodes: None (all data_file/metadata_file/index_file nodes in the
            data model), a single node name, or a list of node names.
        projects: None (all projects), a single project_id, or a list.

    Returns:
        Nested dict {project_id: {node: [guids] or None}}; None marks a node
        with no records in that project.
    """
    if nodes is None: # get all data_file/metadata_file/index_file 'node_id's in the data model
        nodes = []
        for category in ['data_file','metadata_file','index_file']:
            query_txt = """{_node_type (first:-1,category:"%s") {id}}""" % category
            cat_df = json_normalize(sub.query(query_txt)['data']['_node_type'])
            if not cat_df.empty:
                nodes = list(set(nodes + list(set(cat_df['id']))))
    elif isinstance(nodes,str):
        nodes = [nodes]
    if projects is None:
        projects = list(json_normalize(sub.query("""{project (first:0){project_id}}""")['data']['project'])['project_id'])
    elif isinstance(projects, str):
        projects = [projects]
    all_guids = {} # all_guids will be a nested dict: {project_id: {node1:[guids1],node2:[guids2]} }
    for project in projects:
        all_guids[project] = {}
        for node in nodes:
            query_txt = """{%s (first:-1,project_id:"%s") {project_id file_size file_name object_id id}}""" % (node,project)
            res = sub.query(query_txt)
            if len(res['data'][node]) == 0:
                print(project + ' has no records in node ' + node + '.')
                guids = None
            else:
                node_df = json_normalize(res['data'][node])
                guids = list(node_df['object_id'])
                print(project + ' has '+str(len(guids))+' records in node ' + node + '.')
            all_guids[project][node] = guids
    # nested dict: all_guids[project][node]
    return all_guids
455+
456+
457+
def download_files_by_guids(guids=None):
    """Download data files by GUID with the gen3-client and move them into
    a local 'downloaded_data_files' directory.

    NOTE(review): relies on module-level names `client` (gen3-client binary
    path) and `profile` (configured profile name) that are not defined in
    this view — confirm they exist before use.
    NOTE(review): builds shell commands by string concatenation with
    shell=True; GUIDs/file names with shell metacharacters would be unsafe.
    Consider subprocess.run([...], shell=False) with list arguments.

    Args:
        guids: a single GUID string or a list of GUIDs.

    Returns:
        dict mapping guid -> downloaded file name ('' when the name could
        not be parsed from the client output); empty when guids is invalid.
    """
    import re  # fix: 're' is used below but absent from the module's imports
    # Make a directory for files
    mydir = 'downloaded_data_files'
    if not os.path.exists(mydir):
        os.makedirs(mydir)
    # Fix: initialize before the branches; previously `return file_names`
    # raised UnboundLocalError when guids was None or not a str/list.
    file_names = {}
    if isinstance(guids, str):
        guids = [guids]
    if isinstance(guids, list):
        for guid in guids:
            cmd = client+' download --profile='+profile+' --guid='+guid
            try:
                output = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True).decode('UTF-8')
                try:
                    # Parse the downloaded file name out of the client output.
                    file_name = re.search(r'Successfully downloaded (.+)\n', output).group(1)
                    cmd = 'mv ' + file_name + ' ' + mydir
                    try:
                        output = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True).decode('UTF-8')
                    except Exception as e:
                        output = e.output.decode('UTF-8')
                        print("ERROR:" + output)
                except AttributeError:
                    file_name = '' # apply your error handling
                print('Successfully downloaded: '+file_name)
                file_names[guid] = file_name
            except Exception as e:
                output = e.output.decode('UTF-8')
                print("ERROR:" + output)
    else:
        print('Provide a list of guids to download: "get_file_by_guid(guids=guid_list)"')
    return file_names

0 commit comments

Comments
 (0)