## Gen3 SDK Expansion pack

# Install gen3sdk via pip
!pip install --force --upgrade gen3 --ignore-installed certifi

# Download and configure gen3-client
!wget https://github.com/uc-cdis/cdis-data-client/releases/download/0.4.1/dataclient_linux.zip
!unzip dataclient_linux.zip
!mkdir /home/jovyan/.gen3
!mv gen3-client /home/jovyan/.gen3
!rm dataclient_linux.zip
!/home/jovyan/.gen3/gen3-client configure --profile=bpa --apiendpoint=https://data.bloodpac.org --cred=/home/jovyan/pd/bpa-credentials.json

import requests, json, fnmatch, os, os.path, sys, subprocess, glob, re  # re is needed by download_files_by_guids below
import pandas as pd
from pandas.io.json import json_normalize
from collections import Counter

import gen3
from gen3.auth import Gen3Auth
from gen3.submission import Gen3Submission
from gen3.file import Gen3File

# plotting
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

api = 'https://my.datacommons.org'
auth = Gen3Auth(api, refresh_file='my-credentials.json')
sub = Gen3Submission(api, auth)
file = Gen3File(api, auth)

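# Quick sanity check (a sketch, assuming the endpoint and credentials above are valid for your commons):
# print(sub.query("""{project (first:0){project_id}}"""))
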
### AWS S3 Tools:
def s3_ls(path, bucket, profile, pattern='*'):
    ...

def get_node_tsvs(node, projects=None):
    ...
        if os.path.isfile(filename):
            print("File previously downloaded.")
        else:
            prog, proj = project.split('-', 1)
            sub.export_node(prog, proj, node, 'tsv', filename)
        df1 = pd.read_csv(filename, sep='\t', header=0)
        dfs.append(df1)
    ...

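# Usage sketch for get_node_tsvs ('demographic' and 'MyProgram-MyProject' are hypothetical; substitute your own node and project_id):
# demo_df = get_node_tsvs('demographic', projects=['MyProgram-MyProject'])
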
def get_project_tsvs(projects):
    # Get a TSV for every node in a project
    all_nodes = list(set(json_normalize(sub.query("""{_node_type (first:-1) {id}}""")['data']['_node_type'])['id']))  # get all the 'node_id's in the data model

    if isinstance(projects, str):
        projects = [projects]

    for project_id in projects:
        mydir = str('project_tsvs/' + project_id + '_tsvs')  # create the directory to store TSVs
        if not os.path.exists(mydir):
            os.makedirs(mydir)
        for node in all_nodes:
            res = sub.query("""{node (of_type:"%s", project_id:"%s"){project_id}}""" % (node, project_id))  # check if the project has records in the node
            df = json_normalize(res['data']['node'])
            if not df.empty:
                filename = str(mydir + '/' + project_id + '_' + node + '.tsv')
                if os.path.isfile(filename):
                    print("File previously downloaded.")
                else:
                    prog, proj = project_id.split('-', 1)
                    sub.export_node(prog, proj, node, 'tsv', filename)
                    print(filename + ' exported to ' + mydir)

        cmd = ['ls', mydir]  # look in the download directory
        try:
            output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode('UTF-8')
        except Exception as e:
            ...

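# Usage sketch for get_project_tsvs (hypothetical project_id; substitute your own):
# get_project_tsvs('MyProgram-MyProject')
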
def delete_node(node, project_id):
    ...
    results['other'] = other
    return results

def delete_records(uuids, project_id):
    ## Delete a list of records in 'uuids' from a project
    program, project = project_id.split('-', 1)
    failure = []
    success = []
    other = []
    results = {}
    if isinstance(uuids, str):
        uuids = [uuids]
    if isinstance(uuids, list):
        for uuid in uuids:
            r = json.loads(sub.delete_record(program, project, uuid))
            if r['code'] == 400:
                failure.append(uuid)
            elif r['code'] == 200:
                success.append(uuid)
            else:
                other.append(uuid)
    results['failure'] = failure
    results['success'] = success
    results['other'] = other
    return results

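# Usage sketch for delete_records (destructive; the UUID and project_id are hypothetical):
# results = delete_records(['f1d9c487-...-example'], 'MyProgram-MyProject')
# print(results['success'])
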
def get_urls(guids, api):
    # Get URLs for a list of GUIDs
    if isinstance(guids, str):
        guids = [guids]
    urls = {}  # initialized up front so the return below never hits an undefined name
    if isinstance(guids, list):
        for guid in guids:
            index_url = "{}/index/{}".format(api, guid)
            output = requests.get(index_url, auth=auth).text
            guid_index = json.loads(output)
            url = guid_index['urls'][0]
            urls[guid] = url
    else:
        print("Please provide one or a list of data file GUIDs: get_urls(guids=guid_list)")
    return urls

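# Usage sketch for get_urls (hypothetical GUID):
# urls = get_urls(['dg.1234/abc-123'], api)
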
def get_guid_for_filename(file_names, api):
    # Get GUIDs for a list of file_names
    if isinstance(file_names, str):
        file_names = [file_names]
    if not isinstance(file_names, list):
        print("Please provide one or a list of data file file_names: get_guid_for_filename(file_names=file_name_list)")
        return None
    guids = {}
    for file_name in file_names:
        index_url = api + '/index/index/?file_name=' + file_name
        output = requests.get(index_url, auth=auth).text
        index_record = json.loads(output)
        if len(index_record['records']) > 0:
            guid = index_record['records'][0]['did']
            guids[file_name] = guid
    return guids

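# Usage sketch for get_guid_for_filename (hypothetical file name):
# guids = get_guid_for_filename(['sample_001.fastq.gz'], api)
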
def delete_uploaded_files(guids, api):
    # DELETE http://petstore.swagger.io/?url=https://raw.githubusercontent.com/uc-cdis/fence/master/openapis/swagger.yaml#/data/delete_data__file_id_
    ...
            print("Error deleting GUID {}:".format(guid))
            print(response.reason)

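# Usage sketch for delete_uploaded_files (destructive; hypothetical GUID):
# delete_uploaded_files(['dg.1234/abc-123'], api)
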
def plot_categorical_property(property, df):
    # plot a bar graph of categorical variable counts in a dataframe
    df = df[df[property].notnull()]
    N = len(df)
    categories, counts = zip(*Counter(df[property]).items())
    y_pos = np.arange(len(categories))
    plt.bar(y_pos, counts, align='center', alpha=0.5)
    #plt.figtext(.8, .8, 'N = '+str(N))
    plt.xticks(y_pos, categories)
    plt.ylabel('Counts')
    plt.title(str('Counts by ' + property + ' (N = ' + str(N) + ')'))  # 'property', not the undefined name 'category'
    plt.xticks(rotation=90, horizontalalignment='center')
    # add N for each bar
    plt.show()

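# Usage sketch for plot_categorical_property ('gender' and demo_df are hypothetical; demo_df could come from get_node_tsvs above):
# plot_categorical_property('gender', demo_df)
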
def plot_numeric_property(property, df):
    # plot a histogram of a numeric variable in a dataframe
    df = df[df[property].notnull()]
    data = list(df[property])
    N = len(data)
    fig = sns.distplot(data, hist=False, kde=True,
                       bins=int(180/5), color='darkblue',
                       kde_kws={'linewidth': 2})
    plt.figtext(.8, .8, 'N = ' + str(N))
    plt.xlabel(property)
    plt.ylabel("Probability")
    plt.title("PDF for all projects " + property)  # You can comment this line out if you don't need a title
    plt.show(fig)

    projects = list(set(df['project_id']))
    for project in projects:
        proj_df = df[df['project_id'] == project]
        data = list(proj_df[property])
        N = len(data)
        fig = sns.distplot(data, hist=False, kde=True,
                           bins=int(180/5), color='darkblue',
                           kde_kws={'linewidth': 2})
        plt.figtext(.8, .8, 'N = ' + str(N))
        plt.xlabel(property)
        plt.ylabel("Probability")
        plt.title("PDF for " + property + ' in ' + project)  # You can comment this line out if you don't need a title
        plt.show(fig)

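# Usage sketch for plot_numeric_property (hypothetical numeric column; the dataframe needs a 'project_id' column for the per-project plots):
# plot_numeric_property('age_at_index', demo_df)
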
def node_record_counts(project_id):
    query_txt = """{node (first:-1, project_id:"%s"){type}}""" % (project_id)
    res = sub.query(query_txt)
    df = json_normalize(res['data']['node'])
    counts = Counter(df['type'])
    df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
    df = df.rename(columns={'index': 'node', 0: 'count'})
    return df

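# Usage sketch for node_record_counts (hypothetical project_id):
# node_record_counts('MyProgram-MyProject')
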
def list_project_files(project_id):
    query_txt = """{datanode(first:-1,project_id: "%s") {type file_name id object_id}}""" % (project_id)
    res = sub.query(query_txt)
    if len(res['data']['datanode']) == 0:
        print('Project ' + project_id + ' has no records in any data_file node.')
        return None
    else:
        df = json_normalize(res['data']['datanode'])
        #guids = df.loc[(df['type'] == node)]['object_id']
        return df

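# Usage sketch for list_project_files (hypothetical project_id):
# files_df = list_project_files('MyProgram-MyProject')
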
def get_data_file_tsvs(projects=None, remove_empty=True):
    # Download TSVs for all data file nodes in the specified projects
    # if no projects are specified, get nodes for all projects
    if projects is None:
        projects = list(json_normalize(sub.query("""{project (first:0){project_id}}""")['data']['project'])['project_id'])
    elif isinstance(projects, str):
        projects = [projects]
    # Make a directory for files
    mydir = 'downloaded_data_file_tsvs'
    if not os.path.exists(mydir):
        os.makedirs(mydir)
    # list all data_file 'node_id's in the data model
    dnodes = list(set(json_normalize(sub.query("""{_node_type (first:-1,category:"data_file") {id}}""")['data']['_node_type'])['id']))
    mnodes = list(set(json_normalize(sub.query("""{_node_type (first:-1,category:"metadata_file") {id}}""")['data']['_node_type'])['id']))
    inodes = list(set(json_normalize(sub.query("""{_node_type (first:-1,category:"index_file") {id}}""")['data']['_node_type'])['id']))
    nodes = list(set(dnodes + mnodes + inodes))
    # get TSVs and return a master pandas DataFrame with records from every project
    for node in nodes:
        dfs = []
        df_len = 0  # reset per node so each master TSV only counts that node's records
        for project in projects:
            filename = str(mydir + '/' + project + '_' + node + '.tsv')
            if os.path.isfile(filename):
                print('\n' + filename + " previously downloaded.")
            else:
                prog, proj = project.split('-', 1)
                sub.export_node(prog, proj, node, 'tsv', filename)  # use the gen3sdk to download a TSV for the node
            df1 = pd.read_csv(filename, sep='\t', header=0)  # read in the downloaded TSV to append to the master (all projects) TSV
            dfs.append(df1)
            df_len += len(df1)  # counting the total number of records in the node
            print(filename + ' has ' + str(len(df1)) + ' records.')
            if remove_empty is True:
                if df1.empty:
                    print('Removing empty file: ' + filename)
                    cmd = ['rm', filename]  # remove empty files from the download directory
                    try:
                        output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode('UTF-8')
                    except Exception as e:
                        output = e.output.decode('UTF-8')
                        print("ERROR:" + output)
        all_data = pd.concat(dfs, ignore_index=True, sort=False)
        print('\nlength of all dfs: ' + str(df_len))  # this should match len(all_data) below
        nodefile = str('master_' + node + '.tsv')
        all_data.to_csv(str(mydir + '/' + nodefile), sep='\t')
        print('Master node TSV with ' + str(len(all_data)) + ' total records written to ' + nodefile + '.')  # this should match df_len above
    return all_data

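# Usage sketch for get_data_file_tsvs (hypothetical project_id):
# master_df = get_data_file_tsvs(projects='MyProgram-MyProject', remove_empty=True)
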
def list_guids_in_nodes(nodes=None, projects=None):
    # Get GUIDs for node(s) in project(s)
    if nodes is None:  # get all data_file/metadata_file/index_file 'node_id's in the data model
        categories = ['data_file', 'metadata_file', 'index_file']
        nodes = []
        for category in categories:
            query_txt = """{_node_type (first:-1,category:"%s") {id}}""" % category
            df = json_normalize(sub.query(query_txt)['data']['_node_type'])
            if not df.empty:
                nodes = list(set(nodes + list(set(df['id']))))
    elif isinstance(nodes, str):
        nodes = [nodes]
    if projects is None:
        projects = list(json_normalize(sub.query("""{project (first:0){project_id}}""")['data']['project'])['project_id'])
    elif isinstance(projects, str):
        projects = [projects]
    all_guids = {}  # all_guids will be a nested dict: {project_id: {node1:[guids1], node2:[guids2]}}
    for project in projects:
        all_guids[project] = {}
        for node in nodes:
            guids = []
            query_txt = """{%s (first:-1,project_id:"%s") {project_id file_size file_name object_id id}}""" % (node, project)
            res = sub.query(query_txt)
            if len(res['data'][node]) == 0:
                print(project + ' has no records in node ' + node + '.')
                guids = None
            else:
                df = json_normalize(res['data'][node])
                guids = list(df['object_id'])
                print(project + ' has ' + str(len(guids)) + ' records in node ' + node + '.')
            all_guids[project][node] = guids
    # nested dict: all_guids[project][node]
    return all_guids

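# Usage sketch for list_guids_in_nodes (hypothetical node and project_id):
# all_guids = list_guids_in_nodes(nodes='submitted_unaligned_reads', projects='MyProgram-MyProject')
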
def download_files_by_guids(guids=None):
    # Downloads data files by GUID with the gen3-client.
    # Assumes 'client' (path to the gen3-client binary) and 'profile' (a configured
    # gen3-client profile) are defined globally; see the configure step at the top.
    # Make a directory for files
    mydir = 'downloaded_data_files'
    if not os.path.exists(mydir):
        os.makedirs(mydir)
    if isinstance(guids, str):
        guids = [guids]
    file_names = {}  # initialized up front so the return below never hits an undefined name
    if isinstance(guids, list):
        for guid in guids:
            cmd = client + ' download --profile=' + profile + ' --guid=' + guid
            try:
                output = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True).decode('UTF-8')
                try:
                    file_name = re.search('Successfully downloaded (.+)\\n', output).group(1)
                    cmd = 'mv ' + file_name + ' ' + mydir
                    try:
                        output = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True).decode('UTF-8')
                    except Exception as e:
                        output = e.output.decode('UTF-8')
                        print("ERROR:" + output)
                except AttributeError:
                    file_name = ''  # apply your error handling
                print('Successfully downloaded: ' + file_name)
                file_names[guid] = file_name
            except Exception as e:
                output = e.output.decode('UTF-8')
                print("ERROR:" + output)
    else:
        print('Provide a list of guids to download: "download_files_by_guids(guids=guid_list)"')
    return file_names
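
# Usage sketch for download_files_by_guids (hypothetical GUID; 'client' and 'profile' must point to your gen3-client binary and configured profile, per the setup at the top):
# client = '/home/jovyan/.gen3/gen3-client'
# profile = 'bpa'
# file_names = download_files_by_guids(guids=['dg.1234/abc-123'])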