
 import argparse
 import functools as ft
+import logging
 from os import PathLike
 from pathlib import Path
 from pathlib import PurePath
 import coderdata as cd
 import pandas as pd

+logger = logging.getLogger(__name__)
+
 def main():

     main_parser = argparse.ArgumentParser(add_help=True)
@@ -32,6 +35,13 @@ def main():
         dest='OVERWRITE',
         action='store_true',
     )
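+    # expose a verbosity flag on the shared argument parser (default: 'warn')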
+    p_shared_args.add_argument(
+        '-v', '--verbose',
+        dest='LOGLEVEL',
+        choices=['warn', 'info', 'debug'],
+        default='warn',
+        help='defines verbosity level of logging'
+    )

     p_setup_workflow = command_parsers.add_parser(
         "setup",
@@ -86,6 +96,19 @@ def main():
         sys.exit(e)
     except ValueError as e:
         sys.exit(e)
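+    # translate the CLI verbosity choice into a logging level (default: WARNING)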
+    if args.LOGLEVEL == 'info':
+        loglevel = logging.INFO
+    elif args.LOGLEVEL == 'debug':
+        loglevel = logging.DEBUG
+    else:
+        loglevel = logging.WARNING
+
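+    # basicConfig sets up the root logger, so the module-level `logger` inherits this format and level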
+    logging.basicConfig(
+        format="{asctime} - {levelname} - {message}",
+        style="{",
+        datefmt="%Y-%m-%d %H:%M",
+        level=loglevel
+    )
     args.func(args)


@@ -104,18 +127,22 @@ def process_datasets(args):

     # loading all available datasets into a dict where the dataset name
     # is the key
+    logger.info("importing datasets...")
     data_sets = {}
     for data_set in data_sets_info.keys():
         data_sets[data_set] = cd.load(name=data_set, local_path=local_path)
-
+    logger.info("importing datasets... done")

     #-------------------------------------------------------------------
     # concatting all experiments / responses to create response.tsv
     #-------------------------------------------------------------------
+    logger.info("creating 'response.tsv' ...")
     experiments = []
+    logger.debug("creating list of datasets that contain experiment info ...")
     for data_set in data_sets_info.keys():
         # not all Datasets have experiments / drug response data
         if data_sets[data_set].experiments is not None:
+            logger.debug(f"experiment data found for {data_set}")
             # formatting existing response data to wide
             experiment = data_sets[data_set].format(
                 data_type='experiments',
@@ -133,8 +160,11 @@ def process_datasets(args):
                 ],
             )
             experiments.append(experiment)
-
+        else:
+            logger.debug(f"NO experiment data for {data_set}")
+
     # concatenating existing response data and "clean up"
+    logger.debug("concatenating experiment data ...")
     response_data = pd.concat(experiments, axis=0, ignore_index=True)
     # TODO: potentially more columns must be renamed
     # (e.g. fit_auc to auc). If so this would happen here
@@ -149,6 +179,7 @@ def process_datasets(args):
         index=False,
         sep='\t',
     )
+    logger.info(f"drug response data written to '{outfile_path}'")
     # temporary addition of "index column" to serve as a reference for
     # the extraction of split files
     response_data['index'] = response_data.index
@@ -294,14 +325,13 @@ def split_data_sets(

     splits_folder = args.WORKDIR.joinpath('data_out', 'splits')
     split_type = args.SPLIT_TYPE
-    # TODO: potentially change vars to be read from `args`
     ratio = (8,1,1)
     stratify_by = None
     random_state = None

     for data_set in data_sets_info.keys():
         if data_sets[data_set].experiments is not None:
-
+            logger.info(f'creating splits for {data_set} ...')
             # getting "<DATASET>_all.txt"
             drug_response_rows = (
                 data_sets['mpnst']
@@ -334,6 +364,9 @@ def split_data_sets(

             splits = {}
             for i in range(0, args.NUM_SPLITS):
+                logger.debug(
+                    f"split #{i} of {args.NUM_SPLITS} for {data_set} ..."
+                )
                 splits[i] = data_sets[data_set].train_test_validate(
                     split_type=split_type,
                     ratio=ratio,
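+                # generate the train/test/validate keys for this split index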
@@ -359,15 +392,23 @@ def split_data_sets(
                     response_data,
                     train_keys,
                     how='inner',
-                    on=['improve_sample_id', 'improve_chem_id', "time", "study"],
+                    on=[
+                        'improve_sample_id',
+                        'improve_chem_id',
+                        "time",
+                        "study"
+                    ],
+                )
+                outfile_path = splits_folder.joinpath(
+                    f"{data_set}_split_{i}_train.txt"
+                )
                 )
-                outfile_path = splits_folder.joinpath(f"{data_set}_split_{i}_train.txt")
                 row_nums.to_csv(
                     path_or_buf=outfile_path,
                     columns=['index'],
                     index=False,
                     header=False
                 )
+                logger.debug(f"training split written to {outfile_path}")

                 test_keys = (
                     splits[i]
@@ -397,6 +438,7 @@ def split_data_sets(
                     index=False,
                     header=False
                 )
+                logger.debug(f"testing split written to {outfile_path}")

                 val_keys = (
                     splits[i]
@@ -426,6 +468,8 @@ def split_data_sets(
                     index=False,
                     header=False
                 )
+                logger.debug(f"validation split written to {outfile_path}")
+            logger.info(f"all splits for {data_set} generated")


 def merge_master_tables(args, data_sets, data_type: str = 'transcriptomics'):