
Commit 41c9991

added logging (verbose) functionality
1 parent 222bbdc commit 41c9991

1 file changed (+50, -6)

scripts/prepare_data_for_improve.py

Lines changed: 50 additions & 6 deletions
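Before the hunks: the change wires a -v/--verbose flag into Python's standard logging module, mapping the flag value to a log level and configuring it once in main(). Below is a minimal, self-contained sketch of that same pattern, not the script itself; the parser layout is trimmed down and configure_logging is a hypothetical helper name.

import argparse
import logging

logger = logging.getLogger(__name__)


def configure_logging(loglevel_name):
    # Map the CLI choice onto a logging level; anything other than
    # 'info' or 'debug' falls back to WARNING (the script's default).
    levels = {'info': logging.INFO, 'debug': logging.DEBUG}
    logging.basicConfig(
        format="{asctime} - {levelname} - {message}",
        style="{",
        datefmt="%Y-%m-%d %H:%M",
        level=levels.get(loglevel_name, logging.WARNING),
    )


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-v', '--verbose',
        dest='LOGLEVEL',
        choices=['warn', 'info', 'debug'],
        default='warn',
        help='defines verbosity level of logging',
    )
    args = parser.parse_args()
    configure_logging(args.LOGLEVEL)
    logger.info("logging configured")  # visible only with '-v info' or '-v debug'

Since the flag is attached to the shared argument parser (p_shared_args), every subcommand presumably inherits the same verbosity control.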
@@ -1,6 +1,7 @@
 
 import argparse
 import functools as ft
+import logging
 from os import PathLike
 from pathlib import Path
 from pathlib import PurePath
@@ -10,6 +11,8 @@
 import coderdata as cd
 import pandas as pd
 
+logger = logging.getLogger(__name__)
+
 def main():
 
     main_parser = argparse.ArgumentParser(add_help=True)
@@ -32,6 +35,13 @@ def main():
         dest='OVERWRITE',
         action='store_true',
     )
+    p_shared_args.add_argument(
+        '-v', '--verbose',
+        dest='LOGLEVEL',
+        choices=['warn', 'info', 'debug'],
+        default='warn',
+        help='defines verbosity level of logging'
+    )
 
     p_setup_workflow = command_parsers.add_parser(
         "setup",
@@ -86,6 +96,19 @@ def main():
         sys.exit(e)
     except ValueError as e:
         sys.exit(e)
+    if args.LOGLEVEL == 'info':
+        loglevel = logging.INFO
+    elif args.LOGLEVEL == 'debug':
+        loglevel = logging.DEBUG
+    else:
+        loglevel = logging.WARNING
+
+    logging.basicConfig(
+        format="{asctime} - {levelname} - {message}",
+        style="{",
+        datefmt="%Y-%m-%d %H:%M",
+        level=loglevel
+    )
     args.func(args)
 
 
@@ -104,18 +127,22 @@ def process_datasets(args):
 
     # loading all available datasets into a dict where the dataset name
     # is the key
+    logger.info("importing datasets...")
     data_sets = {}
     for data_set in data_sets_info.keys():
         data_sets[data_set] = cd.load(name=data_set, local_path=local_path)
-
+    logger.info("importing datasets... done")
 
     #-------------------------------------------------------------------
     # concatting all experiments / responses to create response.tsv
     #-------------------------------------------------------------------
+    logger.info("creating 'response.tsv' ...")
     experiments = []
+    logger.debug("creating list of datasets that contain experiment info ...")
     for data_set in data_sets_info.keys():
         # not all Datasets have experiments / drug response data
         if data_sets[data_set].experiments is not None:
+            logger.debug(f"experiment data found for {data_set}")
             # formatting existing response data to wide
             experiment = data_sets[data_set].format(
                 data_type='experiments',
@@ -133,8 +160,11 @@ def process_datasets(args):
                 ],
             )
             experiments.append(experiment)
-
+        else:
+            logger.debug(f"NO experiment data for {data_set}")
+
     # concatenating existing response data and "clean up"
+    logger.debug("concatenating experiment data ...")
     response_data = pd.concat(experiments, axis=0, ignore_index=True)
     # TODO: potentially more columns must be renamed
     # (e.g. fit_auc to auc). If so this would happen here
@@ -149,6 +179,7 @@ def process_datasets(args):
         index=False,
         sep='\t',
     )
+    logger.info(f"drug response data written to '{outfile_path}'")
     # temporary addition of "index column" to serve as a reference for
     # the extraction of split files
     response_data['index'] = response_data.index
@@ -294,14 +325,13 @@ def split_data_sets(
 
     splits_folder = args.WORKDIR.joinpath('data_out', 'splits')
     split_type = args.SPLIT_TYPE
-    # TODO: potentially change vars to be read from `args`
     ratio = (8,1,1)
     stratify_by = None
     random_state = None
 
     for data_set in data_sets_info.keys():
         if data_sets[data_set].experiments is not None:
-
+            logger.info(f'creating splits for {data_set} ...')
            # getting "<DATASET>_all.txt"
            drug_response_rows = (
                data_sets['mpnst']
@@ -334,6 +364,9 @@ def split_data_sets(
 
            splits = {}
            for i in range(0, args.NUM_SPLITS):
+                logger.debug(
+                    f"split #{i} of {args.NUM_SPLITS} for {data_set} ..."
+                )
                splits[i] = data_sets[data_set].train_test_validate(
                    split_type=split_type,
                    ratio=ratio,
@@ -359,15 +392,23 @@ def split_data_sets(
                    response_data,
                    train_keys,
                    how='inner',
-                    on=['improve_sample_id', 'improve_chem_id', "time", "study"],
+                    on=[
+                        'improve_sample_id',
+                        'improve_chem_id',
+                        "time",
+                        "study"
+                    ],
+                )
+                outfile_path = splits_folder.joinpath(
+                    f"{data_set}_split_{i}_train.txt"
                )
-                outfile_path = splits_folder.joinpath(f"{data_set}_split_{i}_train.txt")
                row_nums.to_csv(
                    path_or_buf=outfile_path,
                    columns=['index'],
                    index=False,
                    header=False
                )
+                logger.debug(f"training split written to {outfile_path}")
 
                test_keys = (
                    splits[i]
@@ -397,6 +438,7 @@ def split_data_sets(
                    index=False,
                    header=False
                )
+                logger.debug(f"testing split written to {outfile_path}")
 
                val_keys = (
                    splits[i]
@@ -426,6 +468,8 @@ def split_data_sets(
                    index=False,
                    header=False
                )
+                logger.debug(f"validation split written to {outfile_path}")
+            logger.info(f"all splits for {data_set} generated")
 
 
 def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
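A note on the default: because --verbose defaults to 'warn', all of the logger.info and logger.debug calls introduced above stay silent unless the user passes -v info or -v debug. A small illustration with the stock logging module (the logger name 'demo' is arbitrary):

import logging

logging.basicConfig(
    format="{asctime} - {levelname} - {message}",
    style="{",
    datefmt="%Y-%m-%d %H:%M",
    level=logging.WARNING,  # what the script configures when -v is not given
)
log = logging.getLogger("demo")

log.debug("split #0 of 10 ...")     # suppressed at WARNING
log.info("importing datasets...")   # suppressed at WARNING
log.warning("this one is emitted")  # shown

Configuring level=logging.DEBUG instead (the -v debug path) would emit all three messages.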
