Commit 925b790

Merge pull request #43 from ECP-CANDLE/develop
Develop merge into Master
2 parents (79b191c + 9b1ed42), commit 925b790

99 files changed: +7542 / -758 lines


Pilot1/Combo/NCI60.py
Lines changed: 21 additions & 4 deletions

@@ -14,7 +14,7 @@
 lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
 sys.path.append(lib_path)
 
-import candle_keras as candle
+import candle
 
 global_cache = {}
 
@@ -104,7 +104,8 @@ def load_dose_response(min_logconc=-4., max_logconc=-4., subsample=None, fractio
     return df
 
 
-def load_combo_response(response_url=None, fraction=False, use_combo_score=False, exclude_cells=[], exclude_drugs=[]):
+def load_combo_response(response_url=None, fraction=False, use_combo_score=False, use_mean_growth=False,
+                        exclude_cells=[], exclude_drugs=[]):
     """Load cell line response to pairs of drugs, sub-select response for a specific
     drug log concentration range and return a pandas dataframe.
@@ -160,8 +161,15 @@ def load_combo_response(response_url=None, fraction=False, use_combo_score=False
     df_max = df.groupby(['CELLNAME', 'NSC1', 'NSC2']).max()
     df_max = df_max.add_suffix('_MAX').reset_index()  # add SCORE_MAX by flattening the hierarchical index
 
+    df_avg = df.copy()
+    df_avg['PERCENTGROWTH'] = df_avg['PERCENTGROWTH'].apply(lambda x: 100 if x > 100 else 50+x/2 if x < 0 else 50+x/2)
+    df_avg = df.groupby(['CELLNAME', 'NSC1', 'NSC2']).mean()
+    df_avg = df_avg.add_suffix('_AVG').reset_index()
+
     if use_combo_score:
         df = df_max.rename(columns={'SCORE_MAX': 'GROWTH'}).drop('PERCENTGROWTH_MAX', axis=1)
+    elif use_mean_growth:
+        df = df_avg.rename(columns={'PERCENTGROWTH_AVG': 'GROWTH'}).drop('SCORE_AVG', axis=1)
     else:
         df = df_min.rename(columns={'PERCENTGROWTH_MIN': 'GROWTH'}).drop('SCORE_MIN', axis=1)
 
@@ -252,6 +260,10 @@ def load_drug_set_descriptors(drug_set='ALMANAC', ncols=None, scaling='std', add
         path = get_file(DATA_URL + 'GDSC_PubChemCID_drug_descriptors_dragon7')
     elif drug_set == 'NCI_IOA_AOA':
         path = get_file(DATA_URL + 'NCI_IOA_AOA_drug_descriptors_dragon7')
+    elif drug_set == 'RTS':
+        path = get_file(DATA_URL + 'RTS_drug_descriptors_dragon7')
+    elif drug_set == 'pan':
+        path = get_file(DATA_URL + 'pan_drugs_dragon7_descriptors.tsv')
     else:
         raise Exception('Drug set {} not supported!'.format(drug_set))
 
@@ -347,7 +359,8 @@ def load_drug_descriptors(ncols=None, scaling='std', add_prefix=True):
         add feature namespace prefix
     """
 
-    path = get_file(DATA_URL + 'ALMANAC_drug_descriptors_dragon7.txt')
+    # path = get_file(DATA_URL + 'ALMANAC_drug_descriptors_dragon7.txt')
+    path = get_file(DATA_URL + 'pan_drugs_dragon7_descriptors.tsv')
 
     df = global_cache.get(path)
     if df is None:
@@ -452,7 +465,11 @@ def load_sample_rnaseq(ncols=None, scaling='std', add_prefix=True, use_landmark_
         df = pd.read_csv(path, sep='\t', engine='c')
         global_cache[path] = df
 
-    df = df[df['Sample'].str.startswith(sample_set)].reset_index(drop=True)
+    if sample_set == 'RTS':
+        df_ids = pd.read_table(get_file(DATA_URL + 'RTS_PDM_samples'))
+        df = df.merge(df_ids, on='Sample').reset_index(drop=True)
+    else:
+        df = df[df['Sample'].str.startswith(sample_set)].reset_index(drop=True)
 
     # cellmap_path = get_file(DATA_URL + 'NCI60_CELLNAME_to_Combo.new.txt')
     # df_cellmap = pd.read_csv(cellmap_path, sep='\t')
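
For orientation, the new `use_mean_growth` branch aggregates percent growth by the mean over each (CELLNAME, NSC1, NSC2) triple instead of the minimum. Below is a minimal, self-contained sketch of that aggregation on made-up data: the column names come from the diff, the toy dataframe is invented, the clipping lambda is simplified (its two sub-100 branches in the diff return the same value), and the sketch groups the transformed copy `df_avg`, whereas the diff groups the original `df`.

```python
import pandas as pd

# Toy response table (invented values) with the column names used in NCI60.py.
df = pd.DataFrame({
    'CELLNAME': ['CL1', 'CL1', 'CL2'],
    'NSC1': [101, 101, 101],
    'NSC2': [202, 202, 202],
    'PERCENTGROWTH': [120.0, -40.0, 60.0],
})

# Rescale percent growth as in the diff: cap at 100, otherwise map x -> 50 + x/2.
df_avg = df.copy()
df_avg['PERCENTGROWTH'] = df_avg['PERCENTGROWTH'].apply(
    lambda x: 100 if x > 100 else 50 + x / 2)

# Mean per (cell line, drug 1, drug 2) triple, then flatten the index and
# suffix the aggregated column, mirroring add_suffix('_AVG') above.
df_avg = df_avg.groupby(['CELLNAME', 'NSC1', 'NSC2']).mean()
df_avg = df_avg.add_suffix('_AVG').reset_index()

print(df_avg)  # one PERCENTGROWTH_AVG value per (CELLNAME, NSC1, NSC2)
```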

Pilot1/Combo/README.md
Lines changed: 13 additions & 0 deletions

@@ -184,3 +184,16 @@ GDSC.22RV1 NSC.105014 NSC.102816 100 0.1627 0.1060 -0.0531
 
 A version of trained model files with dropout are available here: [saved.uq.model.h5](http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/combo/saved.uq.model.h5) and [saved.uq.weights.h5](http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/combo/saved.uq.weights.h5).
 
+## Profile runs
+We have run the same configuration across multiple machines and compared the resource utilization.
+```
+python uno_baseline_keras2.py --conf combo_perf_benchmark.txt
+```
+
+| Machine | Time to complete (HH:mm:ss) | Time per epoch (s) | Perf factor <sup>*</sup> | CPU % | Mem % | Mem GB | GPU % | GPU Mem % | Note |
+| ------- | --------------------------: | -----------------: | -----------------------: | ----: | ----: | -----: | ----: | --------: | ---- |
+| Theta | 1:14:12 | 811 | 0.31 | 7.6 | 7.6 | 12.8 | | | |
+| Nucleus | 0:14:13 | 72 | 3.47 | 3.8 | 9.3 | 21.9 | 63.4 | 91.9 | |
+| Tesla (K20) | 0:44:17 | 250 | 1.00 | 3.9 | 42.3 | 12.9 | 73.8 | 53.3 | |
+| Titan | | | | | | | | | keras version 2.0.3 does not support model.clone_model(), which was introduced in 2.0.7 |
+* Time per epoch of Titan (or Tesla) divided by time per epoch on the machine
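
Reading the perf factor column in the added table: it is the reference time per epoch (Tesla K20, 250 s, perf factor 1.00) divided by the machine's time per epoch, so larger is faster. A quick check against the numbers above:

```python
# Perf factor = reference (Tesla K20) seconds per epoch / machine seconds per epoch.
reference = 250  # Tesla (K20), by definition 1.00
for machine, sec_per_epoch in [('Theta', 811), ('Nucleus', 72), ('Tesla (K20)', 250)]:
    print('{:12s} {:.2f}'.format(machine, reference / sec_per_epoch))
# Theta 0.31, Nucleus 3.47, Tesla (K20) 1.00 -- matching the table.
```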

Pilot1/Combo/combo.py
Lines changed: 61 additions & 58 deletions

@@ -12,86 +12,90 @@
 lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
 sys.path.append(lib_path2)
 
-import candle_keras as candle
+import candle
 
 logger = logging.getLogger(__name__)
 
 additional_definitions = [
-    {'name':'cell_features',
-     'nargs':'+',
-     'choices':['expression', 'mirna', 'proteome', 'all', 'expression_5platform', 'expression_u133p2', 'rnaseq', 'categorical'],
-     'help':"use one or more cell line feature sets: 'expression', 'mirna', 'proteome', 'all'; use all for ['expression', 'mirna', 'proteome']; use 'categorical' for one-hot encoded cell lines"},
-    {'name':'drug_features', 'nargs':'+',
-     'choices':['descriptors', 'latent', 'all', 'categorical', 'noise'],
-     'help':"use dragon7 descriptors, latent representations from Aspuru-Guzik's SMILES autoencoder, or both, or one-hot encoded drugs, or random features; 'descriptors','latent', 'all', 'categorical', 'noise'"},
-    {'name':'dense_feature_layers',
-     'nargs':'+',
-     'type':int,
-     'help':'number of neurons in intermediate dense layers in the feature encoding submodels'},
-    {'name':'use_landmark_genes',
+    {'name':'cell_features',
+     'nargs':'+',
+     'choices':['expression', 'mirna', 'proteome', 'all', 'expression_5platform', 'expression_u133p2', 'rnaseq', 'categorical'],
+     'help':"use one or more cell line feature sets: 'expression', 'mirna', 'proteome', 'all'; use all for ['expression', 'mirna', 'proteome']; use 'categorical' for one-hot encoded cell lines"},
+    {'name':'drug_features', 'nargs':'+',
+     'choices':['descriptors', 'latent', 'all', 'categorical', 'noise'],
+     'help':"use dragon7 descriptors, latent representations from Aspuru-Guzik's SMILES autoencoder, or both, or one-hot encoded drugs, or random features; 'descriptors','latent', 'all', 'categorical', 'noise'"},
+    {'name':'dense_feature_layers',
+     'nargs':'+',
+     'type':int,
+     'help':'number of neurons in intermediate dense layers in the feature encoding submodels'},
+    {'name':'use_landmark_genes',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':"use the 978 landmark genes from LINCS (L1000) as expression features"},
-    {'name':'preprocess_rnaseq',
+     'default':True, #action="store_true",
+     'help':"use the 978 landmark genes from LINCS (L1000) as expression features"},
+    {'name':'preprocess_rnaseq',
     'default':'none',
-     'choices':['source_scale', 'combat', 'none'],
-     'help':"preprocessing method for RNAseq data; none for global normalization"},
-    {'name':'response_url',
+     'choices':['source_scale', 'combat', 'none'],
+     'help':"preprocessing method for RNAseq data; none for global normalization"},
+    {'name':'response_url',
     'default':None,
-     'help':"URL to combo dose response file"},
+     'help':"URL to combo dose response file"},
    {'name':'residual',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':"add skip connections to the layers"},
+     'default':True, #action="store_true",
+     'help':"add skip connections to the layers"},
    {'name':'reduce_lr',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':'reduce learning rate on plateau'},
-    {'name':'warmup_lr',
+     'default':True, #action="store_true",
+     'help':'reduce learning rate on plateau'},
+    {'name':'warmup_lr',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':'gradually increase learning rate on start'},
-    {'name':'base_lr', 'type':float,
-     'default':None,
-     'help':'base learning rate'},
-    {'name':'cp',
+     'default':True, #action="store_true",
+     'help':'gradually increase learning rate on start'},
+    {'name':'base_lr', 'type':float,
+     'default':None,
+     'help':'base learning rate'},
+    {'name':'cp',
+     'type':candle.str2bool,
+     'default':True, #action="store_true",
+     'help':'checkpoint models with best val_loss'},
+    {'name':'tb',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':'checkpoint models with best val_loss'},
-    {'name':'tb',
+     'default':True, #action="store_true",
+     'help':'use tensorboard'},
+    {'name':'use_mean_growth',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':'use tensorboard'},
-    {'name':'max_val_loss', 'type':float,
-     'help':'retrain if val_loss is greater than the threshold'},
-    {'name':'cv_partition',
-     'choices':['overlapping', 'disjoint', 'disjoint_cells'],
-     'help':"cross validation paritioning scheme: overlapping or disjoint"},
-    {'name':'cv', 'type':int,
-     'help':"cross validation folds"},
-    {'name':'gen',
+     'default':False,
+     'help':'aggregate growth percentage by mean instead of min'},
+    {'name':'max_val_loss', 'type':float,
+     'help':'retrain if val_loss is greater than the threshold'},
+    {'name':'cv_partition',
+     'choices':['overlapping', 'disjoint', 'disjoint_cells'],
+     'help':"cross validation paritioning scheme: overlapping or disjoint"},
+    {'name':'cv', 'type':int,
+     'help':"cross validation folds"},
+    {'name':'gen',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':"use generator for training and validation data"},
-    {'name':'exclude_cells', 'nargs':'+',
-     'default':[],
-     'help':"cell line IDs to exclude"},
-    {'name':'exclude_drugs', 'nargs':'+',
-     'default':[],
-     'help':"drug line IDs to exclude"}
+     'default':True, #action="store_true",
+     'help':"use generator for training and validation data"},
+    {'name':'exclude_cells', 'nargs':'+',
+     'default':[],
+     'help':"cell line IDs to exclude"},
+    {'name':'exclude_drugs', 'nargs':'+',
+     'default':[],
+     'help':"drug line IDs to exclude"}
 ]
 
 
 required = [ 'activation', 'batch_size', 'dense', 'dense_feature_layers', 'drop',
     'epochs', 'learning_rate', 'loss', 'optimizer', 'residual', 'rng_seed',
-    'save', 'scaling', 'feature_subsample', 'validation_split',
+    'save_path', 'scaling', 'feature_subsample', 'validation_split',
     'solr_root', 'timeout'
 ]
 
-class BenchmarkCombo(candle.Benchmark):
-    def set_locals(self):
-        """Functionality to set variables specific for the benchmark
-        - required: set of required parameters for the benchmark.
+class BenchmarkCombo(candle.Benchmark):
+    def set_locals(self):
+        """Functionality to set variables specific for the benchmark
+        - required: set of required parameters for the benchmark.
         - additional_definitions: list of dictionaries describing the additional parameters for the
         benchmark.
         """
@@ -100,4 +104,3 @@ def set_locals(self):
         self.required = set(required)
         if additional_definitions is not None:
             self.additional_definitions = additional_definitions
-
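
The new `use_mean_growth` entry, like most boolean options in this list, is declared with `'type': candle.str2bool` so it can be toggled from the command line or the default-model file. The sketch below is a rough stand-in for how such a string-to-bool option behaves with plain argparse; the `str2bool` helper here is illustrative, not the CANDLE implementation.

```python
import argparse

def str2bool(v):
    # Map common true/false spellings to booleans; reject anything else.
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got {!r}'.format(v))

parser = argparse.ArgumentParser()
parser.add_argument('--use_mean_growth', type=str2bool, default=False,
                    help='aggregate growth percentage by mean instead of min')

print(parser.parse_args(['--use_mean_growth', 'True']).use_mean_growth)  # True
print(parser.parse_args([]).use_mean_growth)                             # False
```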

Pilot1/Combo/combo_baseline_keras2.py
Lines changed: 31 additions & 22 deletions

@@ -34,7 +34,7 @@
 
 import NCI60
 import combo
-import candle_keras as candle
+import candle
 
 logger = logging.getLogger(__name__)
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
@@ -108,6 +108,8 @@ def extension_from_parameters(args):
         ext += '.gen'
     if args.use_combo_score:
         ext += '.scr'
+    if args.use_mean_growth:
+        ext += '.mg'
     for i, n in enumerate(args.dense):
         if n > 0:
             ext += '.D{}={}'.format(i+1, n)
@@ -132,7 +134,8 @@ class ComboDataLoader(object):
 
     def __init__(self, seed, val_split=0.2, shuffle=True,
                  cell_features=['expression'], drug_features=['descriptors'],
-                 response_url=None, use_landmark_genes=False, use_combo_score=False,
+                 response_url=None, use_landmark_genes=False,
+                 use_combo_score=False, use_mean_growth=False,
                  preprocess_rnaseq=None, exclude_cells=[], exclude_drugs=[],
                  feature_subsample=None, scaling='std', scramble=False,
                  cv_partition='overlapping', cv=0):
@@ -163,6 +166,8 @@ def __init__(self, seed, val_split=0.2, shuffle=True,
             only use LINCS1000 landmark genes
         use_combo_score: bool (default False)
             use combination score in place of percent growth (stored in 'GROWTH' column)
+        use_mean_growth: bool (default False)
+            use mean aggregation instead of min on percent growth
         scaling: None, 'std', 'minmax' or 'maxabs' (default 'std')
             type of feature scaling: 'maxabs' to [-1,1], 'maxabs' to [-1, 1], 'std' for standard normalization
         """
@@ -171,7 +176,7 @@ def __init__(self, seed, val_split=0.2, shuffle=True,
 
         np.random.seed(seed)
 
-        df = NCI60.load_combo_response(response_url=response_url, use_combo_score=use_combo_score, fraction=True, exclude_cells=exclude_cells, exclude_drugs=exclude_drugs)
+        df = NCI60.load_combo_response(response_url=response_url, use_combo_score=use_combo_score, use_mean_growth=use_mean_growth, fraction=True, exclude_cells=exclude_cells, exclude_drugs=exclude_drugs)
         logger.info('Loaded {} unique (CL, D1, D2) response sets.'.format(df.shape[0]))
 
         if 'all' in cell_features:
@@ -520,18 +525,18 @@ def log_evaluation(metric_outputs, description='Comparing y_true and y_pred:'):
         logger.info('  {}: {:.4f}'.format(metric, value))
 
 
-def plot_history(out, history, metric='loss', title=None):
-    title = title or 'model {}'.format(metric)
-    val_metric = 'val_{}'.format(metric)
-    plt.figure(figsize=(8, 6))
-    plt.plot(history.history[metric], marker='o')
-    plt.plot(history.history[val_metric], marker='d')
-    plt.title(title)
-    plt.ylabel(metric)
-    plt.xlabel('epoch')
-    plt.legend(['train_{}'.format(metric), 'val_{}'.format(metric)], loc='upper center')
-    png = '{}.plot.{}.png'.format(out, metric)
-    plt.savefig(png, bbox_inches='tight')
+#def plot_history(out, history, metric='loss', title=None):
+#    title = title or 'model {}'.format(metric)
+#    val_metric = 'val_{}'.format(metric)
+#    plt.figure(figsize=(8, 6))
+#    plt.plot(history.history[metric], marker='o')
+#    plt.plot(history.history[val_metric], marker='d')
+#    plt.title(title)
+#    plt.ylabel(metric)
+#    plt.xlabel('epoch')
+#    plt.legend(['train_{}'.format(metric), 'val_{}'.format(metric)], loc='upper center')
+#    png = '{}.plot.{}.png'.format(out, metric)
+#    plt.savefig(png, bbox_inches='tight')
 
 
 class LoggingCallback(Callback):
@@ -643,7 +648,7 @@ def initialize_parameters():
 
     # Build benchmark object
    comboBmk = combo.BenchmarkCombo(combo.file_path, 'combo_default_model.txt', 'keras',
-        prog='combo_baseline',
+        prog='combo_baseline',
        desc = 'Build neural network based models to predict tumor response to drug pairs.')
 
     # Initialize parameters
@@ -661,8 +666,8 @@ def run(params):
     args = Struct(**params)
     set_seed(args.rng_seed)
     ext = extension_from_parameters(args)
-    verify_path(args.save)
-    prefix = args.save + ext
+    verify_path(args.save_path)
+    prefix = args.save_path + ext
     logfile = args.logfile if args.logfile else prefix+'.log'
     set_up_logger(logfile, args.verbose)
     logger.info('Params: {}'.format(params))
@@ -671,6 +676,7 @@ def run(params):
         val_split=args.validation_split,
         cell_features=args.cell_features,
         drug_features=args.drug_features,
+        use_mean_growth=args.use_mean_growth,
         response_url=args.response_url,
         use_landmark_genes=args.use_landmark_genes,
         preprocess_rnaseq=args.preprocess_rnaseq,
@@ -689,7 +695,7 @@ def run(params):
 
     model = build_model(loader, args, verbose=True)
     model.summary()
-    # plot_model(model, to_file=prefix+'.model.png', show_shapes=True)
+    # candle.plot_model(model, to_file=prefix+'.model.png', show_shapes=True)
 
     if args.cp:
         model_json = model.to_json()
@@ -798,16 +804,19 @@ def warmup_scheduler(epoch):
     # print('old_pred:', y_val_pred[:10])
    # print('new_pred:', new_pred[:10])
 
-    plot_history(prefix, history, 'loss')
-    plot_history(prefix, history, 'r2')
+    candle.plot_history(prefix, history, 'loss')
+    candle.plot_history(prefix, history, 'r2')
 
     if K.backend() == 'tensorflow':
         K.clear_session()
 
     if not args.gen:
-        pred_fname = prefix + '.predicted.growth.tsv'
         if args.use_combo_score:
             pred_fname = prefix + '.predicted.score.tsv'
+        elif args.use_mean_growth:
+            pred_fname = prefix + '.predicted.mean.growth.tsv'
+        else:
+            pred_fname = prefix + '.predicted.growth.tsv'
         df_pred = pd.concat(df_pred_list)
         df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g')
 
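
Three of the changes above interact: `save` becomes `save_path`, `extension_from_parameters()` appends `.mg` when mean growth is used, and the prediction file name gains a mean-growth variant. A small illustrative walk-through follows; the argument values are made up, not defaults from the benchmark.

```python
# Illustrative only: how the renamed save_path, the '.mg' extension, and the
# prediction file name fit together after this change.
class Args:
    save_path = 'save/combo'        # renamed from `save`
    use_combo_score = False
    use_mean_growth = True
    gen = False

args = Args()

ext = ''
if args.use_mean_growth:
    ext += '.mg'                    # appended by extension_from_parameters()

prefix = args.save_path + ext       # 'save/combo.mg'

if not args.gen:
    if args.use_combo_score:
        pred_fname = prefix + '.predicted.score.tsv'
    elif args.use_mean_growth:
        pred_fname = prefix + '.predicted.mean.growth.tsv'
    else:
        pred_fname = prefix + '.predicted.growth.tsv'
    print(pred_fname)               # save/combo.mg.predicted.mean.growth.tsv
```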

Pilot1/Combo/combo_default_model.txt
Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ warmup_lr=False
 batch_normalization=False
 feature_subsample=0
 rng_seed=2017
-save='save/combo'
+save_path='save/combo'
 gen=False
 use_combo_score=False
 verbose = False
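
The default-model file keeps the same value under the renamed key. A throwaway check that a key=value file like this one now carries `save_path` (plain string parsing for illustration, not how the CANDLE benchmark actually reads its config):

```python
def read_key(path, key):
    # Scan a simple key=value file and return the (unquoted) value for `key`.
    with open(path) as f:
        for line in f:
            name, sep, value = line.partition('=')
            if sep and name.strip() == key:
                return value.strip().strip("'\"")
    return None

print(read_key('combo_default_model.txt', 'save_path'))  # save/combo
```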
