Commit 925b790

Merge pull request #43 from ECP-CANDLE/develop
Develop merge into Master
2 parents (79b191c + 9b1ed42), commit 925b790

99 files changed: +7542 / -758 lines


Pilot1/Combo/NCI60.py
Lines changed: 21 additions & 4 deletions

@@ -14,7 +14,7 @@
 lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
 sys.path.append(lib_path)
 
-import candle_keras as candle
+import candle
 
 global_cache = {}
 
@@ -104,7 +104,8 @@ def load_dose_response(min_logconc=-4., max_logconc=-4., subsample=None, fractio
     return df
 
 
-def load_combo_response(response_url=None, fraction=False, use_combo_score=False, exclude_cells=[], exclude_drugs=[]):
+def load_combo_response(response_url=None, fraction=False, use_combo_score=False, use_mean_growth=False,
+                        exclude_cells=[], exclude_drugs=[]):
     """Load cell line response to pairs of drugs, sub-select response for a specific
     drug log concentration range and return a pandas dataframe.
@@ -160,8 +161,15 @@ def load_combo_response(response_url=None, fraction=False, use_combo_score=False
     df_max = df.groupby(['CELLNAME', 'NSC1', 'NSC2']).max()
     df_max = df_max.add_suffix('_MAX').reset_index()  # add SCORE_MAX by flattening the hierarchical index
 
+    df_avg = df.copy()
+    df_avg['PERCENTGROWTH'] = df_avg['PERCENTGROWTH'].apply(lambda x: 100 if x > 100 else 50+x/2 if x < 0 else 50+x/2)
+    df_avg = df.groupby(['CELLNAME', 'NSC1', 'NSC2']).mean()
+    df_avg = df_avg.add_suffix('_AVG').reset_index()
+
     if use_combo_score:
         df = df_max.rename(columns={'SCORE_MAX': 'GROWTH'}).drop('PERCENTGROWTH_MAX', axis=1)
+    elif use_mean_growth:
+        df = df_avg.rename(columns={'PERCENTGROWTH_AVG': 'GROWTH'}).drop('SCORE_AVG', axis=1)
     else:
         df = df_min.rename(columns={'PERCENTGROWTH_MIN': 'GROWTH'}).drop('SCORE_MIN', axis=1)
 
@@ -252,6 +260,10 @@ def load_drug_set_descriptors(drug_set='ALMANAC', ncols=None, scaling='std', add
         path = get_file(DATA_URL + 'GDSC_PubChemCID_drug_descriptors_dragon7')
     elif drug_set == 'NCI_IOA_AOA':
         path = get_file(DATA_URL + 'NCI_IOA_AOA_drug_descriptors_dragon7')
+    elif drug_set == 'RTS':
+        path = get_file(DATA_URL + 'RTS_drug_descriptors_dragon7')
+    elif drug_set == 'pan':
+        path = get_file(DATA_URL + 'pan_drugs_dragon7_descriptors.tsv')
     else:
         raise Exception('Drug set {} not supported!'.format(drug_set))
 
@@ -347,7 +359,8 @@ def load_drug_descriptors(ncols=None, scaling='std', add_prefix=True):
         add feature namespace prefix
     """
 
-    path = get_file(DATA_URL + 'ALMANAC_drug_descriptors_dragon7.txt')
+    # path = get_file(DATA_URL + 'ALMANAC_drug_descriptors_dragon7.txt')
+    path = get_file(DATA_URL + 'pan_drugs_dragon7_descriptors.tsv')
 
     df = global_cache.get(path)
     if df is None:
@@ -452,7 +465,11 @@ def load_sample_rnaseq(ncols=None, scaling='std', add_prefix=True, use_landmark_
         df = pd.read_csv(path, sep='\t', engine='c')
         global_cache[path] = df
 
-    df = df[df['Sample'].str.startswith(sample_set)].reset_index(drop=True)
+    if sample_set == 'RTS':
+        df_ids = pd.read_table(get_file(DATA_URL + 'RTS_PDM_samples'))
+        df = df.merge(df_ids, on='Sample').reset_index(drop=True)
+    else:
+        df = df[df['Sample'].str.startswith(sample_set)].reset_index(drop=True)
 
     # cellmap_path = get_file(DATA_URL + 'NCI60_CELLNAME_to_Combo.new.txt')
     # df_cellmap = pd.read_csv(cellmap_path, sep='\t')
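
For orientation, the new `use_mean_growth` branch aggregates percent growth by the mean over each (CELLNAME, NSC1, NSC2) triple instead of the minimum. Below is a minimal, self-contained sketch of that aggregation on made-up data: the column names come from the diff, the toy dataframe is invented, the clipping lambda is simplified (its two sub-100 branches in the diff return the same value), and the sketch groups the transformed copy `df_avg`, whereas the diff groups the original `df`.

```python
import pandas as pd

# Toy response table (invented values) with the column names used in NCI60.py.
df = pd.DataFrame({
    'CELLNAME': ['CL1', 'CL1', 'CL2'],
    'NSC1': [101, 101, 101],
    'NSC2': [202, 202, 202],
    'PERCENTGROWTH': [120.0, -40.0, 60.0],
})

# Rescale percent growth as in the diff: cap at 100, otherwise map x -> 50 + x/2.
df_avg = df.copy()
df_avg['PERCENTGROWTH'] = df_avg['PERCENTGROWTH'].apply(
    lambda x: 100 if x > 100 else 50 + x / 2)

# Mean per (cell line, drug 1, drug 2) triple, then flatten the index and
# suffix the aggregated column, mirroring add_suffix('_AVG') above.
df_avg = df_avg.groupby(['CELLNAME', 'NSC1', 'NSC2']).mean()
df_avg = df_avg.add_suffix('_AVG').reset_index()

print(df_avg)  # one PERCENTGROWTH_AVG value per (CELLNAME, NSC1, NSC2)
```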

Pilot1/Combo/README.md
Lines changed: 13 additions & 0 deletions

@@ -184,3 +184,16 @@ GDSC.22RV1 NSC.105014 NSC.102816 100 0.1627 0.1060 -0.0531
 
 A version of trained model files with dropout are available here: [saved.uq.model.h5](http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/combo/saved.uq.model.h5) and [saved.uq.weights.h5](http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/combo/saved.uq.weights.h5).
 
+## Profile runs
+We have run the same configuration across multiple machines and compared the resource utilization.
+```
+python uno_baseline_keras2.py --conf combo_perf_benchmark.txt
+```
+
+| Machine | Time to complete (HH:mm:ss) | Time per epoch (s) | Perf factor <sup>*</sup> | CPU % | Mem % | Mem GB | GPU % | GPU Mem % | Note |
+| ------- | --------------------------: | -----------------: | -----------------------: | ----: | ----: | -----: | ----: | --------: | ---- |
+| Theta | 1:14:12 | 811 | 0.31 | 7.6 | 7.6 | 12.8 | | | |
+| Nucleus | 0:14:13 | 72 | 3.47 | 3.8 | 9.3 | 21.9 | 63.4 | 91.9 | |
+| Tesla (K20) | 0:44:17 | 250 | 1.00 | 3.9 | 42.3 | 12.9 | 73.8 | 53.3 | |
+| Titan | | | | | | | | | keras version 2.0.3 does not support model.clone_model(), which was introduced in 2.0.7 |
+* Time per epoch of Titan (or Tesla) divided by time per epoch on the machine
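
Reading the perf factor column in the added table: it is the reference time per epoch (Tesla K20, 250 s, perf factor 1.00) divided by the machine's time per epoch, so larger is faster. A quick check against the numbers above:

```python
# Perf factor = reference (Tesla K20) seconds per epoch / machine seconds per epoch.
reference = 250  # Tesla (K20), by definition 1.00
for machine, sec_per_epoch in [('Theta', 811), ('Nucleus', 72), ('Tesla (K20)', 250)]:
    print('{:12s} {:.2f}'.format(machine, reference / sec_per_epoch))
# Theta 0.31, Nucleus 3.47, Tesla (K20) 1.00 -- matching the table.
```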

Pilot1/Combo/combo.py
Lines changed: 61 additions & 58 deletions

@@ -12,86 +12,90 @@
 lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
 sys.path.append(lib_path2)
 
-import candle_keras as candle
+import candle
 
 logger = logging.getLogger(__name__)
 
 additional_definitions = [
-    {'name':'cell_features',
-     'nargs':'+',
-     'choices':['expression', 'mirna', 'proteome', 'all', 'expression_5platform', 'expression_u133p2', 'rnaseq', 'categorical'],
-     'help':"use one or more cell line feature sets: 'expression', 'mirna', 'proteome', 'all'; use all for ['expression', 'mirna', 'proteome']; use 'categorical' for one-hot encoded cell lines"},
-    {'name':'drug_features', 'nargs':'+',
-     'choices':['descriptors', 'latent', 'all', 'categorical', 'noise'],
-     'help':"use dragon7 descriptors, latent representations from Aspuru-Guzik's SMILES autoencoder, or both, or one-hot encoded drugs, or random features; 'descriptors','latent', 'all', 'categorical', 'noise'"},
-    {'name':'dense_feature_layers',
-     'nargs':'+',
-     'type':int,
-     'help':'number of neurons in intermediate dense layers in the feature encoding submodels'},
-    {'name':'use_landmark_genes',
+    {'name':'cell_features',
+     'nargs':'+',
+     'choices':['expression', 'mirna', 'proteome', 'all', 'expression_5platform', 'expression_u133p2', 'rnaseq', 'categorical'],
+     'help':"use one or more cell line feature sets: 'expression', 'mirna', 'proteome', 'all'; use all for ['expression', 'mirna', 'proteome']; use 'categorical' for one-hot encoded cell lines"},
+    {'name':'drug_features', 'nargs':'+',
+     'choices':['descriptors', 'latent', 'all', 'categorical', 'noise'],
+     'help':"use dragon7 descriptors, latent representations from Aspuru-Guzik's SMILES autoencoder, or both, or one-hot encoded drugs, or random features; 'descriptors','latent', 'all', 'categorical', 'noise'"},
+    {'name':'dense_feature_layers',
+     'nargs':'+',
+     'type':int,
+     'help':'number of neurons in intermediate dense layers in the feature encoding submodels'},
+    {'name':'use_landmark_genes',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':"use the 978 landmark genes from LINCS (L1000) as expression features"},
-    {'name':'preprocess_rnaseq',
+     'default':True, #action="store_true",
+     'help':"use the 978 landmark genes from LINCS (L1000) as expression features"},
+    {'name':'preprocess_rnaseq',
     'default':'none',
-     'choices':['source_scale', 'combat', 'none'],
-     'help':"preprocessing method for RNAseq data; none for global normalization"},
-    {'name':'response_url',
+     'choices':['source_scale', 'combat', 'none'],
+     'help':"preprocessing method for RNAseq data; none for global normalization"},
+    {'name':'response_url',
     'default':None,
-     'help':"URL to combo dose response file"},
+     'help':"URL to combo dose response file"},
    {'name':'residual',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':"add skip connections to the layers"},
+     'default':True, #action="store_true",
+     'help':"add skip connections to the layers"},
    {'name':'reduce_lr',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':'reduce learning rate on plateau'},
-    {'name':'warmup_lr',
+     'default':True, #action="store_true",
+     'help':'reduce learning rate on plateau'},
+    {'name':'warmup_lr',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':'gradually increase learning rate on start'},
-    {'name':'base_lr', 'type':float,
-     'default':None,
-     'help':'base learning rate'},
-    {'name':'cp',
+     'default':True, #action="store_true",
+     'help':'gradually increase learning rate on start'},
+    {'name':'base_lr', 'type':float,
+     'default':None,
+     'help':'base learning rate'},
+    {'name':'cp',
+     'type':candle.str2bool,
+     'default':True, #action="store_true",
+     'help':'checkpoint models with best val_loss'},
+    {'name':'tb',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':'checkpoint models with best val_loss'},
-    {'name':'tb',
+     'default':True, #action="store_true",
+     'help':'use tensorboard'},
+    {'name':'use_mean_growth',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':'use tensorboard'},
-    {'name':'max_val_loss', 'type':float,
-     'help':'retrain if val_loss is greater than the threshold'},
-    {'name':'cv_partition',
-     'choices':['overlapping', 'disjoint', 'disjoint_cells'],
-     'help':"cross validation paritioning scheme: overlapping or disjoint"},
-    {'name':'cv', 'type':int,
-     'help':"cross validation folds"},
-    {'name':'gen',
+     'default':False,
+     'help':'aggregate growth percentage by mean instead of min'},
+    {'name':'max_val_loss', 'type':float,
+     'help':'retrain if val_loss is greater than the threshold'},
+    {'name':'cv_partition',
+     'choices':['overlapping', 'disjoint', 'disjoint_cells'],
+     'help':"cross validation paritioning scheme: overlapping or disjoint"},
+    {'name':'cv', 'type':int,
+     'help':"cross validation folds"},
+    {'name':'gen',
     'type':candle.str2bool,
-     'default':True, #action="store_true",
-     'help':"use generator for training and validation data"},
-    {'name':'exclude_cells', 'nargs':'+',
-     'default':[],
-     'help':"cell line IDs to exclude"},
-    {'name':'exclude_drugs', 'nargs':'+',
-     'default':[],
-     'help':"drug line IDs to exclude"}
+     'default':True, #action="store_true",
+     'help':"use generator for training and validation data"},
+    {'name':'exclude_cells', 'nargs':'+',
+     'default':[],
+     'help':"cell line IDs to exclude"},
+    {'name':'exclude_drugs', 'nargs':'+',
+     'default':[],
+     'help':"drug line IDs to exclude"}
 ]
 
 
 required = [ 'activation', 'batch_size', 'dense', 'dense_feature_layers', 'drop',
     'epochs', 'learning_rate', 'loss', 'optimizer', 'residual', 'rng_seed',
-    'save', 'scaling', 'feature_subsample', 'validation_split',
+    'save_path', 'scaling', 'feature_subsample', 'validation_split',
     'solr_root', 'timeout'
 ]
 
-class BenchmarkCombo(candle.Benchmark):
-    def set_locals(self):
-        """Functionality to set variables specific for the benchmark
-        - required: set of required parameters for the benchmark.
+class BenchmarkCombo(candle.Benchmark):
+    def set_locals(self):
+        """Functionality to set variables specific for the benchmark
+        - required: set of required parameters for the benchmark.
         - additional_definitions: list of dictionaries describing the additional parameters for the
         benchmark.
         """
@@ -100,4 +104,3 @@ def set_locals(self):
         self.required = set(required)
         if additional_definitions is not None:
             self.additional_definitions = additional_definitions
-
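
The new `use_mean_growth` entry, like most boolean options in this list, is declared with `'type': candle.str2bool` so it can be toggled from the command line or the default-model file. The sketch below is a rough stand-in for how such a string-to-bool option behaves with plain argparse; the `str2bool` helper here is illustrative, not the CANDLE implementation.

```python
import argparse

def str2bool(v):
    # Map common true/false spellings to booleans; reject anything else.
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got {!r}'.format(v))

parser = argparse.ArgumentParser()
parser.add_argument('--use_mean_growth', type=str2bool, default=False,
                    help='aggregate growth percentage by mean instead of min')

print(parser.parse_args(['--use_mean_growth', 'True']).use_mean_growth)  # True
print(parser.parse_args([]).use_mean_growth)                             # False
```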

Pilot1/Combo/combo_baseline_keras2.py
Lines changed: 31 additions & 22 deletions

@@ -34,7 +34,7 @@
 
 import NCI60
 import combo
-import candle_keras as candle
+import candle
 
 logger = logging.getLogger(__name__)
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
@@ -108,6 +108,8 @@ def extension_from_parameters(args):
         ext += '.gen'
     if args.use_combo_score:
         ext += '.scr'
+    if args.use_mean_growth:
+        ext += '.mg'
     for i, n in enumerate(args.dense):
         if n > 0:
             ext += '.D{}={}'.format(i+1, n)
@@ -132,7 +134,8 @@ class ComboDataLoader(object):
 
     def __init__(self, seed, val_split=0.2, shuffle=True,
                  cell_features=['expression'], drug_features=['descriptors'],
-                 response_url=None, use_landmark_genes=False, use_combo_score=False,
+                 response_url=None, use_landmark_genes=False,
+                 use_combo_score=False, use_mean_growth=False,
                  preprocess_rnaseq=None, exclude_cells=[], exclude_drugs=[],
                  feature_subsample=None, scaling='std', scramble=False,
                  cv_partition='overlapping', cv=0):
@@ -163,6 +166,8 @@ def __init__(self, seed, val_split=0.2, shuffle=True,
             only use LINCS1000 landmark genes
         use_combo_score: bool (default False)
             use combination score in place of percent growth (stored in 'GROWTH' column)
+        use_mean_growth: bool (default False)
+            use mean aggregation instead of min on percent growth
         scaling: None, 'std', 'minmax' or 'maxabs' (default 'std')
             type of feature scaling: 'maxabs' to [-1,1], 'maxabs' to [-1, 1], 'std' for standard normalization
         """
@@ -171,7 +176,7 @@ def __init__(self, seed, val_split=0.2, shuffle=True,
 
         np.random.seed(seed)
 
-        df = NCI60.load_combo_response(response_url=response_url, use_combo_score=use_combo_score, fraction=True, exclude_cells=exclude_cells, exclude_drugs=exclude_drugs)
+        df = NCI60.load_combo_response(response_url=response_url, use_combo_score=use_combo_score, use_mean_growth=use_mean_growth, fraction=True, exclude_cells=exclude_cells, exclude_drugs=exclude_drugs)
         logger.info('Loaded {} unique (CL, D1, D2) response sets.'.format(df.shape[0]))
 
         if 'all' in cell_features:
@@ -520,18 +525,18 @@ def log_evaluation(metric_outputs, description='Comparing y_true and y_pred:'):
         logger.info('  {}: {:.4f}'.format(metric, value))
 
 
-def plot_history(out, history, metric='loss', title=None):
-    title = title or 'model {}'.format(metric)
-    val_metric = 'val_{}'.format(metric)
-    plt.figure(figsize=(8, 6))
-    plt.plot(history.history[metric], marker='o')
-    plt.plot(history.history[val_metric], marker='d')
-    plt.title(title)
-    plt.ylabel(metric)
-    plt.xlabel('epoch')
-    plt.legend(['train_{}'.format(metric), 'val_{}'.format(metric)], loc='upper center')
-    png = '{}.plot.{}.png'.format(out, metric)
-    plt.savefig(png, bbox_inches='tight')
+#def plot_history(out, history, metric='loss', title=None):
+#    title = title or 'model {}'.format(metric)
+#    val_metric = 'val_{}'.format(metric)
+#    plt.figure(figsize=(8, 6))
+#    plt.plot(history.history[metric], marker='o')
+#    plt.plot(history.history[val_metric], marker='d')
+#    plt.title(title)
+#    plt.ylabel(metric)
+#    plt.xlabel('epoch')
+#    plt.legend(['train_{}'.format(metric), 'val_{}'.format(metric)], loc='upper center')
+#    png = '{}.plot.{}.png'.format(out, metric)
+#    plt.savefig(png, bbox_inches='tight')
 
 
 class LoggingCallback(Callback):
@@ -643,7 +648,7 @@ def initialize_parameters():
 
     # Build benchmark object
    comboBmk = combo.BenchmarkCombo(combo.file_path, 'combo_default_model.txt', 'keras',
-        prog='combo_baseline',
+        prog='combo_baseline',
        desc = 'Build neural network based models to predict tumor response to drug pairs.')
 
     # Initialize parameters
@@ -661,8 +666,8 @@ def run(params):
     args = Struct(**params)
     set_seed(args.rng_seed)
     ext = extension_from_parameters(args)
-    verify_path(args.save)
-    prefix = args.save + ext
+    verify_path(args.save_path)
+    prefix = args.save_path + ext
     logfile = args.logfile if args.logfile else prefix+'.log'
     set_up_logger(logfile, args.verbose)
     logger.info('Params: {}'.format(params))
@@ -671,6 +676,7 @@ def run(params):
         val_split=args.validation_split,
         cell_features=args.cell_features,
         drug_features=args.drug_features,
+        use_mean_growth=args.use_mean_growth,
         response_url=args.response_url,
         use_landmark_genes=args.use_landmark_genes,
         preprocess_rnaseq=args.preprocess_rnaseq,
@@ -689,7 +695,7 @@ def run(params):
 
     model = build_model(loader, args, verbose=True)
     model.summary()
-    # plot_model(model, to_file=prefix+'.model.png', show_shapes=True)
+    # candle.plot_model(model, to_file=prefix+'.model.png', show_shapes=True)
 
     if args.cp:
         model_json = model.to_json()
@@ -798,16 +804,19 @@ def warmup_scheduler(epoch):
     # print('old_pred:', y_val_pred[:10])
    # print('new_pred:', new_pred[:10])
 
-    plot_history(prefix, history, 'loss')
-    plot_history(prefix, history, 'r2')
+    candle.plot_history(prefix, history, 'loss')
+    candle.plot_history(prefix, history, 'r2')
 
     if K.backend() == 'tensorflow':
         K.clear_session()
 
     if not args.gen:
-        pred_fname = prefix + '.predicted.growth.tsv'
         if args.use_combo_score:
             pred_fname = prefix + '.predicted.score.tsv'
+        elif args.use_mean_growth:
+            pred_fname = prefix + '.predicted.mean.growth.tsv'
+        else:
+            pred_fname = prefix + '.predicted.growth.tsv'
         df_pred = pd.concat(df_pred_list)
         df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g')
 
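
Three of the changes above interact: `save` becomes `save_path`, `extension_from_parameters()` appends `.mg` when mean growth is used, and the prediction file name gains a mean-growth variant. A small illustrative walk-through follows; the argument values are made up, not defaults from the benchmark.

```python
# Illustrative only: how the renamed save_path, the '.mg' extension, and the
# prediction file name fit together after this change.
class Args:
    save_path = 'save/combo'        # renamed from `save`
    use_combo_score = False
    use_mean_growth = True
    gen = False

args = Args()

ext = ''
if args.use_mean_growth:
    ext += '.mg'                    # appended by extension_from_parameters()

prefix = args.save_path + ext       # 'save/combo.mg'

if not args.gen:
    if args.use_combo_score:
        pred_fname = prefix + '.predicted.score.tsv'
    elif args.use_mean_growth:
        pred_fname = prefix + '.predicted.mean.growth.tsv'
    else:
        pred_fname = prefix + '.predicted.growth.tsv'
    print(pred_fname)               # save/combo.mg.predicted.mean.growth.tsv
```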

Pilot1/Combo/combo_default_model.txt
Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ warmup_lr=False
 batch_normalization=False
 feature_subsample=0
 rng_seed=2017
-save='save/combo'
+save_path='save/combo'
 gen=False
 use_combo_score=False
 verbose = False
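
The default-model file keeps the same value under the renamed key. A throwaway check that a key=value file like this one now carries `save_path` (plain string parsing for illustration, not how the CANDLE benchmark actually reads its config):

```python
def read_key(path, key):
    # Scan a simple key=value file and return the (unquoted) value for `key`.
    with open(path) as f:
        for line in f:
            name, sep, value = line.partition('=')
            if sep and name.strip() == key:
                return value.strip().strip("'\"")
    return None

print(read_key('combo_default_model.txt', 'save_path'))  # save/combo
```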
