Added Combo to Release01

jmohdyusof · jmohdyusof · commit 724295db5cb8 · 2018-05-23T19:28:55.000-06:00
diff --git a/Pilot1/Combo/NCI60.py b/Pilot1/Combo/NCI60.py
@@ -11,11 +11,10 @@
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
 
 file_path = os.path.dirname(os.path.realpath(__file__))
-lib_path = os.path.abspath(os.path.join(file_path, '..', 'common'))
+lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
 sys.path.append(lib_path)
 
-import p1_common
-
+import candle_keras as candle
 
 global_cache = {}
 
@@ -25,7 +24,7 @@
 
 
 def get_file(url):
-    return p1_common.get_p1_file(url)
+    return candle.fetch_file(url, 'Pilot1')
 
 
 def impute_and_scale(df, scaling='std'):
@@ -444,7 +443,7 @@ def load_sample_rnaseq(ncols=None, scaling='std', add_prefix=True, use_landmark_
 
     if preprocess_rnaseq and preprocess_rnaseq != 'none':
         scaling = None
-        filename += ('_' + preprocess_rnaseq)  # 'scale_per_source' or 'combat'
+        filename += ('_' + preprocess_rnaseq)  # 'source_scale' or 'combat'
 
     path = get_file(DATA_URL + filename)
 
@@ -489,7 +488,7 @@ def load_cell_expression_rnaseq(ncols=None, scaling='std', add_prefix=True, use_
 
     if preprocess_rnaseq and preprocess_rnaseq != 'none':
         scaling = None
-        filename += ('_' + preprocess_rnaseq)  # 'scale_per_source' or 'combat'
+        filename += ('_' + preprocess_rnaseq)  # 'source_scale' or 'combat'
 
     path = get_file(DATA_URL + filename)
 
diff --git a/Pilot1/Combo/combo.py b/Pilot1/Combo/combo.py
@@ -4,113 +4,100 @@
 import sys
 import logging
 import argparse
-try:
-    import configparser
-except ImportError:
-    import ConfigParser as configparser
 
 import pandas as pd
 import numpy as np
 
 file_path = os.path.dirname(os.path.realpath(__file__))
-lib_path = os.path.abspath(os.path.join(file_path, '..', 'common'))
-sys.path.append(lib_path)
-# lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
-# sys.path.append(lib_path2)
+lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
+sys.path.append(lib_path2)
 
-import p1_common
+import candle_keras as candle
 
 logger = logging.getLogger(__name__)
 
-
-def common_parser(parser):
-    parser.add_argument("--config-file", dest='config_file', type=str,
-                        default=os.path.join(file_path, 'combo_default_model.txt'),
-                        help="specify model configuration file")
-
-    # Parse has been split between arguments that are common with the default neon parser
-    # and all the other options
-    parser = p1_common.get_default_neon_parse(parser)
-    parser = p1_common.get_p1_common_parser(parser)
-
-    # Arguments that are applicable just to combo
-    parser = combo_parser(parser)
-
-    return parser
-
-
-def combo_parser(parser):
-    parser.add_argument("--cell_features", nargs='+',
-                        default=argparse.SUPPRESS,
-                        choices=['expression', 'mirna', 'proteome', 'all', 'expression_5platform', 'expression_u133p2', 'rnaseq', 'categorical'],
-                        help="use one or more cell line feature sets: 'expression', 'mirna', 'proteome', 'all'; use all for ['expression', 'mirna', 'proteome']; use 'categorical' for one-hot encoded cell lines")
-    parser.add_argument("--drug_features", nargs='+',
-                        default=argparse.SUPPRESS,
-                        choices=['descriptors', 'latent', 'all', 'categorical', 'noise'],
-                        help="use dragon7 descriptors, latent representations from Aspuru-Guzik's SMILES autoencoder, or both, or one-hot encoded drugs, or random features; 'descriptors','latent', 'all', 'categorical', 'noise'")
-    parser.add_argument('--dense_feature_layers', nargs='+', type=int,
-                        default=argparse.SUPPRESS,
-                        help='number of neurons in intermediate dense layers in the feature encoding submodels')
-    parser.add_argument("--use_landmark_genes", action="store_true",
-                        help="use the 978 landmark genes from LINCS (L1000) as expression features")
-    parser.add_argument("--preprocess_rnaseq",
-                        choices=['scale_per_source', 'combat', 'none'],
-                        help="preprocessing method for RNAseq data; none for global normalization")
-    parser.add_argument("--response_url",
-                        help="URL to combo dose response file")
-    parser.add_argument("--residual", action="store_true",
-                        help="add skip connections to the layers")
-    parser.add_argument('--reduce_lr', action='store_true',
-                        help='reduce learning rate on plateau')
-    parser.add_argument('--warmup_lr', action='store_true',
-                        help='gradually increase learning rate on start')
-    parser.add_argument('--base_lr', type=float,
-                        default=None,
-                        help='base learning rate')
-    parser.add_argument('--cp', action='store_true',
-                        help='checkpoint models with best val_loss')
-    parser.add_argument('--tb', action='store_true',
-                        help='use tensorboard')
-    parser.add_argument('--max_val_loss', type=float,
-                        default=argparse.SUPPRESS,
-                        help='retrain if val_loss is greater than the threshold')
-    parser.add_argument("--cv_partition",
-                        choices=['overlapping', 'disjoint', 'disjoint_cells'],
-                        default=argparse.SUPPRESS,
-                        help="cross validation paritioning scheme: overlapping or disjoint")
-    parser.add_argument("--cv", type=int,
-                        default=argparse.SUPPRESS,
-                        help="cross validation folds")
-    parser.add_argument("--gen", action="store_true",
-                        help="use generator for training and validation data")
-    parser.add_argument("--exclude_cells", nargs='+',
-                        default=[],
-                        help="cell line IDs to exclude")
-    parser.add_argument("--exclude_drugs", nargs='+',
-                        default=[],
-                        help="drug line IDs to exclude")
-
-    return parser
-
-
-def read_config_file(file):
-    config = configparser.ConfigParser()
-    config.read(file)
-    section = config.sections()
-
-    args = [['activation', 'batch_size', 'dense', 'dense_feature_layers', 'drop',
+additional_definitions = [
+{'name':'cell_features', 
+    'nargs':'+', 
+    'choices':['expression', 'mirna', 'proteome', 'all', 'expression_5platform', 'expression_u133p2', 'rnaseq', 'categorical'], 
+    'help':"use one or more cell line feature sets: 'expression', 'mirna', 'proteome', 'all'; use all for ['expression', 'mirna', 'proteome']; use 'categorical' for one-hot encoded cell lines"}, 
+{'name':'drug_features', 'nargs':'+', 
+    'choices':['descriptors', 'latent', 'all', 'categorical', 'noise'], 
+    'help':"use dragon7 descriptors, latent representations from Aspuru-Guzik's SMILES autoencoder, or both, or one-hot encoded drugs, or random features; 'descriptors','latent', 'all', 'categorical', 'noise'"}, 
+{'name':'dense_feature_layers', 
+    'nargs':'+', 
+    'type':int, 
+    'help':'number of neurons in intermediate dense layers in the feature encoding submodels'}, 
+{'name':'use_landmark_genes', 
+    'type':candle.str2bool,
+    'default':True,  # NOTE: was action="store_true" (argparse default False) before this change
+    'help':"use the 978 landmark genes from LINCS (L1000) as expression features"}, 
+{'name':'preprocess_rnaseq', 
+    'default':'none',
+    'choices':['source_scale', 'combat', 'none'], 
+    'help':"preprocessing method for RNAseq data; none for global normalization"}, 
+{'name':'response_url', 
+    'default':None,
+    'help':"URL to combo dose response file"}, 
+{'name':'residual',
+    'type':candle.str2bool,
+    'default':True,  # NOTE: was action="store_true" (argparse default False) before this change
+    'help':"add skip connections to the layers"}, 
+{'name':'reduce_lr',
+    'type':candle.str2bool,
+    'default':True,  # NOTE: was action="store_true" (argparse default False) before this change
+    'help':'reduce learning rate on plateau'}, 
+{'name':'warmup_lr', 
+    'type':candle.str2bool,
+    'default':True,  # NOTE: was action="store_true" (argparse default False) before this change
+    'help':'gradually increase learning rate on start'}, 
+{'name':'base_lr', 'type':float, 
+    'default':None, 
+    'help':'base learning rate'}, 
+{'name':'cp', 
+    'type':candle.str2bool,
+    'default':True,  # NOTE: was action="store_true" (argparse default False) before this change
+    'help':'checkpoint models with best val_loss'}, 
+{'name':'tb', 
+    'type':candle.str2bool,
+    'default':True,  # NOTE: was action="store_true" (argparse default False) before this change
+    'help':'use tensorboard'}, 
+{'name':'max_val_loss', 'type':float, 
+    'help':'retrain if val_loss is greater than the threshold'}, 
+{'name':'cv_partition', 
+    'choices':['overlapping', 'disjoint', 'disjoint_cells'], 
+    'help':"cross validation partitioning scheme: overlapping, disjoint or disjoint_cells"},
+{'name':'cv', 'type':int, 
+    'help':"cross validation folds"}, 
+{'name':'gen', 
+    'type':candle.str2bool,
+    'default':True,  # NOTE: was action="store_true" (argparse default False) before this change
+    'help':"use generator for training and validation data"}, 
+{'name':'exclude_cells', 'nargs':'+', 
+    'default':[], 
+    'help':"cell line IDs to exclude"}, 
+{'name':'exclude_drugs', 'nargs':'+', 
+    'default':[], 
+    'help':"drug IDs to exclude"}
+]
+
+
+required = [ 'activation', 'batch_size', 'dense', 'dense_feature_layers', 'drop',
              'epochs', 'learning_rate', 'loss', 'optimizer', 'residual', 'rng_seed',
-             'save', 'scaling', 'feature_subsample', 'validation_split'],
-            ['solr_root', 'timeout']]
-
-    file_params = {}
-    for i, sec_args in enumerate(args):
-        for arg in sec_args:
-            file_params[arg] = eval(config.get(section[i], arg))
-
-    # parse the remaining values
-    for k, v in config.items(section[0]):
-        if not k in file_params:
-            file_params[k] = eval(v)
+             'save', 'scaling', 'feature_subsample', 'validation_split',
+            'solr_root', 'timeout'
+	    ]
+
+class BenchmarkCombo(candle.Benchmark): 
+    def set_locals(self): 
+        """Set the variables specific to this benchmark.
+        - required: set of required parameters for the benchmark.  
+        - additional_definitions: list of dictionaries describing the additional parameters for the
+        benchmark.
+        """
+
+        if required is not None:
+            self.required = set(required)
+        if additional_definitions is not None:
+            self.additional_definitions = additional_definitions
 
-    return file_params
diff --git a/Pilot1/Combo/combo_baseline_keras2.py b/Pilot1/Combo/combo_baseline_keras2.py
@@ -30,23 +30,15 @@
 mpl.use('Agg')
 import matplotlib.pyplot as plt
 
-
 import combo
-import p1_common
-# import p1_common_keras
-from solr_keras import CandleRemoteMonitor, compute_trainable_params, TerminateOnTimeOut
-
-# import argparser
-# from datasets import NCI60
 
 import NCI60
 import combo
-
+import candle_keras as candle
 
 logger = logging.getLogger(__name__)
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 
-
 def set_seed(seed):
     os.environ['PYTHONHASHSEED'] = '0'
     np.random.seed(seed)
@@ -647,26 +639,18 @@ def build_model(loader, args, verbose=False):
 
     return Model(inputs, output)
 
+def initialize_parameters():
 
+    # Build benchmark object
+    comboBmk = combo.BenchmarkCombo(combo.file_path, 'combo_default_model.txt', 'keras',
+        prog='combo_baseline', 
+        desc = 'Build neural network based models to predict tumor response to drug pairs.')
 
-def get_combo_parser():
-    description = 'Build neural network based models to predict tumor response to drug pairs.'
-    parser = argparse.ArgumentParser(prog='combo_baseline', formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-                                     description=description)
-    return combo.common_parser(parser)
-
-
-def initialize_parameters():
-    # Get command-line parameters
-    parser = get_combo_parser()
-    args = parser.parse_args()
-    # Get parameters from configuration file
-    file_params = combo.read_config_file(args.config_file)
-    # Consolidate parameter set. Command-line parameters overwrite file configuration
-    params = p1_common.args_overwrite_config(args, file_params)
-    # print(params)
-    return params
+    # Initialize parameters
+    gParameters = candle.initialize_parameters(comboBmk)
+    #combo.logger.info('Params: {}'.format(gParameters))
 
+    return gParameters
 
 class Struct:
     def __init__(self, **entries):
@@ -740,10 +724,10 @@ def warmup_scheduler(epoch):
         model.compile(loss=args.loss, optimizer=optimizer, metrics=[mae, r2])
 
         # calculate trainable and non-trainable params
-        params.update(compute_trainable_params(model))
+        params.update(candle.compute_trainable_params(model))
 
-        candle_monitor = CandleRemoteMonitor(params=params)
-        timeout_monitor = TerminateOnTimeOut(params['timeout'])
+        candle_monitor = candle.CandleRemoteMonitor(params=params)
+        timeout_monitor = candle.TerminateOnTimeOut(params['timeout'])
 
         reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001)
         warmup_lr = LearningRateScheduler(warmup_scheduler)
diff --git a/Pilot1/Combo/infer.py b/Pilot1/Combo/infer.py
@@ -81,7 +81,7 @@ def get_parser(description=None):
     parser.add_argument("--use_landmark_genes", action="store_true",
                         help="use the 978 landmark genes from LINCS (L1000) as expression features")
     parser.add_argument("--preprocess_rnaseq",
-                        choices=['scale_per_source', 'combat', 'none'],
+                        choices=['source_scale', 'combat', 'none'],
                         help="preprocessing method for RNAseq data; none for global normalization")
 
     return parser
diff --git a/Pilot1/Combo/infer_dose.py b/Pilot1/Combo/infer_dose.py
@@ -63,7 +63,7 @@ def get_parser(description=None):
     parser.add_argument("--use_landmark_genes", action="store_true",
                         help="use the 978 landmark genes from LINCS (L1000) as expression features")
     parser.add_argument("--preprocess_rnaseq",
-                        choices=['scale_per_source', 'combat', 'none'],
+                        choices=['source_scale', 'combat', 'none'],
                         help="preprocessing method for RNAseq data; none for global normalization")
     parser.add_argument("--skip_single_prediction_cleanup", action="store_true",
                         help="skip removing single drug predictions with two different concentrations")