@@ -3,11 +3,6 @@
 import os
 import sys
 import logging
-import argparse
-try:
-    import configparser
-except ImportError:
-    import ConfigParser as configparser
 
 import pandas as pd
 import numpy as np
@@ -17,101 +12,100 @@
 from scipy.stats.stats import pearsonr
 
 file_path = os.path.dirname(os.path.realpath(__file__))
-lib_path = os.path.abspath(os.path.join(file_path, '..', 'common'))
-sys.path.append(lib_path)
+# lib_path = os.path.abspath(os.path.join(file_path, '..'))
+# sys.path.append(lib_path)
 
 lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
 sys.path.append(lib_path2)
 
-import p1_common
-
-url_p1b1 = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/'
-file_train = 'P1B1.dev.train.csv'
-file_test = 'P1B1.dev.test.csv'
+import candle_keras as candle
 
 logger = logging.getLogger(__name__)
 
-def common_parser(parser):
-
-    parser.add_argument("--config-file", dest='config_file', type=str,
-                        default=os.path.join(file_path, 'p1b1_default_model.txt'),
-                        help="specify model configuration file")
-
-    # Parse has been split between arguments that are common with the default neon parser
-    # and all the other options
-    parser = p1_common.get_default_neon_parse(parser)
-    parser = p1_common.get_p1_common_parser(parser)
-
-    # Arguments that are applicable just to p1b1
-    parser = p1b1_parser(parser)
-
-    return parser
-
-
-def p1b1_parser(parser):
-    parser.add_argument("--latent_dim", type=int,
-                        default=argparse.SUPPRESS,
-                        help="latent dimensions")
-    parser.add_argument('-m', '--model',
-                        default=argparse.SUPPRESS,
-                        help='model to use: ae, vae, cvae')
-    parser.add_argument("--use_landmark_genes", action="store_true",
-                        help="use the 978 landmark genes from LINCS (L1000) as expression features")
-    parser.add_argument("--residual", action="store_true",
-                        help="add skip connections to the layers")
-    parser.add_argument('--reduce_lr', action='store_true',
-                        help='reduce learning rate on plateau')
-    parser.add_argument('--warmup_lr', action='store_true',
-                        help='gradually increase learning rate on start')
-    parser.add_argument('--base_lr', type=float,
-                        default=None,
-                        help='base learning rate')
-    parser.add_argument("--epsilon_std", type=float,
-                        default=argparse.SUPPRESS,
-                        help="epsilon std for sampling latent noise")
-    parser.add_argument('--cp', action='store_true',
-                        help='checkpoint models with best val_loss')
-    parser.add_argument('--tb', action='store_true',
-                        help='use tensorboard')
-    parser.add_argument('--tsne', action='store_true',
-                        help='generate tsne plot of the latent representation')
-
-    return parser
-
-
-def read_config_file(file):
-    config = configparser.ConfigParser()
-    config.read(file)
-    section = config.sections()
-    file_params = {}
-    file_params['activation'] = eval(config.get(section[0], 'activation'))
-    file_params['batch_size'] = eval(config.get(section[0], 'batch_size'))
-    file_params['dense'] = eval(config.get(section[0], 'dense'))
-    file_params['drop'] = eval(config.get(section[0], 'drop'))
-    file_params['epochs'] = eval(config.get(section[0], 'epochs'))
-    file_params['initialization'] = eval(config.get(section[0], 'initialization'))
-    file_params['learning_rate'] = eval(config.get(section[0], 'learning_rate'))
-    file_params['loss'] = eval(config.get(section[0], 'loss'))
-    file_params['noise_factor'] = eval(config.get(section[0], 'noise_factor'))
-    file_params['optimizer'] = eval(config.get(section[0], 'optimizer'))
-    file_params['rng_seed'] = eval(config.get(section[0], 'rng_seed'))
-    file_params['model'] = eval(config.get(section[0], 'model'))
-    file_params['scaling'] = eval(config.get(section[0], 'scaling'))
-    file_params['validation_split'] = eval(config.get(section[0], 'validation_split'))
-    file_params['latent_dim'] = eval(config.get(section[0], 'latent_dim'))
-    file_params['feature_subsample'] = eval(config.get(section[0], 'feature_subsample'))
-    file_params['batch_normalization'] = eval(config.get(section[0], 'batch_normalization'))
-    file_params['epsilon_std'] = eval(config.get(section[0], 'epsilon_std'))
-
-    file_params['solr_root'] = eval(config.get(section[1], 'solr_root'))
-    file_params['timeout'] = eval(config.get(section[1], 'timeout'))
-
-    # parse the remaining values
-    for k, v in config.items(section[0]):
-        if not k in file_params:
-            file_params[k] = eval(v)
-
-    return file_params
-
+additional_definitions = [
+    {'name': 'latent_dim',
+     'action': 'store',
+     'type': int,
+     'help': 'latent dimensions'},
+    {'name': 'model',
+     'default': 'ae',
+     'choices': ['ae', 'vae', 'cvae'],
+     'help': 'model to use: ae, vae, cvae'},
+    {'name': 'use_landmark_genes',
+     'type': candle.str2bool,
+     'default': False,
+     'help': 'use the 978 landmark genes from LINCS (L1000) as expression features'},
+    {'name': 'residual',
+     'type': candle.str2bool,
+     'default': False,
+     'help': 'add skip connections to the layers'},
+    {'name': 'reduce_lr',
+     'type': candle.str2bool,
+     'default': False,
+     'help': 'reduce learning rate on plateau'},
+    {'name': 'warmup_lr',
+     'type': candle.str2bool,
+     'default': False,
+     'help': 'gradually increase learning rate on start'},
+    {'name': 'base_lr',
+     'type': float,
+     'help': 'base learning rate'},
+    {'name': 'epsilon_std',
+     'type': float,
+     'help': 'epsilon std for sampling latent noise'},
+    {'name': 'cp',
+     'type': candle.str2bool,
+     'default': False,
+     'help': 'checkpoint models with best val_loss'},
+    # {'name': 'shuffle',
+    #  'type': candle.str2bool,
+    #  'default': False,
+    #  'help': 'shuffle data'},
+    {'name': 'tb',
+     'type': candle.str2bool,
+     'default': False,
+     'help': 'use tensorboard'},
+    {'name': 'tsne',
+     'type': candle.str2bool,
+     'default': False,
+     'help': 'generate tsne plot of the latent representation'}
+]
+
+required = [
+    'activation',
+    'batch_size',
+    'dense',
+    'drop',
+    'epochs',
+    'initialization',
+    'learning_rate',
+    'loss',
+    'noise_factor',
+    'optimizer',
+    'rng_seed',
+    'model',
+    'scaling',
+    'validation_split',
+    'latent_dim',
+    'feature_subsample',
+    'batch_normalization',
+    'epsilon_std',
+    'solr_root',
+    'timeout'
+]
+
+class BenchmarkP1B1(candle.Benchmark):
+
+    def set_locals(self):
+        """Set variables specific to the benchmark:
+        - required: set of required parameters for the benchmark.
+        - additional_definitions: list of dictionaries describing
+          the additional parameters for the benchmark.
+        """
+
+        if required is not None:
+            self.required = set(required)
+        if additional_definitions is not None:
+            self.additional_definitions = additional_definitions
 
 def extension_from_parameters(params, framework=''):
     """Construct string for saving model with annotation of parameters"""
@@ -155,17 +149,17 @@ def load_data(params, seed):
 
     if params['use_landmark_genes']:
         lincs_file = 'lincs1000.tsv'
-        lincs_path = p1_common.get_p1_file(url_p1b1 + lincs_file)
+        lincs_path = candle.fetch_file(params['url_p1b1'] + lincs_file, 'Pilot1')
         df_l1000 = pd.read_csv(lincs_path, sep='\t')
         x_cols = df_l1000['gdc'].tolist()
         drop_cols = None
     else:
         x_cols = None
 
-    train_path = p1_common.get_p1_file(url_p1b1 + file_train)
-    test_path = p1_common.get_p1_file(url_p1b1 + file_test)
+    train_path = candle.fetch_file(params['url_p1b1'] + params['file_train'], 'Pilot1')
+    test_path = candle.fetch_file(params['url_p1b1'] + params['file_test'], 'Pilot1')
 
-    return p1_common.load_csv_data(train_path, test_path,
+    return candle.load_csv_data(train_path, test_path,
                                    x_cols=x_cols,
                                    y_cols=y_cols,
                                    drop_cols=drop_cols,
@@ -190,14 +184,14 @@ def load_data_orig(params, seed):
 
     if params['use_landmark_genes']:
         lincs_file = 'lincs1000.tsv'
-        lincs_path = p1_common.get_p1_file(url_p1b1 + lincs_file)
+        lincs_path = candle.fetch_file(url_p1b1 + lincs_file)
         df_l1000 = pd.read_csv(lincs_path, sep='\t')
         usecols = df_l1000['gdc']
         drop_cols = None
     else:
         usecols = None
 
-    return p1_common.load_X_data(url_p1b1, file_train, file_test,
+    return candle.load_X_data(params['url_p1b1'], params['file_train'], params['file_test'],
                                  drop_cols=drop_cols,
                                  onehot_cols=onehot_cols,
                                  usecols=usecols,
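
For context (not part of the diff above), a baseline driver would typically consume the new BenchmarkP1B1 class roughly as sketched below. This is a hedged sketch, not this commit's code: it assumes candle_keras exposes a Benchmark constructor taking (file_path, default_model, framework, prog, desc) and an initialize_parameters() helper, as in other CANDLE baselines of this vintage, and that 'p1b1_default_model.txt' defines every key listed in `required`; exact names may differ in this version of the library.

# Hypothetical usage sketch, not part of this commit; assumes the
# candle_keras API used by CANDLE baseline scripts of this vintage.
import candle_keras as candle
import p1b1

def initialize_parameters():
    # The default model file is expected to supply every key in p1b1.required;
    # entries in p1b1.additional_definitions become extra CLI flags
    # (e.g. --latent_dim, --model, --use_landmark_genes).
    p1b1Bmk = p1b1.BenchmarkP1B1(p1b1.file_path, 'p1b1_default_model.txt',
                                 'keras', prog='p1b1_baseline',
                                 desc='Pilot 1 Benchmark 1')
    # Merge config-file defaults with any command-line overrides into one dict.
    gParameters = candle.initialize_parameters(p1b1Bmk)
    return gParameters

if __name__ == '__main__':
    params = initialize_parameters()
    print(params['model'], params['latent_dim'])

One design note grounded in the diff itself: the candle.str2bool-typed flags replace the old argparse store_true actions, so boolean options can now also be set from the config file (e.g. use_landmark_genes=True) rather than only on the command line.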