|
22 | 22 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
|
23 | 23 |
|
24 | 24 | file_path = os.path.dirname(os.path.realpath(__file__))
|
25 |
| -lib_path = os.path.abspath(os.path.join(file_path, '..', 'common')) |
| 25 | +lib_path = os.path.abspath(os.path.join(file_path, '..' )) |
26 | 26 | sys.path.append(lib_path)
|
27 | 27 | lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
|
28 | 28 | sys.path.append(lib_path2)
|
29 | 29 |
|
30 |
| -import p1_common |
31 |
| - |
| 30 | +import candle_keras as candle |
32 | 31 |
|
33 | 32 | logger = logging.getLogger(__name__)
|
34 | 33 |
|
|
37 | 36 |
|
# Print arrays in full, never summarized with "...".  A NaN threshold is
# rejected by current NumPy releases (ValueError: threshold must be
# non-NAN), so use the largest integer instead; the effect is the same.
np.set_printoptions(threshold=sys.maxsize)
|
39 | 38 |
|
40 |
| -def common_parser(parser): |
41 |
| - |
42 |
| - parser.add_argument("--config_file", dest='config_file', type=str, |
43 |
| - default=os.path.join(file_path, 'p1b3_default_model.txt'), |
44 |
| - help="specify model configuration file") |
45 |
| - |
46 |
| - # Parse has been split between arguments that are common with the default neon parser |
47 |
| - # and all the other options |
48 |
| - parser = p1_common.get_default_neon_parse(parser) |
49 |
| - parser = p1_common.get_p1_common_parser(parser) |
50 |
| - |
51 |
| - # Arguments that are applicable just to p1b3 |
52 |
| - parser = p1b3_parser(parser) |
53 |
| - |
54 |
| - return parser |
55 |
| - |
56 |
| - |
57 |
| -def p1b3_parser(parser): |
58 |
| - |
59 |
| - # Feature selection |
60 |
| - parser.add_argument("--cell_features", nargs='+', |
61 |
| - default=argparse.SUPPRESS, |
62 |
| - choices=['expression', 'mirna', 'proteome', 'all', 'categorical'], |
63 |
| - help="use one or more cell line feature sets: 'expression', 'mirna', 'proteome', 'all'; or use 'categorical' for one-hot encoding of cell lines") |
64 |
| - parser.add_argument("--drug_features", nargs='+', |
65 |
| - default=argparse.SUPPRESS, |
66 |
| - choices=['descriptors', 'latent', 'all', 'noise'], |
67 |
| - help="use dragon7 descriptors, latent representations from Aspuru-Guzik's SMILES autoencoder, or both, or random features; 'descriptors','latent', 'all', 'noise'") |
68 |
| - parser.add_argument("--cell_noise_sigma", type=float, |
69 |
| - help="standard deviation of guassian noise to add to cell line features during training") |
70 |
| - # Output selection |
71 |
| - parser.add_argument("--min_logconc", type=float, |
72 |
| - default=argparse.SUPPRESS, |
73 |
| - help="min log concentration of dose response data to use: -3.0 to -7.0") |
74 |
| - parser.add_argument("--max_logconc", type=float, |
75 |
| - default=argparse.SUPPRESS, |
76 |
| - help="max log concentration of dose response data to use: -3.0 to -7.0") |
77 |
| - parser.add_argument("--subsample", |
78 |
| - default=argparse.SUPPRESS, |
79 |
| - choices=['naive_balancing', 'none'], |
80 |
| - help="dose response subsample strategy; 'none' or 'naive_balancing'") |
81 |
| - parser.add_argument("--category_cutoffs", nargs='+', type=float, |
82 |
| - default=argparse.SUPPRESS, |
83 |
| - help="list of growth cutoffs (between -1 and +1) seperating non-response and response categories") |
84 |
| - # Sample data selection |
85 |
| - parser.add_argument("--test_cell_split", type=float, |
86 |
| - default=argparse.SUPPRESS, |
87 |
| - help="cell lines to use in test; if None use predefined unseen cell lines instead of sampling cell lines used in training") |
88 |
| - # Test random model |
89 |
| - parser.add_argument("--scramble", action="store_true", |
90 |
| - default=False, |
91 |
| - help="randomly shuffle dose response data") |
92 |
| - parser.add_argument("--workers", type=int, |
93 |
| - default=WORKERS, |
94 |
| - help="number of data generator workers") |
95 |
| - |
96 |
| - return parser |
97 |
| - |
98 |
| -def read_config_file(file): |
99 |
| - config = configparser.ConfigParser() |
100 |
| - config.read(file) |
101 |
| - section = config.sections() |
102 |
| - fileParams = {} |
103 |
| - |
104 |
| - # default config values that we assume exists |
105 |
| - fileParams['activation']=eval(config.get(section[0],'activation')) |
106 |
| - fileParams['batch_size']=eval(config.get(section[0],'batch_size')) |
107 |
| - fileParams['batch_normalization']=eval(config.get(section[0],'batch_normalization')) |
108 |
| - fileParams['category_cutoffs']=eval(config.get(section[0],'category_cutoffs')) |
109 |
| - fileParams['cell_features']=eval(config.get(section[0],'cell_features')) |
110 |
| - fileParams['drop']=eval(config.get(section[0],'drop')) |
111 |
| - fileParams['drug_features']=eval(config.get(section[0],'drug_features')) |
112 |
| - fileParams['epochs']=eval(config.get(section[0],'epochs')) |
113 |
| - fileParams['feature_subsample']=eval(config.get(section[0],'feature_subsample')) |
114 |
| - fileParams['initialization']=eval(config.get(section[0],'initialization')) |
115 |
| - fileParams['learning_rate']=eval(config.get(section[0], 'learning_rate')) |
116 |
| - fileParams['loss']=eval(config.get(section[0],'loss')) |
117 |
| - fileParams['min_logconc']=eval(config.get(section[0],'min_logconc')) |
118 |
| - fileParams['max_logconc']=eval(config.get(section[0],'max_logconc')) |
119 |
| - fileParams['optimizer']=eval(config.get(section[0],'optimizer')) |
120 |
| -# fileParams['penalty']=eval(config.get(section[0],'penalty')) |
121 |
| - fileParams['rng_seed']=eval(config.get(section[0],'rng_seed')) |
122 |
| - fileParams['scaling']=eval(config.get(section[0],'scaling')) |
123 |
| - fileParams['subsample']=eval(config.get(section[0],'subsample')) |
124 |
| - fileParams['test_cell_split']=eval(config.get(section[0],'test_cell_split')) |
125 |
| - fileParams['validation_split']=eval(config.get(section[0],'validation_split')) |
126 |
| - fileParams['cell_noise_sigma']=eval(config.get(section[0],'cell_noise_sigma')) |
127 |
| - |
128 |
| - # parse the remaining values |
129 |
| - for k,v in config.items(section[0]): |
130 |
| - if not k in fileParams: |
131 |
| - fileParams[k] = eval(v) |
class BenchmarkP1B3(candle.Benchmark):
    """P1B3 benchmark built on ``candle.Benchmark``."""

    def set_locals(self):
        """Attach the P1B3-specific parameter declarations to this instance.

        Reads two module-level names:

        - ``required``: names of the parameters the benchmark requires;
          stored on the instance as a ``set``.
        - ``additional_definitions``: list of dictionaries describing the
          additional parameters for the benchmark; stored as-is.

        Either attribute is left untouched when its module-level source
        is ``None``.
        """
        required_names = required
        if required_names is not None:
            self.required = set(required_names)

        extra_definitions = additional_definitions
        if extra_definitions is not None:
            self.additional_definitions = extra_definitions
| 52 | + |
# Descriptions of the parameters that are specific to P1B3.  Each entry is a
# dictionary keyed by 'name' plus a subset of 'nargs', 'type', 'choices',
# 'default' and 'help'.  Only 'scramble' and 'workers' declare a default;
# the argparse.SUPPRESS defaults of the former parser are not carried over.
additional_definitions = [
    # --- Feature selection ---
    {
        'name': 'cell_features',
        'nargs': '+',
        'choices': ['expression', 'mirna', 'proteome', 'all', 'categorical'],
        'help': 'use one or more cell line feature sets: "expression", "mirna", "proteome", "all"; or use "categorical" for one-hot encoding of cell lines',
    },
    {
        'name': 'drug_features',
        'nargs': '+',
        'choices': ['descriptors', 'latent', 'all', 'noise'],
        'help': "use dragon7 descriptors, latent representations from Aspuru-Guzik's SMILES autoencoder, or both, or random features; 'descriptors','latent', 'all', 'noise'",
    },
    {
        'name': 'cell_noise_sigma',
        'type': float,
        'help': "standard deviation of guassian noise to add to cell line features during training",
    },
    # --- Output selection ---
    {
        'name': 'min_logconc',
        'type': float,
        'help': "min log concentration of dose response data to use: -3.0 to -7.0",
    },
    {
        'name': 'max_logconc',
        'type': float,
        'help': "max log concentration of dose response data to use: -3.0 to -7.0",
    },
    {
        'name': 'subsample',
        'choices': ['naive_balancing', 'none'],
        'help': "dose response subsample strategy; 'none' or 'naive_balancing'",
    },
    {
        'name': 'category_cutoffs',
        'nargs': '+',
        'type': float,
        'help': "list of growth cutoffs (between -1 and +1) seperating non-response and response categories",
    },
    # --- Sample data selection ---
    {
        'name': 'test_cell_split',
        'type': float,
        'help': "cell lines to use in test; if None use predefined unseen cell lines instead of sampling cell lines used in training",
    },
    # --- Test random model ---
    {
        'name': 'scramble',
        'type': candle.str2bool,
        'default': False,
        'help': 'randomly shuffle dose response data',
    },
    {
        'name': 'workers',
        'type': int,
        'default': WORKERS,
        'help': 'number of data generator workers',
    },
]
| 100 | + |
# Names of the parameters the P1B3 benchmark requires; BenchmarkP1B3.set_locals
# copies this list into a set on the benchmark instance.
required = [
    'activation',
    'batch_size',
    'batch_normalization',
    'category_cutoffs',
    'cell_features',
    'drop',
    'drug_features',
    'epochs',
    'feature_subsample',
    'initialization',
    'learning_rate',
    'loss',
    'min_logconc',
    'max_logconc',
    'optimizer',
    # 'penalty',  (disabled, as in the former config reader)
    'rng_seed',
    'scaling',
    'subsample',
    'test_cell_split',
    'validation_split',
    'cell_noise_sigma',
]
| 125 | + |
| 126 | +#def common_parser(parser): |
| 127 | +# |
| 128 | +# parser.add_argument("--config_file", dest='config_file', type=str, |
| 129 | +# default=os.path.join(file_path, 'p1b3_default_model.txt'), |
| 130 | +# help="specify model configuration file") |
| 131 | +# |
| 132 | +# # Parse has been split between arguments that are common with the default neon parser |
| 133 | +# # and all the other options |
| 134 | +# parser = candle.get_default_neon_parse(parser) |
| 135 | +# parser = p1_common.get_p1_common_parser(parser) |
| 136 | +# |
| 137 | +# # Arguments that are applicable just to p1b3 |
| 138 | +# parser = p1b3_parser(parser) |
| 139 | +# |
| 140 | +# return parser |
| 141 | + |
| 142 | + |
| 143 | +#def p1b3_parser(parser): |
| 144 | +# |
| 145 | +# # Feature selection |
| 146 | +# parser.add_argument("--cell_features", nargs='+', |
| 147 | +# default=argparse.SUPPRESS, |
| 148 | +# choices=['expression', 'mirna', 'proteome', 'all', 'categorical'], |
| 149 | +# help="use one or more cell line feature sets: 'expression', 'mirna', 'proteome', 'all'; or use 'categorical' for one-hot encoding of cell lines") |
| 150 | +# parser.add_argument("--drug_features", nargs='+', |
| 151 | +# default=argparse.SUPPRESS, |
| 152 | +# choices=['descriptors', 'latent', 'all', 'noise'], |
| 153 | +# help="use dragon7 descriptors, latent representations from Aspuru-Guzik's SMILES autoencoder, or both, or random features; 'descriptors','latent', 'all', 'noise'") |
| 154 | +# parser.add_argument("--cell_noise_sigma", type=float, |
| 155 | +# help="standard deviation of guassian noise to add to cell line features during training") |
| 156 | +# # Output selection |
| 157 | +# parser.add_argument("--min_logconc", type=float, |
| 158 | +# default=argparse.SUPPRESS, |
| 159 | +# help="min log concentration of dose response data to use: -3.0 to -7.0") |
| 160 | +# parser.add_argument("--max_logconc", type=float, |
| 161 | +# default=argparse.SUPPRESS, |
| 162 | +# help="max log concentration of dose response data to use: -3.0 to -7.0") |
| 163 | +# parser.add_argument("--subsample", |
| 164 | +# default=argparse.SUPPRESS, |
| 165 | +# choices=['naive_balancing', 'none'], |
| 166 | +# help="dose response subsample strategy; 'none' or 'naive_balancing'") |
| 167 | +# parser.add_argument("--category_cutoffs", nargs='+', type=float, |
| 168 | +# default=argparse.SUPPRESS, |
| 169 | +# help="list of growth cutoffs (between -1 and +1) seperating non-response and response categories") |
| 170 | +# # Sample data selection |
| 171 | +# parser.add_argument("--test_cell_split", type=float, |
| 172 | +# default=argparse.SUPPRESS, |
| 173 | +# help="cell lines to use in test; if None use predefined unseen cell lines instead of sampling cell lines used in training") |
| 174 | +# # Test random model |
| 175 | +# parser.add_argument("--scramble", action="store_true", |
| 176 | +# default=False, |
| 177 | +# help="randomly shuffle dose response data") |
| 178 | +# parser.add_argument("--workers", type=int, |
| 179 | +# default=WORKERS, |
| 180 | +# help="number of data generator workers") |
| 181 | +# |
| 182 | +# return parser |
| 183 | + |
| 184 | +#def read_config_file(file): |
| 185 | +# config = configparser.ConfigParser() |
| 186 | +# config.read(file) |
| 187 | +# section = config.sections() |
| 188 | +# fileParams = {} |
| 189 | +# |
| 190 | +# # default config values that we assume exists |
| 191 | +# fileParams['activation']=eval(config.get(section[0],'activation')) |
| 192 | +# fileParams['batch_size']=eval(config.get(section[0],'batch_size')) |
| 193 | +# fileParams['batch_normalization']=eval(config.get(section[0],'batch_normalization')) |
| 194 | +# fileParams['category_cutoffs']=eval(config.get(section[0],'category_cutoffs')) |
| 195 | +# fileParams['cell_features']=eval(config.get(section[0],'cell_features')) |
| 196 | +# fileParams['drop']=eval(config.get(section[0],'drop')) |
| 197 | +# fileParams['drug_features']=eval(config.get(section[0],'drug_features')) |
| 198 | +# fileParams['epochs']=eval(config.get(section[0],'epochs')) |
| 199 | +# fileParams['feature_subsample']=eval(config.get(section[0],'feature_subsample')) |
| 200 | +# fileParams['initialization']=eval(config.get(section[0],'initialization')) |
| 201 | +# fileParams['learning_rate']=eval(config.get(section[0], 'learning_rate')) |
| 202 | +# fileParams['loss']=eval(config.get(section[0],'loss')) |
| 203 | +# fileParams['min_logconc']=eval(config.get(section[0],'min_logconc')) |
| 204 | +# fileParams['max_logconc']=eval(config.get(section[0],'max_logconc')) |
| 205 | +# fileParams['optimizer']=eval(config.get(section[0],'optimizer')) |
| 206 | +## fileParams['penalty']=eval(config.get(section[0],'penalty')) |
| 207 | +# fileParams['rng_seed']=eval(config.get(section[0],'rng_seed')) |
| 208 | +# fileParams['scaling']=eval(config.get(section[0],'scaling')) |
| 209 | +# fileParams['subsample']=eval(config.get(section[0],'subsample')) |
| 210 | +# fileParams['test_cell_split']=eval(config.get(section[0],'test_cell_split')) |
| 211 | +# fileParams['validation_split']=eval(config.get(section[0],'validation_split')) |
| 212 | +# fileParams['cell_noise_sigma']=eval(config.get(section[0],'cell_noise_sigma')) |
| 213 | +# |
| 214 | +# # parse the remaining values |
| 215 | +# for k,v in config.items(section[0]): |
| 216 | +# if not k in fileParams: |
| 217 | +# fileParams[k] = eval(v) |
| 218 | +# |
| 219 | + |
def check_params(fileParams):
    """Verify that a layer specification is present in ``fileParams``.

    Either a 'dense' or a 'conv' entry must exist.  'dense' is looked up
    first; when it is present nothing else is inspected.  Otherwise 'conv'
    must be present, and a missing 'pool' entry is filled in with ``None``
    after printing a warning.

    Args:
        fileParams: dictionary of benchmark parameters; may be modified in
            place (the 'pool' key is added when absent alongside 'conv').

    Returns:
        None.

    Raises:
        KeyError: if neither 'dense' nor 'conv' is present (an error
            message is printed first).
    """
    # A dense specification is sufficient on its own.
    if 'dense' in fileParams:
        return

    # No dense layers: a convolutional specification becomes mandatory.
    if 'conv' not in fileParams:
        print("Error! No dense or conv layers specified. Wrong file !! ... exiting ")
        raise KeyError('conv')

    # Pooling is optional for conv models; make its absence explicit.
    if 'pool' not in fileParams:
        fileParams['pool'] = None
        print("Warning ! No pooling specified after conv layer.")
|
149 | 237 |
|
150 |
| - return fileParams |
151 |
| - |
152 | 238 |
|
153 | 239 | def extension_from_parameters(params, framework):
|
154 | 240 | """Construct string for saving model with annotation of parameters"""
|
@@ -495,15 +581,15 @@ def load_dose_response(path, seed, dtype, min_logconc=-5., max_logconc=-5., subs
|
495 | 581 | def stage_data():
|
496 | 582 | server = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B3/'
|
497 | 583 |
|
498 |
| - cell_expr_path = p1_common.get_p1_file(server+'P1B3_cellline_expressions.tsv') |
499 |
| - cell_mrna_path = p1_common.get_p1_file(server+'P1B3_cellline_mirna.tsv') |
500 |
| - cell_prot_path = p1_common.get_p1_file(server+'P1B3_cellline_proteome.tsv') |
501 |
| - cell_kino_path = p1_common.get_p1_file(server+'P1B3_cellline_kinome.tsv') |
502 |
| - drug_desc_path = p1_common.get_p1_file(server+'P1B3_drug_descriptors.tsv') |
503 |
| - drug_auen_path = p1_common.get_p1_file(server+'P1B3_drug_latent.csv') |
504 |
| - dose_resp_path = p1_common.get_p1_file(server+'P1B3_dose_response.csv') |
505 |
| - test_cell_path = p1_common.get_p1_file(server+'P1B3_test_celllines.txt') |
506 |
| - test_drug_path = p1_common.get_p1_file(server+'P1B3_test_drugs.txt') |
| 584 | + cell_expr_path = candle.fetch_file(server+'P1B3_cellline_expressions.tsv', 'Pilot1', untar=False) |
| 585 | + cell_mrna_path = candle.fetch_file(server+'P1B3_cellline_mirna.tsv', 'Pilot1', untar=False) |
| 586 | + cell_prot_path = candle.fetch_file(server+'P1B3_cellline_proteome.tsv', 'Pilot1', untar=False) |
| 587 | + cell_kino_path = candle.fetch_file(server+'P1B3_cellline_kinome.tsv', 'Pilot1', untar=False) |
| 588 | + drug_desc_path = candle.fetch_file(server+'P1B3_drug_descriptors.tsv', 'Pilot1', untar=False) |
| 589 | + drug_auen_path = candle.fetch_file(server+'P1B3_drug_latent.csv', 'Pilot1', untar=False) |
| 590 | + dose_resp_path = candle.fetch_file(server+'P1B3_dose_response.csv', 'Pilot1', untar=False) |
| 591 | + test_cell_path = candle.fetch_file(server+'P1B3_test_celllines.txt', 'Pilot1', untar=False) |
| 592 | + test_drug_path = candle.fetch_file(server+'P1B3_test_drugs.txt', 'Pilot1', untar=False) |
507 | 593 |
|
508 | 594 | return(cell_expr_path, cell_mrna_path, cell_prot_path, cell_kino_path,
|
509 | 595 | drug_desc_path, drug_auen_path, dose_resp_path, test_cell_path,
|
|
0 commit comments