1
+ from __future__ import print_function
2
+
1
3
import pandas as pd
2
4
import numpy as np
3
5
import os
11
13
12
14
from keras import backend as K
13
15
14
- from keras .layers import Input , Dense , Dropout , Activation , Conv1D , MaxPooling1D , Flatten , LocallyConnected1D
16
+ from keras .layers import Input , Dense , Dropout , Activation , Conv1D , MaxPooling1D , Flatten
15
17
from keras .optimizers import SGD , Adam , RMSprop
16
18
from keras .models import Sequential , Model , model_from_json , model_from_yaml
17
19
from keras .utils import np_utils
21
23
from sklearn .preprocessing import StandardScaler , MinMaxScaler , MaxAbsScaler
22
24
23
25
file_path = os .path .dirname (os .path .realpath (__file__ ))
24
- lib_path = os .path .abspath (os .path .join (file_path , '..' , 'common' ))
25
- sys .path .append (lib_path )
26
26
lib_path2 = os .path .abspath (os .path .join (file_path , '..' , '..' , 'common' ))
27
27
sys .path .append (lib_path2 )
28
28
29
- import data_utils
30
- import p1_common
31
-
32
- #EPOCH = 400
33
- #BATCH = 20
34
- #CLASSES = 2
35
-
36
- #PL = 60484 # 1 + 60483 these are the width of the RNAseq datasets
37
- #P = 60483 # 60483
38
- #DR = 0.1 # Dropout rate
39
-
40
- def common_parser (parser ):
41
-
42
- parser .add_argument ("--config_file" , dest = 'config_file' , type = str ,
43
- default = os .path .join (file_path , 'tc1_default_model.txt' ),
44
- help = "specify model configuration file" )
45
-
46
- # Parse has been split between arguments that are common with the default neon parser
47
- # and all the other options
48
- parser = p1_common .get_default_neon_parse (parser )
49
- parser = p1_common .get_p1_common_parser (parser )
50
-
51
- return parser
52
-
53
- def get_tc1_parser ():
54
-
55
- parser = argparse .ArgumentParser (prog = 'tc1_baseline' , formatter_class = argparse .ArgumentDefaultsHelpFormatter ,
56
- description = 'Train Autoencoder - Pilot 1 Benchmark 1' )
29
+ import tc1 as bmk
30
+ import candle_keras as candle
57
31
58
- return common_parser (parser )
59
-
60
- def read_config_file (file ):
61
- config = configparser .ConfigParser ()
62
- config .read (file )
63
- section = config .sections ()
64
- fileParams = {}
65
-
66
- fileParams ['data_url' ]= eval (config .get (section [0 ],'data_url' ))
67
- fileParams ['train_data' ]= eval (config .get (section [0 ],'train_data' ))
68
- fileParams ['test_data' ]= eval (config .get (section [0 ],'test_data' ))
69
- fileParams ['model_name' ]= eval (config .get (section [0 ],'model_name' ))
70
- fileParams ['conv' ]= eval (config .get (section [0 ],'conv' ))
71
- fileParams ['dense' ]= eval (config .get (section [0 ],'dense' ))
72
- fileParams ['activation' ]= eval (config .get (section [0 ],'activation' ))
73
- fileParams ['out_act' ]= eval (config .get (section [0 ],'out_act' ))
74
- fileParams ['loss' ]= eval (config .get (section [0 ],'loss' ))
75
- fileParams ['optimizer' ]= eval (config .get (section [0 ],'optimizer' ))
76
- fileParams ['feature_subsample' ]= eval (config .get (section [0 ],'feature_subsample' ))
77
- fileParams ['metrics' ]= eval (config .get (section [0 ],'metrics' ))
78
- fileParams ['epochs' ]= eval (config .get (section [0 ],'epochs' ))
79
- fileParams ['batch_size' ]= eval (config .get (section [0 ],'batch_size' ))
80
- fileParams ['drop' ]= eval (config .get (section [0 ],'drop' ))
81
- fileParams ['classes' ]= eval (config .get (section [0 ],'classes' ))
82
- fileParams ['pool' ]= eval (config .get (section [0 ],'pool' ))
83
- fileParams ['save' ]= eval (config .get (section [0 ], 'save' ))
84
-
85
- return fileParams
86
32
87
33
def initialize_parameters ():
88
- # Get command-line parameters
89
- parser = get_tc1_parser ()
90
- args = parser .parse_args ()
91
- #print('Args:', args)
92
- # Get parameters from configuration file
93
- fileParameters = read_config_file (args .config_file )
94
- #print ('Params:', fileParameters)
95
- # Consolidate parameter set. Command-line parameters overwrite file configuration
96
- gParameters = p1_common .args_overwrite_config (args , fileParameters )
97
- return gParameters
98
-
99
-
100
- def load_data (train_path , test_path , gParameters ):
101
-
102
- print ('Loading data...' )
103
- if gParameters ['feature_subsample' ] > 0 :
104
- usecols = list (range (gParameters ['feature_subsample' ]))
105
- else :
106
- usecols = None
107
- df_train = (pd .read_csv (train_path , header = None , usecols = usecols ).values ).astype ('float32' )
108
- df_test = (pd .read_csv (test_path , header = None , usecols = usecols ).values ).astype ('float32' )
109
- print ('done' )
110
-
111
- print ('df_train shape:' , df_train .shape )
112
- print ('df_test shape:' , df_test .shape )
113
34
114
- seqlen = df_train .shape [1 ]
35
+ # Build benchmark object
36
+ tc1Bmk = bmk .BenchmarkTC1 (file_path , 'tc1_default_model.txt' , 'keras' ,
37
+ prog = 'tc1_baseline' , desc = 'Multi-task (DNN) for data extraction from clinical reports - Pilot 3 Benchmark 1' )
115
38
116
- df_y_train = df_train [:,0 ].astype ('int' )
117
- df_y_test = df_test [:,0 ].astype ('int' )
39
+ # Initialize parameters
40
+ gParameters = candle .initialize_parameters (tc1Bmk )
41
+ #benchmark.logger.info('Params: {}'.format(gParameters))
118
42
119
- Y_train = np_utils .to_categorical (df_y_train ,gParameters ['classes' ])
120
- Y_test = np_utils .to_categorical (df_y_test ,gParameters ['classes' ])
121
-
122
- df_x_train = df_train [:, 1 :seqlen ].astype (np .float32 )
123
- df_x_test = df_test [:, 1 :seqlen ].astype (np .float32 )
124
-
125
- # X_train = df_x_train.as_matrix()
126
- # X_test = df_x_test.as_matrix()
127
-
128
- X_train = df_x_train
129
- X_test = df_x_test
130
-
131
- scaler = MaxAbsScaler ()
132
- mat = np .concatenate ((X_train , X_test ), axis = 0 )
133
- mat = scaler .fit_transform (mat )
134
-
135
- X_train = mat [:X_train .shape [0 ], :]
136
- X_test = mat [X_train .shape [0 ]:, :]
137
-
138
- return X_train , Y_train , X_test , Y_test
43
+ return gParameters
139
44
140
45
141
46
def run (gParameters ):
142
47
143
- print ('Params:' , gParameters )
144
-
145
- file_train = gParameters ['train_data' ]
146
- file_test = gParameters ['test_data' ]
147
- url = gParameters ['data_url' ]
148
-
149
- train_file = data_utils .get_file (file_train , url + file_train , cache_subdir = 'Pilot1' )
150
- test_file = data_utils .get_file (file_test , url + file_test , cache_subdir = 'Pilot1' )
151
-
152
- X_train , Y_train , X_test , Y_test = load_data (train_file , test_file , gParameters )
48
+ X_train , Y_train , X_test , Y_test = bmk .load_data (gParameters )
153
49
154
50
print ('X_train shape:' , X_train .shape )
155
51
print ('X_test shape:' , X_test .shape )
@@ -169,6 +65,7 @@ def run(gParameters):
169
65
170
66
model = Sequential ()
171
67
dense_first = True
68
+
172
69
layer_list = list (range (0 , len (gParameters ['conv' ]), 3 ))
173
70
for l , i in enumerate (layer_list ):
174
71
filters = gParameters ['conv' ][i ]
@@ -212,26 +109,8 @@ def run(gParameters):
212
109
model .add (Flatten ())
213
110
214
111
model .add (Dense (gParameters ['classes' ]))
215
-
216
112
model .add (Activation (gParameters ['out_act' ]))
217
113
218
- #Reference case
219
- #model.add(Conv1D(filters=128, kernel_size=20, strides=1, padding='valid', input_shape=(P, 1)))
220
- #model.add(Activation('relu'))
221
- #model.add(MaxPooling1D(pool_size=1))
222
- #model.add(Conv1D(filters=128, kernel_size=10, strides=1, padding='valid'))
223
- #model.add(Activation('relu'))
224
- #model.add(MaxPooling1D(pool_size=10))
225
- #model.add(Flatten())
226
- #model.add(Dense(200))
227
- #model.add(Activation('relu'))
228
- #model.add(Dropout(0.1))
229
- #model.add(Dense(20))
230
- #model.add(Activation('relu'))
231
- #model.add(Dropout(0.1))
232
- #model.add(Dense(CLASSES))
233
- #model.add(Activation('softmax'))
234
-
235
114
model .summary ()
236
115
237
116
model .compile (loss = gParameters ['loss' ],
@@ -241,8 +120,8 @@ def run(gParameters):
241
120
output_dir = gParameters ['save' ]
242
121
if not os .path .exists (output_dir ):
243
122
os .makedirs (output_dir )
244
- # set up a bunch of callbacks to do work during model training..
245
-
123
+
124
+ # set up callbacks to do work during model training.
246
125
model_name = gParameters ['model_name' ]
247
126
path = '{}/{}.autosave.model.h5' .format (output_dir , model_name )
248
127
checkpointer = ModelCheckpoint (filepath = path , verbose = 1 , save_weights_only = False , save_best_only = True )
@@ -324,6 +203,7 @@ def run(gParameters):
324
203
325
204
return history
326
205
206
+
327
207
def main ():
328
208
329
209
gParameters = initialize_parameters ()
@@ -335,3 +215,4 @@ def main():
335
215
K .clear_session ()
336
216
except AttributeError : # theano does not have this function
337
217
pass
218
+
0 commit comments