
Commit 35f1bbe

Arvind Ramanathan committed: added Pilot 3 benchmark 1 - multi-task learning DNN

1 parent 2ddf820

Some content is hidden: large commits hide some file contents by default.

62 files changed: +261 -0 lines changed

P3B1/README.md

Lines changed: 106 additions & 0 deletions

P3B1/keras_p3b1_baseline.py

Lines changed: 155 additions & 0 deletions
import numpy

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils

from keras.layers import Input, Embedding, merge
from keras.models import Model

import cPickle, gzip

"""
This is a baseline implementation of a multi-task learning deep neural net for processing
clinical pathology reports. The original data from the pathology reports cannot be made
available online. Hence, we have pre-processed the reports so that example training/testing
sets can be generated. Contact [email protected] for more information on generating additional
training and testing data.

The text comprehension code takes as input an N-gram feature vector that consists of a
TF-IDF representation of the words in the pathology report. Each training set consists of
1200 samples, and each testing set consists of 89 samples. The MTL deep network trains on
these samples and produces the respective macro- and micro-F1 scores.
"""

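# ----------------------------------------------------------------------
# Illustration only (not part of the baseline): TF-IDF N-gram vectors of
# the kind described above are commonly built with scikit-learn. The
# vectorizer settings below are assumptions chosen to match the
# 400-dimensional features shipped with this benchmark, not the actual
# pre-processing pipeline used on the original reports.
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   vectorizer = TfidfVectorizer( ngram_range= ( 1, 2 ), max_features= 400 )
#   features = vectorizer.fit_transform( report_texts ).toarray()
# ----------------------------------------------------------------------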
# Set up a number of initial variables for use with the baseline
NUM_TASKS = 3   # number of learning tasks (for multi-task learning)
NUM_FOLDS = 10  # number of folds for training (main cross-validation loop)
NUM_EPOCH = 5   # number of epochs


# Ground truths and predictions accumulated across all folds, one pair per task
truth_a_arr = []
pred_a_arr = []

truth_b_arr = []
pred_b_arr = []

truth_c_arr = []
pred_c_arr = []

for fold in range( NUM_FOLDS ):

    features_train = []
    labels_train = []
    truths_train = []

    features_test = []
    labels_test = []
    truths_test = []

    n_out = []

    for task in range( NUM_TASKS ):
        file_post = '.' + str( task ) + '.' + str( fold ) + '.pkl.gz'
        fname_train = 'train/train' + file_post
        fname_test = 'test/test' + file_post

        with gzip.open( fname_train, 'rb' ) as f:
            feature_train, label_train = cPickle.load( f )

        with gzip.open( fname_test, 'rb' ) as f:
            feature_test, label_test = cPickle.load( f )

        features_train.append( feature_train )
        labels_train.append( label_train )

        features_test.append( feature_test )
        labels_test.append( label_test )

        # One-hot encode the integer labels for the softmax outputs; size the
        # encoding by the largest label in either split so the train and test
        # widths always agree.
        mv = max( numpy.max( label_train ), numpy.max( label_test ) )

        truth_train = numpy.zeros( ( len( label_train ), mv + 1 ) )
        for i in range( len( label_train ) ):
            truth_train[ i, label_train[ i ] ] = 1

        truths_train.append( truth_train )

        truth_test = numpy.zeros( ( len( label_test ), mv + 1 ) )
        for i in range( len( label_test ) ):
            truth_test[ i, label_test[ i ] ] = 1

        truths_test.append( truth_test )

        n_out.append( mv + 1 )

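    # Note: the manual one-hot loops above are equivalent to Keras 1.x's
    # np_utils.to_categorical, which this file already imports:
    #
    #   truth_train = np_utils.to_categorical( label_train, mv + 1 )
    #
    # assuming integer labels starting at 0, as the numpy.max(...) + 1
    # sizing implies.
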
    flen = len( feature_train[ 0 ] )  # input feature length (400 in the provided data); assumes all tasks share it

    # shared representation layers
    main_input = Input( shape= ( flen, ), name= 'main_input' )
    layer1 = Dense( flen, activation= 'relu', name= 'layer1' )( main_input )
    layer2 = Dense( flen, activation= 'relu', name= 'layer2' )( layer1 )

    # task 1: primary site
    layer3a = Dense( flen, activation= 'relu', name= 'layer3a' )( layer2 )
    layer4a = Dense( 256, activation= 'relu', name= 'layer4a' )( layer3a )
    layer5a = Dense( n_out[ 0 ], activation= 'softmax', name= 'layer5a' )( layer4a )

    # task 2: tumor laterality
    layer3b = Dense( flen, activation= 'relu', name= 'layer3b' )( layer2 )
    layer4b = Dense( 256, activation= 'relu', name= 'layer4b' )( layer3b )
    layer5b = Dense( n_out[ 1 ], activation= 'softmax', name= 'layer5b' )( layer4b )

    # task 3: histological grade
    layer3c = Dense( flen, activation= 'relu', name= 'layer3c' )( layer2 )
    layer4c = Dense( 256, activation= 'relu', name= 'layer4c' )( layer3c )
    layer5c = Dense( n_out[ 2 ], activation= 'softmax', name= 'layer5c' )( layer4c )

    # one model per task; all three share main_input, layer1 and layer2
    model_a = Model( input= [ main_input ], output= [ layer5a ] )
    model_b = Model( input= [ main_input ], output= [ layer5b ] )
    model_c = Model( input= [ main_input ], output= [ layer5c ] )

    model_a.summary()
    model_b.summary()
    model_c.summary()

    model_a.compile( loss= 'categorical_crossentropy', optimizer= RMSprop( lr= 0.001 ), metrics= [ 'accuracy' ] )
    model_b.compile( loss= 'categorical_crossentropy', optimizer= RMSprop( lr= 0.001 ), metrics= [ 'accuracy' ] )
    model_c.compile( loss= 'categorical_crossentropy', optimizer= RMSprop( lr= 0.001 ), metrics= [ 'accuracy' ] )

    # Alternate one epoch per task so the shared layers see all three tasks
    # in every pass.
    for epoch in range( NUM_EPOCH ):
        model_a.fit( { 'main_input': features_train[ 0 ] }, { 'layer5a': truths_train[ 0 ] }, nb_epoch= 1, verbose= 1,
            batch_size= 10, validation_data= ( features_test[ 0 ], truths_test[ 0 ] ) )

        model_b.fit( { 'main_input': features_train[ 1 ] }, { 'layer5b': truths_train[ 1 ] }, nb_epoch= 1, verbose= 1,
            batch_size= 10, validation_data= ( features_test[ 1 ], truths_test[ 1 ] ) )

        model_c.fit( { 'main_input': features_train[ 2 ] }, { 'layer5c': truths_train[ 2 ] }, nb_epoch= 1, verbose= 1,
            batch_size= 10, validation_data= ( features_test[ 2 ], truths_test[ 2 ] ) )

    pred_a = model_a.predict( features_test[ 0 ] )
    pred_b = model_b.predict( features_test[ 1 ] )
    pred_c = model_c.predict( features_test[ 2 ] )

    truth_a_arr.extend( labels_test[ 0 ] )
    pred_a_arr.extend( numpy.argmax( pred_a, axis= 1 ) )

    truth_b_arr.extend( labels_test[ 1 ] )
    pred_b_arr.extend( numpy.argmax( pred_b, axis= 1 ) )

    truth_c_arr.extend( labels_test[ 2 ] )
    pred_c_arr.extend( numpy.argmax( pred_c, axis= 1 ) )


from sklearn.metrics import f1_score

print 'Task 1: Primary site - Macro F1 score', f1_score( truth_a_arr, pred_a_arr, average= 'macro' )
print 'Task 1: Primary site - Micro F1 score', f1_score( truth_a_arr, pred_a_arr, average= 'micro' )

print 'Task 2: Tumor laterality - Macro F1 score', f1_score( truth_b_arr, pred_b_arr, average= 'macro' )
print 'Task 2: Tumor laterality - Micro F1 score', f1_score( truth_b_arr, pred_b_arr, average= 'micro' )

print 'Task 3: Histological grade - Macro F1 score', f1_score( truth_c_arr, pred_c_arr, average= 'macro' )
print 'Task 3: Histological grade - Micro F1 score', f1_score( truth_c_arr, pred_c_arr, average= 'micro' )
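
For a quick sanity check of the bundled archives before training, a minimal inspection sketch follows. It assumes only what the baseline itself assumes: each .pkl.gz file unpickles to a (features, labels) pair, where features is a 2-D array of TF-IDF values (400-dimensional in the provided data) and labels is a vector of integer class codes.

    import cPickle, gzip
    import numpy

    # Load one of the archives added in this commit and report its shape.
    with gzip.open( 'test/test.0.0.pkl.gz', 'rb' ) as f:
        features, labels = cPickle.load( f )

    print 'feature matrix:', numpy.asarray( features ).shape  # samples x feature length
    print 'classes: 0 ..', numpy.max( labels )                # largest integer label for this task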

P3B1/test/test.0.0.pkl.gz

28.7 KB
Binary file not shown.

P3B1/test/test.0.1.pkl.gz

30.8 KB
Binary file not shown.

P3B1/test/test.0.2.pkl.gz

29.5 KB
Binary file not shown.

P3B1/test/test.0.3.pkl.gz

29.2 KB
Binary file not shown.

P3B1/test/test.0.4.pkl.gz

29.2 KB
Binary file not shown.

P3B1/test/test.0.5.pkl.gz

29.8 KB
Binary file not shown.

P3B1/test/test.0.6.pkl.gz

29.5 KB
Binary file not shown.

P3B1/test/test.0.7.pkl.gz

29.7 KB
Binary file not shown.
