Commit 5935dee

Merge branch 'master' of github.com:ECP-CANDLE/Benchmarks
2 parents f8a964c + ecef913


4 files changed: +794 −85 lines

P1B3/p1b3.py

Lines changed: 88 additions & 40 deletions
@@ -2,6 +2,7 @@
 from data_utils import get_file
 # from six.moves import cPickle
 
+import collections
 import gzip
 import logging
 import os
@@ -20,7 +21,7 @@
 
 logger = logging.getLogger(__name__)
 
-SEED = 2016
+SEED = 2017
 
 np.set_printoptions(threshold=np.nan)
 np.random.seed(SEED)
@@ -238,8 +239,8 @@ def load_dose_response(path, min_logconc=-5., max_logconc=-5., subsample=None):
     return df
 
 
-class RegressionDataGenerator(object):
-    """Generate merged drug response, drug descriptors and cell line essay data
+class DataLoader(object):
+    """Load merged drug response, drug descriptors and cell line essay data
     """
 
     def __init__(self, val_split=0.2, shuffle=True, drug_features='descriptors',
@@ -273,7 +274,6 @@ def __init__(self, val_split=0.2, shuffle=True, drug_features='descriptors',
             growth thresholds seperating non-response and response categories
         """
 
-        self.lock = threading.Lock()
         self.drug_features = drug_features
 
         server = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B3/'
@@ -349,62 +349,110 @@ def __init__(self, val_split=0.2, shuffle=True, drug_features='descriptors',
                         format(i, count, count/len(growth), lower, upper))
             logger.info('  Total: {:9d}'.format(len(growth)))
 
-        nrows = df_train_val.shape[0]
+        self.total = df_train_val.shape[0]
         self.n_test = df_test.shape[0]
-        self.n_val = int(nrows * val_split)
-        self.n_train = nrows - self.n_val
-
-        self.cycle_train = cycle(range(nrows - self.n_val))
-        self.cycle_val = cycle(range(nrows)[-self.n_val:])
-        self.cycle_test = cycle(range(nrows, nrows + self.n_test))
+        self.n_val = int(self.total * val_split)
+        self.n_train = self.total - self.n_val
         logger.info('Rows in train: {}, val: {}, test: {}'.format(self.n_train, self.n_val, self.n_test))
 
-        self.input_dim = self.df_cellline.shape[1] - 1 + 1  # remove CELLNAME; add concentration
-        logger.info('Features:')
-        logger.info('  concentration: 1')
-        logger.info('  cell line expression: {}'.format(self.input_dim-1))
+        self.input_shapes = collections.OrderedDict()
+        self.input_shapes['drug_concentration'] = (1,)
+        self.input_shapes['cellline_expression'] = (self.df_cellline.shape[1] - 1,)
         if self.drug_features in ['descriptors', 'both']:
-            self.input_dim += self.df_drug_desc.shape[1] - 1  # remove NSC
-            logger.info('  drug descriptors: {}'.format(self.df_drug_desc.shape[1] - 1))
+            self.input_shapes['drug_descriptors'] = (self.df_drug_desc.shape[1] - 1,)  # remove NSC
         if self.drug_features in ['latent', 'both']:
-            self.input_dim += self.df_drug_auen.shape[1] - 1  # remove NSC
-            logger.info('  drug latent representations: {}'.format(self.df_drug_auen.shape[1] - 1))
+            self.input_shapes['drug_SMILES_latent'] = (self.df_drug_auen.shape[1] - 1,)  # remove NSC
         if self.drug_features == 'noise':
-            self.input_dim += self.df_drug_rand.shape[1] - 1  # remove NSC
-            logger.info('  drug random vectors: {}'.format(self.df_drug_rand.shape[1] - 1))
+            self.input_shapes['drug_random_vector'] = (self.df_drug_rand.shape[1] - 1,)  # remove NSC
+
+        logger.info('Input features shapes:')
+        for k, v in self.input_shapes.items():
+            logger.info('  {}: {}'.format(k, v))
+
+        self.input_dim = sum([np.prod(x) for x in self.input_shapes.values()])
         logger.info('Total input dimensions: {}'.format(self.input_dim))
 
-    def flow(self, batch_size=32, data='train', topology=None):
-        if data == 'val':
-            cyc = self.cycle_val
-        elif data == 'test':
-            cyc = self.cycle_test
+
+class DataGenerator(object):
+    """Generate training, validation or testing batches from loaded data
+    """
+
+    def __init__(self, data, partition='train', batch_size=32, shape=None, concat=True):
+        """Initialize data
+
+        Parameters
+        ----------
+        data: DataLoader object
+            loaded data object containing original data frames for molecular, drug and response data
+        partition: 'train', 'val', or 'test'
+            partition of data to generate for
+        batch_size: integer (default 32)
+            batch size of generated data
+        shape: None, '1d' or 'add_1d' (default None)
+            keep original feature shapes, make them flat or add one extra dimension (for convolution or locally connected layers in some frameworks)
+        concat: True or False (default True)
+            concatenate all features if set to True
+        """
+        self.lock = threading.Lock()
+        self.data = data
+        self.partition = partition
+        self.batch_size = batch_size
+        self.shape = shape
+        self.concat = concat
+
+        if partition == 'train':
+            self.cycle = cycle(range(data.n_train))
+            self.num_data = data.n_train
+        elif partition == 'val':
+            self.cycle = cycle(range(data.total)[-data.n_val:])
+            self.num_data = data.n_val
+        elif partition == 'test':
+            self.cycle = cycle(range(data.total, data.total + data.n_test))
+            self.num_data = data.n_test
         else:
-            cyc = self.cycle_train
+            raise Exception('Data partition "{}" not recognized.'.format(partition))
 
+    def flow(self):
+        """Keep generating data batches
+        """
         while 1:
             self.lock.acquire()
-            indices = list(islice(cyc, batch_size))
+            indices = list(islice(self.cycle, self.batch_size))
             # print("\nProcess: {}, Batch indices start: {}".format(multiprocessing.current_process().name, indices[0]))
             self.lock.release()
 
-            df = self.df_response.iloc[indices, :]
-            df = pd.merge(df, self.df_cellline, on='CELLNAME')
+            df = self.data.df_response.iloc[indices, :]
+            df = pd.merge(df, self.data.df_cellline, on='CELLNAME')
 
-            if self.drug_features in ['descriptors', 'both']:
-                df = df.merge(self.df_drug_desc, on='NSC')
-            if self.drug_features in ['latent', 'both']:
-                df = df.merge(self.df_drug_auen, on='NSC')
-            if self.drug_features == 'noise':
-                df = df.merge(self.df_drug_rand, on='NSC')
+            if self.data.drug_features in ['descriptors', 'both']:
+                df = df.merge(self.data.df_drug_desc, on='NSC')
+            if self.data.drug_features in ['latent', 'both']:
+                df = df.merge(self.data.df_drug_auen, on='NSC')
+            if self.data.drug_features == 'noise':
+                df = df.merge(self.data.df_drug_rand, on='NSC')
 
             df = df.drop(['CELLNAME', 'NSC'], 1)
            x = np.array(df.iloc[:, 1:])
             y = np.array(df.iloc[:, 0])
             y = y / 100.
 
-            if topology == 'simple_local':
-                yield x.reshape(x.shape + (1,)), y
-                # yield x.reshape(x.shape[0], 1, x.shape[1]), y
+            if self.concat:
+                if self.shape == 'add_1d':
+                    yield x.reshape(x.shape + (1,)), y
+                else:
+                    yield x, y
             else:
-                yield x, y
+                x_list = []
+                index = 0
+                for v in self.data.input_shapes.values():
+                    length = np.prod(v)
+                    subset = x[:, index:index+length]
+                    if self.shape == '1d':
+                        reshape = (x.shape[0], length)
+                    elif self.shape == 'add_1d':
+                        reshape = (x.shape[0],) + v + (1,)
+                    else:
+                        reshape = (x.shape[0],) + v
+                    x_list.append(subset.reshape(reshape))
+                    index += length
+                yield x_list, y
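
Taken together, this refactor splits the old RegressionDataGenerator into a DataLoader that downloads and merges the frames once, and one DataGenerator per partition. A minimal usage sketch (illustrative, not part of this diff; assumes the P1B3 data files are reachable from the server URL above):

import p1b3

# Load and merge response, cell line and drug feature frames once.
loader = p1b3.DataLoader(val_split=0.2, drug_features='descriptors')

# One generator per partition; flow() yields batches indefinitely.
train_gen = p1b3.DataGenerator(loader, partition='train', batch_size=32).flow()
x, y = next(train_gen)    # x: (32, loader.input_dim), y: (32,)

# With concat=False, each feature group comes back as its own array,
# shaped per loader.input_shapes (useful for multi-input models).
val_gen = p1b3.DataGenerator(loader, partition='val', batch_size=32, concat=False).flow()
x_list, y = next(val_gen)    # [concentration, expression, descriptors], y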

P1B3/p1b3_baseline.py

Lines changed: 48 additions & 45 deletions
@@ -47,7 +47,8 @@
 # Activation function (options: 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear')
 ACTIVATION = 'relu'
 LOSS = 'mse'
-OPTIMIZER = 'adam'
+OPTIMIZER = 'sgd'
+# OPTIMIZER = 'adam'
 
 # Type of feature scaling (options: 'maxabs': to [-1,1]
 #                                   'minmax': to [0,1]
@@ -65,11 +66,11 @@
 DENSE_LAYERS = [D1, D2, D3, D4]
 
 # Number of units per locally connected layer
-LC1 = 10, 10  # nb_filter, filter_length
-LC2 = 0, 0    # disabled layer
-# LOCALLY_CONNECTED_LAYERS = list(LC1 + LC2)
-LOCALLY_CONNECTED_LAYERS = [0, 0]
-POOL = 100
+C1 = 10, 10, 5  # nb_filter, filter_length, stride
+C2 = 0, 0, 0    # disabled layer
+# CONVOLUTION_LAYERS = list(C1 + C2)
+CONVOLUTION_LAYERS = [0, 0, 0]
+POOL = 10
 
 MIN_LOGCONC = -5.
 MAX_LOGCONC = -4.
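
Since C1 and C2 are tuples, the commented-out list(C1 + C2) form concatenates them into the flat integer array the parser and model code expect. A quick sanity check (not part of this diff):

C1 = 10, 10, 5  # nb_filter, filter_length, stride
C2 = 0, 0, 0    # disabled layer
assert list(C1 + C2) == [10, 10, 5, 0, 0, 0]
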
@@ -91,18 +92,18 @@ def get_parser():
     parser.add_argument("-b", "--batch_size", action="store",
                         default=BATCH_SIZE, type=int,
                         help="batch size")
-    parser.add_argument("-c", "--convolution", action="store_true",
-                        default=False,
-                        help="use convolution layers instead of locally connected layers")
+    parser.add_argument("-c", "--convolution", action="store", nargs='+', type=int,
+                        default=CONVOLUTION_LAYERS,
+                        help="integer array describing convolution layers: conv1_nb_filter, conv1_filter_len, conv1_stride, conv2_nb_filter, conv2_filter_len, conv2_stride ...")
     parser.add_argument("-d", "--dense", action="store", nargs='+', type=int,
                         default=DENSE_LAYERS,
                         help="number of units in fully connected layers in an integer array")
     parser.add_argument("-e", "--epochs", action="store",
                         default=NB_EPOCH, type=int,
                         help="number of training epochs")
-    parser.add_argument("-l", "--locally_connected", action="store", nargs='+', type=int,
-                        default=LOCALLY_CONNECTED_LAYERS,
-                        help="integer array describing locally connected layers: layer1_nb_filter, layer1_filter_len, layer2_nb_filter, layer2_filter_len, ...")
+    parser.add_argument("-l", "--locally_connected", action="store_true",
+                        default=False,
+                        help="use locally connected layers instead of convolution layers")
     parser.add_argument("-o", "--optimizer", action="store",
                         default=OPTIMIZER,
                         help="keras optimizer to use: sgd, rmsprop, ...")
@@ -163,15 +164,16 @@ def extension_from_parameters(args):
     ext += '.E={}'.format(args.epochs)
     if args.feature_subsample:
         ext += '.F={}'.format(args.feature_subsample)
-    if args.locally_connected:
-        name = 'C' if args.convolution else 'LC'
-        layer_list = list(range(0, len(args.locally_connected), 2))
+    if args.convolution:
+        name = 'LC' if args.locally_connected else 'C'
+        layer_list = list(range(0, len(args.convolution), 3))
         for l, i in enumerate(layer_list):
-            nb_filter = args.locally_connected[i]
-            filter_len = args.locally_connected[i+1]
-            if nb_filter <= 0 or filter_len <= 0:
+            nb_filter = args.convolution[i]
+            filter_len = args.convolution[i+1]
+            stride = args.convolution[i+2]
+            if nb_filter <= 0 or filter_len <= 0 or stride <= 0:
                 break
-            ext += '.{}{}={},{}'.format(name, l+1, nb_filter, filter_len)
+            ext += '.{}{}={},{},{}'.format(name, l+1, nb_filter, filter_len, stride)
     if args.pool and layer_list[0] and layer_list[1]:
         ext += '.P={}'.format(args.pool)
     for i, n in enumerate(args.dense):
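
The flat integer list is consumed three at a time, and a non-positive value in any triple disables that layer and all later ones. A standalone sketch of the same loop (hypothetical values, not part of this diff):

convolution = [10, 10, 5, 0, 0, 0]  # one real layer plus one disabled layer
for l, i in enumerate(range(0, len(convolution), 3)):
    nb_filter, filter_len, stride = convolution[i:i+3]
    if nb_filter <= 0 or filter_len <= 0 or stride <= 0:
        break  # a zeroed triple disables this and all later layers
    print('layer {}: {} filters, length {}, stride {}'.format(l + 1, nb_filter, filter_len, stride))
# prints only: layer 1: 10 filters, length 10, stride 5
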
@@ -308,53 +310,54 @@ def main():
 
     ext = extension_from_parameters(args)
 
-    datagen = p1b3.RegressionDataGenerator(feature_subsample=args.feature_subsample,
-                                           scaling=args.scaling,
-                                           drug_features=args.drug_features,
-                                           scramble=args.scramble,
-                                           min_logconc=args.min_logconc,
-                                           max_logconc=args.max_logconc,
-                                           subsample=args.subsample,
-                                           category_cutoffs=args.category_cutoffs)
+    loader = p1b3.DataLoader(feature_subsample=args.feature_subsample,
+                             scaling=args.scaling,
+                             drug_features=args.drug_features,
+                             scramble=args.scramble,
+                             min_logconc=args.min_logconc,
+                             max_logconc=args.max_logconc,
+                             subsample=args.subsample,
+                             category_cutoffs=args.category_cutoffs)
 
-    topology = 'dense'
+    gen_shape = None
     out_dim = 1
 
     model = Sequential()
-    if args.locally_connected and args.locally_connected[0]:
-        topology = 'simple_local'
-        layer_list = list(range(0, len(args.locally_connected), 2))
+    if args.convolution and args.convolution[0]:
+        gen_shape = 'add_1d'
+        layer_list = list(range(0, len(args.convolution), 3))
         for l, i in enumerate(layer_list):
-            nb_filter = args.locally_connected[i]
-            filter_len = args.locally_connected[i+1]
-            if nb_filter <= 0 or filter_len <= 0:
+            nb_filter = args.convolution[i]
+            filter_len = args.convolution[i+1]
+            stride = args.convolution[i+2]
+            if nb_filter <= 0 or filter_len <= 0 or stride <= 0:
                 break
-            if args.convolution:
-                model.add(Convolution1D(nb_filter, filter_len, input_shape=(datagen.input_dim, 1), activation=args.activation))
+            if args.locally_connected:
+                model.add(LocallyConnected1D(nb_filter, filter_len, subsample_length=stride, input_shape=(loader.input_dim, 1), activation=args.activation))
             else:
-                model.add(LocallyConnected1D(nb_filter, filter_len, input_shape=(datagen.input_dim, 1), activation=args.activation))
+                model.add(Convolution1D(nb_filter, filter_len, subsample_length=stride, input_shape=(loader.input_dim, 1), activation=args.activation))
         if args.pool:
             model.add(MaxPooling1D(pool_length=args.pool))
         model.add(Flatten())
 
     for layer in args.dense:
         if layer:
-            model.add(Dense(layer, input_dim=datagen.input_dim, activation=args.activation))
+            model.add(Dense(layer, input_dim=loader.input_dim, activation=args.activation))
         if args.drop:
            model.add(Dropout(args.drop))
     model.add(Dense(out_dim))
 
     model.summary()
     model.compile(loss=args.loss, optimizer=args.optimizer)
 
-    train_gen = datagen.flow(batch_size=args.batch_size, topology=topology)
-    val_gen = datagen.flow(data='val', batch_size=args.batch_size, topology=topology)
-    val_gen2 = datagen.flow(data='val', batch_size=args.batch_size, topology=topology)
-    test_gen = datagen.flow(data='test', batch_size=args.batch_size, topology=topology)
+    train_gen = p1b3.DataGenerator(loader, batch_size=args.batch_size, shape=gen_shape).flow()
+    val_gen = p1b3.DataGenerator(loader, partition='val', batch_size=args.batch_size, shape=gen_shape).flow()
+    val_gen2 = p1b3.DataGenerator(loader, partition='val', batch_size=args.batch_size, shape=gen_shape).flow()
+    test_gen = p1b3.DataGenerator(loader, partition='test', batch_size=args.batch_size, shape=gen_shape).flow()
 
-    train_samples = int(datagen.n_train/args.batch_size) * args.batch_size
-    val_samples = int(datagen.n_val/args.batch_size) * args.batch_size
-    test_samples = int(datagen.n_test/args.batch_size) * args.batch_size
+    train_samples = int(loader.n_train/args.batch_size) * args.batch_size
+    val_samples = int(loader.n_val/args.batch_size) * args.batch_size
+    test_samples = int(loader.n_test/args.batch_size) * args.batch_size
 
     train_samples = args.train_samples if args.train_samples else train_samples
     val_samples = args.val_samples if args.val_samples else val_samples
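
Each sample count is rounded down to a whole number of batches, so an epoch consumes an integer number of generator yields. For example (hypothetical sizes, not from this diff):

n_train, batch_size = 1000, 32
train_samples = int(n_train / batch_size) * batch_size  # 992, i.e. 31 full batches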
