
Commit 75a2e50
Merge branch 'modify_dev' into modify_readers_to_fit_parallel_executor
2 parents: 284a213 + 72b5de0


66 files changed: +2611 -545 lines

benchmark/fluid/machine_translation.py

Lines changed: 48 additions & 18 deletions
@@ -48,6 +48,13 @@
     type=int,
     default=16,
     help="The sequence number of a mini-batch data. (default: %(default)d)")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The number of initial mini-batches to skip, for a more stable performance measurement')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
 parser.add_argument(
     "--dict_size",
     type=int,
@@ -72,16 +79,21 @@
     default=3,
     help="The width for beam searching. (default: %(default)d)")
 parser.add_argument(
-    "--use_gpu",
-    type=distutils.util.strtobool,
-    default=True,
-    help="Whether to use gpu. (default: %(default)d)")
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
 parser.add_argument(
     "--max_length",
     type=int,
     default=250,
     help="The maximum length of sequence when doing generation. "
     "(default: %(default)d)")
+parser.add_argument(
+    '--with_test',
+    action='store_true',
+    help='If set, evaluate on the test set during training.')


 def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
@@ -281,7 +293,7 @@ def train():
             paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
         batch_size=args.batch_size)

-    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
     exe = Executor(place)
     exe.run(framework.default_startup_program())

@@ -307,14 +319,20 @@ def do_validation():

         return total_loss / count

+    iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in xrange(args.pass_num):
-        pass_start_time = time.time()
-        words_seen = 0
+        train_accs = []
+        train_losses = []
         for batch_id, data in enumerate(train_batch_generator()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
             src_seq, word_num = to_lodtensor(map(lambda x: x[0], data), place)
-            words_seen += word_num
+            num_samples += word_num
             trg_seq, word_num = to_lodtensor(map(lambda x: x[1], data), place)
-            words_seen += word_num
+            num_samples += word_num
             lbl_seq, _ = to_lodtensor(map(lambda x: x[2], data), place)

             fetch_outs = exe.run(framework.default_main_program(),
@@ -325,24 +343,36 @@ def do_validation():
                                  },
                                  fetch_list=[avg_cost])

-            avg_cost_val = np.array(fetch_outs[0])
-            print('pass_id=%d, batch_id=%d, train_loss: %f' %
-                  (pass_id, batch_id, avg_cost_val))
+            iters += 1
+            loss = np.array(fetch_outs[0])
+            print("Pass = %d, Iter = %d, Loss = %f" % (pass_id, iters, loss))

-        pass_end_time = time.time()
-        test_loss = do_validation()
-        time_consumed = pass_end_time - pass_start_time
-        words_per_sec = words_seen / time_consumed
-        print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
-              (pass_id, test_loss, words_per_sec, time_consumed))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            test_loss = do_validation()
+        exit(0)


 def infer():
     pass


+def print_arguments(args):
+    print('----------- seq2seq Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
 if __name__ == '__main__':
     args = parser.parse_args()
+    print_arguments(args)
     if args.infer_only:
         infer()
     else:
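The measurement pattern added above recurs in every benchmark this commit touches: run a few warm-up batches (CUDA context creation, cuDNN autotuning, reader start-up), then restart the clock and the sample counter, and stop after a fixed number of iterations. Below is a minimal, self-contained sketch of that pattern; `benchmark`, `reader`, and `run_batch` are illustrative names, not part of the diff.

import time

def benchmark(reader, run_batch, skip_batch_num=5, iterations=80):
    # Time run_batch over reader, excluding the first skip_batch_num
    # warm-up batches from the measurement window.
    iters, num_samples, start_time = 0, 0, time.time()
    for data in reader():
        if iters == skip_batch_num:
            start_time = time.time()  # warm-up done: restart the clock
            num_samples = 0           # ...and the sample counter
        if iters == iterations:
            break
        run_batch(data)
        num_samples += len(data)
        iters += 1
    elapsed = time.time() - start_time
    print('Total examples: %d, total time: %.5f, %.5f examples/sec' %
          (num_samples, elapsed, num_samples / elapsed))

Since `iters` is never reset across passes, the `iterations` cap bounds the whole run; that is also why the training loops above can `exit(0)` once the summary line is printed.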

benchmark/fluid/mnist.py

Lines changed: 43 additions & 21 deletions
@@ -35,6 +35,12 @@ def parse_args():
     parser = argparse.ArgumentParser("mnist model benchmark.")
     parser.add_argument(
         '--batch_size', type=int, default=128, help='The minibatch size.')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The number of initial mini-batches to skip, for a more stable performance measurement'
+    )
     parser.add_argument(
         '--iterations', type=int, default=35, help='The number of minibatches.')
     parser.add_argument(
@@ -53,19 +59,14 @@ def parse_args():
         '--use_nvprof',
         action='store_true',
         help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, evaluate on the test set during training.')
     args = parser.parse_args()
     return args


-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
 def cnn_model(data):
     conv_pool_1 = fluid.nets.simple_img_conv_pool(
         input=data,
@@ -161,38 +162,59 @@ def run_benchmark(model, args):
         paddle.dataset.mnist.train(), batch_size=args.batch_size)

     accuracy = fluid.average.WeightedAverage()
+    iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in range(args.pass_num):
         accuracy.reset()
-        pass_start = time.time()
+        train_accs = []
+        train_losses = []
         for batch_id, data in enumerate(train_reader()):
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if iters == args.iterations:
+                break
             img_data = np.array(
                 map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([len(y_data), 1])

-            start = time.time()
             outs = exe.run(
                 fluid.default_main_program(),
                 feed={"pixel": img_data,
                       "label": y_data},
                 fetch_list=[avg_cost, batch_acc, batch_size_tensor]
             )  # The accuracy is the accumulation of batches, but not the current batch.
             accuracy.add(value=outs[1], weight=outs[2])
-            end = time.time()
+            iters += 1
+            num_samples += len(y_data)
             loss = np.array(outs[0])
             acc = np.array(outs[1])
-            print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
-                  (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
+            train_losses.append(loss)
+            train_accs.append(acc)
+            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
+                  (pass_id, iters, loss, acc))
+
+        print("Pass: %d, Loss: %f, Train Accuracy: %f\n" %
+              (pass_id, np.mean(train_losses), np.mean(train_accs)))
+        train_elapsed = time.time() - start_time
+        examples_per_sec = num_samples / train_elapsed

-        pass_end = time.time()
+        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+              (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
+                                     inference_program)
+        exit(0)

-        train_avg_acc = accuracy.eval()
-        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
-                                 inference_program)

-        print("pass=%d, train_avg_acc=%f, test_avg_acc=%f, elapse=%f" %
-              (pass_id, train_avg_acc, test_avg_acc,
-               (pass_end - pass_start) / 1000))
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- mnist Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':
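The fetch comment above ("the accuracy is the accumulation of batches, but not the current batch") refers to `fluid.average.WeightedAverage`, which keeps a running mean of per-batch accuracy weighted by batch size. A rough sketch of that bookkeeping, with semantics assumed from how it is used here rather than taken from Paddle's implementation:

class WeightedAverage(object):
    """Running weighted mean, e.g. per-batch accuracy weighted by batch size."""

    def __init__(self):
        self.reset()

    def reset(self):
        self._sum, self._weight = 0.0, 0.0

    def add(self, value, weight):
        self._sum += float(value) * float(weight)
        self._weight += float(weight)

    def eval(self):
        return self._sum / self._weight

acc = WeightedAverage()
acc.add(value=0.90, weight=128)  # a batch of 128 samples, 90% correct
acc.add(value=0.95, weight=128)
acc.add(value=1.00, weight=32)   # a smaller final batch
print(acc.eval())  # ~0.9333, not the unweighted mean 0.95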

benchmark/fluid/resnet.py

Lines changed: 14 additions & 24 deletions
@@ -87,15 +87,6 @@ def parse_args():
     return args


-def print_arguments(args):
-    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
-                                vars(args)['device'] == 'GPU')
-    print('----------- Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
 def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
     conv1 = fluid.layers.conv2d(
         input=input,
@@ -279,32 +270,31 @@ def test(exe):
                           'label': label},
                     fetch_list=[avg_cost, batch_acc, batch_size_tensor])
             iters += 1
-            num_samples += label[0]
+            num_samples += len(label)
             accuracy.add(value=acc, weight=weight)
             train_losses.append(loss)
             train_accs.append(acc)
             print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
                   (pass_id, iters, loss, acc))
-        pass_train_acc = accuracy.eval()
-        # evaluation
-        if args.with_test:
-            pass_test_acc = test(exe)
-        train_elapsed = time.time() - start_time
         print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
               (pass_id, np.mean(train_losses), np.mean(train_accs)))
-
+        train_elapsed = time.time() - start_time
         examples_per_sec = num_samples / train_elapsed
-
         print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
               (num_samples, train_elapsed, examples_per_sec))
+        # evaluation
+        if args.with_test:
+            pass_test_acc = test(exe)
+        exit(0)

-        if args.use_cprof:
-            pr.disable()
-            s = StringIO.StringIO()
-            sortby = 'cumulative'
-            ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
-            ps.print_stats()
-            print(s.getvalue())
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- resnet Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')


 if __name__ == '__main__':
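The one-line change from `label[0]` to `len(label)` above fixes the throughput denominator: for the usual `(batch_size, 1)` label array, indexing yields the first row, not the row count, so examples-per-second was being computed from a label value instead of the batch size. A quick illustration, assuming a NumPy-style label batch:

import numpy as np

label = np.zeros((64, 1), dtype='int64')  # a batch of 64 labels
print(label[0])    # [0]  -- the first label row, not a count
print(len(label))  # 64   -- the number of samples in the batch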

benchmark/fluid/run.sh

Lines changed: 63 additions & 7 deletions
@@ -1,7 +1,9 @@
 #!/bin/bash
 # This script benchmarking the PaddlePaddle Fluid on
 # single thread single GPU.
-export CUDNN_PATH=/paddle/cudnn_v5/cuda/lib
+
+#export FLAGS_fraction_of_gpu_memory_to_use=0.0
+export CUDNN_PATH=/paddle/cudnn_v5

 # disable openmp and mkl parallel
 #https://github.com/PaddlePaddle/Paddle/issues/7199
@@ -25,25 +27,79 @@ export CUDA_VISIBLE_DEVICES=0
 export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH

+# only query the GPU in use
+nohup stdbuf -oL nvidia-smi \
+        --id=${CUDA_VISIBLE_DEVICES} \
+        --query-gpu=timestamp \
+        --query-compute-apps=pid,process_name,used_memory \
+        --format=csv \
+        --filename=mem.log \
+        -l 1 &
+# mnist
+# mnist gpu mnist 128
+FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+        --device=GPU \
+        --batch_size=128 \
+        --skip_batch_num=5 \
+        --iterations=500 \
+        2>&1 | tee -a mnist_gpu_128.log

 # vgg16
-# cifar10 gpu cifar10 128
-FLAGS_benchmark=true python fluid/vgg.py \
+# gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
         --device=GPU \
         --batch_size=128 \
         --skip_batch_num=5 \
-        --iterations=30 \
-        2>&1 > vgg16_gpu_128.log
+        --iterations=30 \
+        2>&1 | tee -a vgg16_gpu_128.log
+
+# flowers gpu 128
+FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+        --device=GPU \
+        --batch_size=32 \
+        --data_set=flowers \
+        --skip_batch_num=5 \
+        --iterations=30 \
+        2>&1 | tee -a vgg16_gpu_flowers_32.log

 # resnet50
 # resnet50 gpu cifar10 128
-FLAGS_benchmark=true python fluid/resnet.py \
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
         --device=GPU \
         --batch_size=128 \
         --data_set=cifar10 \
         --model=resnet_cifar10 \
         --skip_batch_num=5 \
         --iterations=30 \
-        2>&1 > resnet50_gpu_128.log
+        2>&1 | tee -a resnet50_gpu_128.log
+
+# resnet50 gpu flowers 64
+FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+        --device=GPU \
+        --batch_size=64 \
+        --data_set=flowers \
+        --model=resnet_imagenet \
+        --skip_batch_num=5 \
+        --iterations=30 \
+        2>&1 | tee -a resnet50_gpu_flowers_64.log

 # lstm
+# lstm gpu imdb 32  # TensorFlow only supports batch=32
+FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+        --device=GPU \
+        --batch_size=32 \
+        --skip_batch_num=5 \
+        --iterations=30 \
+        --hidden_dim=512 \
+        --emb_dim=512 \
+        --crop_size=1500 \
+        2>&1 | tee -a lstm_gpu_32.log
+
+# seq2seq
+# seq2seq gpu wmt14 128
+FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+        --device=GPU \
+        --batch_size=128 \
+        --skip_batch_num=5 \
+        --iterations=30 \
+        2>&1 | tee -a seq2seq_gpu_128.log
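Two logging changes recur through the script. `stdbuf -oL` line-buffers the benchmark's stdout so `tee` shows progress as it happens, and `2>&1 | tee -a foo.log` captures stderr in the log as well; the old form `2>&1 > foo.log` applied the redirections left to right, so stderr went to the terminal and only stdout reached the file. Separately, the background `nvidia-smi ... -l 1` poller appends a CSV stream to mem.log once per second. A rough sketch of pulling the peak usage out of that log afterwards; the exact column layout depends on the nvidia-smi version, so this simply scans every line for `MiB` values:

import re

peak_mib = 0
with open('mem.log') as f:
    for line in f:
        # compute-apps rows carry a used_memory field such as "1234 MiB"
        for match in re.finditer(r'(\d+)\s*MiB', line):
            peak_mib = max(peak_mib, int(match.group(1)))
print('peak GPU memory while benchmarking: %d MiB' % peak_mib)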
