Skip to content

Commit 7c90d7a

Browse files
authored
Merge pull request #10128 from panyx0718/dist0423
Add some instructions for running VGG in distributed mode
2 parents c7e23bb + f091774 commit 7c90d7a

File tree

2 files changed

+29
-7
lines changed

2 files changed

+29
-7
lines changed
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
2+
3+
# Update this to point to the VGG source file.
4+
VGG_SRC="vgg16_fluid.py"
5+
6+
export TRAINING_ROLE=PSERVER
7+
export TRAINERS=2
8+
export POD_IP=127.0.0.1
9+
export PADDLE_INIT_PORT=6174
10+
MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
11+
12+
# Wait for the pserver (ps) process to start before launching the trainers.
13+
sleep 10
14+
echo "done start ps"
15+
16+
export TRAINING_ROLE=TRAINER
17+
export TRAINERS=2
18+
export POD_IP=127.0.0.1
19+
export PADDLE_INIT_PORT=6174
20+
CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
21+
CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &

benchmark/cluster/vgg16/vgg16_fluid.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -200,18 +200,19 @@ def train_loop(exe, trainer_prog):
200200
num_samples += len(data)
201201
train_pass_acc.add(value=acc, weight=b_size)
202202
print(
203-
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
204-
% (pass_id, iters, loss, acc,
205-
len(data) / (time.time() - ts))
203+
"Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
204+
"Speed = %.2f img/s " % (args.task_index, pass_id, iters,
205+
loss, acc,
206+
len(data) / (time.time() - ts))
206207
) # The accuracy is accumulated over batches; it is not the accuracy of the current batch alone.
207208

208209
pass_elapsed = time.time() - start_time
209210
pass_train_acc = train_pass_acc.eval()
210211
pass_test_acc = test(exe)
211-
print(
212-
"Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
213-
% (pass_id, num_samples / pass_elapsed, pass_train_acc,
214-
pass_test_acc))
212+
print("Task:%d Pass = %d, Training performance = %f imgs/s, "
213+
"Train accuracy = %f, Test accuracy = %f\n" %
214+
(args.task_index, pass_id, num_samples / pass_elapsed,
215+
pass_train_acc, pass_test_acc))
215216

216217
if args.local:
217218
# Parameter initialization

0 commit comments

Comments
 (0)