Skip to content

Commit f3f889b

Browse files
committed
Add instructions to run vgg
1 parent 32372c0 commit f3f889b

File tree

1 file changed

+27
-8
lines changed

1 file changed

+27
-8
lines changed

benchmark/cluster/vgg16/vgg16_fluid.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,25 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
"""VGG16 benchmark in Fluid"""
14+
"""VGG16 benchmark in Fluid
15+
16+
# Single trainer and single parameter server (PS) on a single machine.
17+
VGG_SRC="${CODE_DIR}/vgg16_fluid.py"
18+
export TRAINING_ROLE=PSERVER
19+
export TRAINERS=1
20+
export POD_IP=127.0.0.1
21+
export PADDLE_INIT_PORT=6174
22+
MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
23+
sleep 10 # wait for PS to start.
24+
export TRAINING_ROLE=TRAINER
25+
MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU &
26+
27+
# To run multiple trainers on a single machine,
28+
# set TRAINERS=2 and launch 2 trainers, for example:
29+
# CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
30+
# CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
31+
"""
32+
1533
from __future__ import print_function
1634

1735
import sys
@@ -200,18 +218,19 @@ def train_loop(exe, trainer_prog):
200218
num_samples += len(data)
201219
train_pass_acc.add(value=acc, weight=b_size)
202220
print(
203-
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
204-
% (pass_id, iters, loss, acc,
205-
len(data) / (time.time() - ts))
221+
"Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
222+
"Speed = %.2f img/s " % (args.task_index, pass_id, iters,
223+
loss, acc,
224+
len(data) / (time.time() - ts))
206225
) # The accuracy is accumulated over batches; it is not the accuracy of the current batch alone.
207226

208227
pass_elapsed = time.time() - start_time
209228
pass_train_acc = train_pass_acc.eval()
210229
pass_test_acc = test(exe)
211-
print(
212-
"Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
213-
% (pass_id, num_samples / pass_elapsed, pass_train_acc,
214-
pass_test_acc))
230+
print("Task:%d Pass = %d, Training performance = %f imgs/s, "
231+
"Train accuracy = %f, Test accuracy = %f\n" %
232+
(args.task_index, pass_id, num_samples / pass_elapsed,
233+
pass_train_acc, pass_test_acc))
215234

216235
if args.local:
217236
# Parameter initialization

0 commit comments

Comments
 (0)