
Commit ef35c4e

Tensorflow benchmark (#8522)
1 parent 1ac31d3 commit ef35c4e

File tree

7 files changed, +609 -21 lines


benchmark/cluster/vgg16/Dockerfile

Lines changed: 26 additions & 9 deletions
@@ -1,18 +1,35 @@
-#FROM python:2.7.14
 FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
-RUN apt-get update && apt-get install -y python
-RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
-# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
-# so we must build one with distribute support to install in this image.
+
+# you can get mirror list here:
+# https://launchpad.net/ubuntu/+archivemirrors
+ARG UBUNTU_MIRROR
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
+RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
+RUN pip install -U kubernetes opencv-python
+
 RUN pip install paddlepaddle
+# if network is slowly, you may need to add proxy here.
+# ENV https_proxy=
 RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
 RUN pip uninstall -y paddlepaddle
+# unset proxy if it is setted.
+# ENV https_proxy=""
+
+# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
+# so we must build one with distribute support to install in this image.
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl
+ENV LD_LIBRARY_PATH=/usr/local/lib
+
+# tf k8s
+RUN pip install tensorflow==1.4.0
+ADD tf_k8s /usr/bin
+RUN chmod +x /usr/bin/tf_k8s
+ADD vgg16_tf.py /workspace/
 
 # below lines may change a lot for debugging
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
 ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl && \
-    chmod +x /usr/bin/paddle_k8s
-ENV LD_LIBRARY_PATH=/usr/local/lib
+RUN chmod +x /usr/bin/paddle_k8s
 ADD vgg16_fluid.py vgg16_v2.py /workspace/
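
The mirror override above is driven by the UBUNTU_MIRROR build argument. A minimal build invocation might look like the sketch below; the mirror URL and image tag are assumptions for illustration, and the wheel picked up by "ADD *.whl /" must already sit in the build context.

# Hypothetical build command; the mirror URL and the tag are placeholders.
docker build \
  --build-arg UBUNTU_MIRROR="http://mirrors.aliyun.com/ubuntu" \
  -t fluid_benchmark_tf:vgg16 \
  benchmark/cluster/vgg16/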

benchmark/cluster/vgg16/fluid_trainer.yaml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ spec:
         paddle-job: vgg16job
     spec:
       imagePullSecrets:
-      - name: job-registry-secret
+        - name: job-registry-secret
      hostNetwork: true
      containers:
      - name: trainer

benchmark/cluster/vgg16/tf_k8s

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+#!/bin/bash
+check_trainer_ret() {
+  ret=$1
+  stdbuf -oL echo "job returned $ret...setting pod return message..."
+  stdbuf -oL echo "==============================="
+
+  if [ $ret -eq 136 ] ; then
+    echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
+  elif [ $ret -eq 139 ] ; then
+    echo "Segmentation Fault" > /dev/termination-log
+  elif [ $ret -eq 1 ] ; then
+    echo "General Error" > /dev/termination-log
+  elif [ $ret -eq 134 ] ; then
+    echo "Program Abort" > /dev/termination-log
+  fi
+  stdbuf -oL echo "termination log wroted..."
+  exit $ret
+}
+
+g_pservers=""
+g_trainers=""
+
+wait_running_pods(){
+  pserver_label="tf-job-pserver=${JOB_NAME}"
+  trainer_label="tf-job-trainer=${JOB_NAME}"
+
+  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
+  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
+
+  g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
+  g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
+}
+
+start_tf_pserver(){
+  wait_running_pods
+
+  label="tf-job-pserver=${JOB_NAME}"
+  pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
+
+  cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
+    --job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
+
+  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
+}
+
+start_tf_trainer(){
+  wait_running_pods
+
+  label="tf-job-trainer=${JOB_NAME}"
+  trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
+
+  cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
+    --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
+
+  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
+  check_trainer_ret $?
+}
+
+start_tf(){
+  if [[ "${TF_JOB_NAME}" == "worker" ]]; then
+    start_tf_trainer
+  else
+    start_tf_pserver
+  fi
+}
+
+usage() {
+  echo "usage: tf_k8s [<args>]:"
+  echo "  start_tf         Start tensorflow jobs"
+}
+
+case "$1" in
+  start_tf)
+    start_tf
+    ;;
+  --help)
+    usage
+    ;;
+  *)
+    usage
+    ;;
+esac
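
Read together with the manifests below, tf_k8s waits until all pserver and trainer pods carrying the job labels are Running, fetches their endpoints with k8s_tools.py, and then expands ${ENTRY} into a full TensorFlow launch command. A sketch of the environment it expects and the command a trainer pod effectively ends up running; the endpoint addresses are made-up placeholders, not real output.

# Illustrative values only; in the cluster these come from the pod spec
# and from k8s_tools.py fetch_endpoints, not from manual exports.
export JOB_NAME=vgg16job-tf TF_JOB_NAME=worker
export PSERVERS_NUM=10 TRAINERS_NUM=20 PORT=32036 BATCH_SIZE=128
export TRAINER_PACKAGE=/workspace ENTRY="python vgg16_tf.py"

# With g_pservers/g_trainers resolved, start_tf_trainer effectively runs:
cd /workspace && python vgg16_tf.py \
  --ps_hosts=10.0.0.1:32036,10.0.0.2:32036 \
  --worker_hosts=10.0.1.1:32036,10.0.1.2:32036 \
  --job_name=worker --task_index=0 --batch_size=128
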
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16job-tf-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        tf-job-pserver: vgg16job-tf
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
+        imagePullPolicy: Always
+        command: ["tf_k8s", "start_tf"]
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PORT
+          value: "32036"
+        - name: ENTRY
+          value: "python vgg16_tf.py"
+        - name: JOB_NAME
+          value: vgg16job-tf
+        - name: PSERVERS_NUM
+          value: "10"
+        - name: TF_JOB_NAME
+          value: "ps"
+        - name: TRAINERS_NUM
+          value: "20"
+        - name: BATCH_SIZE
+          value: "128"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: NUM_PASSES
+          value: "1"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
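
The tf-job-pserver label on this ReplicaSet is what tf_k8s polls through wait_pods_running before it fetches endpoints on the PORT value (32036 here). A quick manual check along the same lines, assuming the default namespace:

# List the parameter-server pods tf_k8s waits for; all 10 must reach
# Running before the trainers are given their --ps_hosts list.
kubectl get pods -l tf-job-pserver=vgg16job-tf -o wide
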
Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16job-tf-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        tf-job-trainer: vgg16job-tf
+    spec:
+      imagePullSecrets:
+      - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
+        imagePullPolicy: Always
+        command: ["tf_k8s", "start_tf"]
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PORT
+          value: "32036"
+        - name: JOB_NAME
+          value: vgg16job-tf
+        - name: TF_JOB_NAME
+          value: "worker"
+        - name: ENTRY
+          value: "python vgg16_tf.py"
+        - name: PSERVERS_NUM
+          value: "10"
+        - name: BATCH_SIZE
+          value: "128"
+        - name: TRAINERS_NUM
+          value: "20"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: NUM_PASSES
+          value: "1"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
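
A possible launch sequence for the two manifests; the file names are hypothetical since this page does not show them, but the resource names and labels come from the specs above.

kubectl create -f tf_pserver.yaml      # hypothetical name for the ReplicaSet manifest above
kubectl create -f tf_trainer.yaml      # hypothetical name for the Job manifest above
kubectl get pods -l tf-job-trainer=vgg16job-tf          # watch the 20 trainer pods
kubectl logs -l tf-job-trainer=vgg16job-tf --tail=20    # sample trainer output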

benchmark/cluster/vgg16/vgg16_fluid.py

Lines changed: 24 additions & 11 deletions
@@ -68,6 +68,21 @@ def str2bool(v):
     type=str2bool,
     default=True,
     help='Whether to run as local mode.')
+
+parser.add_argument(
+    "--ps_hosts",
+    type=str,
+    default="",
+    help="Comma-separated list of hostname:port pairs")
+parser.add_argument(
+    "--trainer_hosts",
+    type=str,
+    default="",
+    help="Comma-separated list of hostname:port pairs")
+
+# Flags for defining the tf.train.Server
+parser.add_argument(
+    "--task_index", type=int, default=0, help="Index of task within the job")
 args = parser.parse_args()
 
 
@@ -180,8 +195,9 @@ def train_loop(exe, trainer_prog):
             iters += 1
             num_samples += len(data)
             print(
-                "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
-                % (pass_id, iters, loss, acc, time.time() - ts)
+                "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
+                % (pass_id, iters, loss, acc,
+                   len(data) / (time.time() - ts))
             )  # The accuracy is the accumulation of batches, but not the current batch.
 
         pass_elapsed = time.time() - start_time
@@ -209,27 +225,24 @@ def train_loop(exe, trainer_prog):
             batch_size=args.batch_size)
         train_loop(exe, fluid.default_main_program())
     else:
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # all pserver endpoints
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, "6174"]))
-        pserver_endpoints = ",".join(eplist)
-        print("pserver endpoints: ", pserver_endpoints)
         trainers = int(os.getenv("TRAINERS"))  # total trainer count
         print("trainers total: ", trainers)
-        current_endpoint = os.getenv(
-            "POD_IP") + ":6174"  # current pserver endpoint
+
         training_role = os.getenv(
            "TRAINING_ROLE",
            "TRAINER")  # get the training role: trainer/pserver
+
        t = fluid.DistributeTranspiler()
        t.transpile(
            optimize_ops,
            params_grads,
-           pservers=pserver_endpoints,
+           trainer_id=args.task_index,
+           pservers=args.ps_hosts,
            trainers=trainers)
 
        if training_role == "PSERVER":
+           current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
+               "PADDLE_INIT_PORT")
            if not current_endpoint:
                print("need env SERVER_ENDPOINT")
                exit(1)
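
With this change the Fluid trainer takes its pserver list from --ps_hosts and its trainer id from --task_index instead of PADDLE_INIT_PSERVERS and a hard-coded :6174, and the pserver endpoint is rebuilt from POD_IP plus PADDLE_INIT_PORT. A sketch of a two-node manual run is below; the addresses are placeholders, and the --local flag name is an assumption inferred from the 'Whether to run as local mode.' help text, since the flag itself is not shown in this hunk.

# Parameter-server side (placeholder endpoints, assumed --local flag):
TRAINING_ROLE=PSERVER TRAINERS=2 POD_IP=10.0.0.1 PADDLE_INIT_PORT=6174 \
  python vgg16_fluid.py --local False --ps_hosts=10.0.0.1:6174 --task_index=0

# Trainer side:
TRAINING_ROLE=TRAINER TRAINERS=2 POD_IP=10.0.1.1 PADDLE_INIT_PORT=6174 \
  python vgg16_fluid.py --local False --ps_hosts=10.0.0.1:6174 --task_index=0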
