
Commit 7ff3855

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into seq_error
2 parents: 5e99706 + 6b49401

409 files changed: +17181 / -3050 lines

Note: large commits have some content hidden by default, so some file names in the diff below are not shown.

.copyright.hook

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ import subprocess
 import platform

 COPYRIGHT = '''
-Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

CMakeLists.txt

Lines changed: 3 additions & 2 deletions

@@ -39,7 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
 option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
 option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
 option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
+option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
 option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
 option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
 option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
@@ -137,7 +137,7 @@ include(external/openblas) # download, build, install openblas
 include(external/mkldnn)   # download, build, install mkldnn
 include(external/swig)     # download, build, install swig
 include(external/warpctc)  # download, build, install warpctc
-include(external/boost)    # download, build, install boost
+include(external/boost)    # download boost
 include(external/any)      # download libn::any
 include(external/eigen)    # download eigen3
 include(external/pybind11) # download pybind11
@@ -156,6 +156,7 @@ include(rdma)              # set rdma libraries
 include(flags)             # set paddle compile flags
 include(version)           # set PADDLE_VERSION
 include(coveralls)         # set code coverage
+include(inference_lib)     # add paddle fluid inference libraries


 include_directories("${PADDLE_SOURCE_DIR}")
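Because this commit flips the `WITH_TESTING` default from ON to OFF, unit tests now have to be requested explicitly at configure time. A minimal out-of-source build sketch; only `WITH_TESTING` relates directly to this diff, the other flag values are illustrative:

```bash
# Hedged sketch: re-enable unit tests now that WITH_TESTING defaults to OFF.
# WITH_GPU/WITH_STYLE_CHECK values are illustrative, not prescribed by this commit.
mkdir -p build && cd build
cmake .. -DWITH_GPU=OFF -DWITH_TESTING=ON -DWITH_STYLE_CHECK=ON
make -j"$(nproc)"
```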

benchmark/cluster/vgg16/Dockerfile

Lines changed: 18 additions & 0 deletions (new file)

#FROM python:2.7.14
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
RUN apt-get update && apt-get install -y python
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
RUN pip install paddlepaddle
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle

# below lines may change a lot for debugging
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && \
    chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD vgg16_fluid.py vgg16_v2.py /workspace/
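Since the Dockerfile ends with `ADD *.whl /` and `pip install /*.whl`, it expects a locally built wheel (with distributed support) to sit next to it before the image is built. A hedged sketch of that build-and-push step, following the README below; the wheel path and image tag are placeholders, not values defined by this commit:

```bash
# Copy the wheel produced under build/python/dist (placeholder path) next to the Dockerfile,
# then build and push the image so Kubernetes can pull it. [image:tag] is a placeholder.
cp /path/to/Paddle/build/python/dist/paddlepaddle-*.whl .
docker build -t [image:tag] .
docker push [image:tag]
```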

benchmark/cluster/vgg16/README.md

Lines changed: 76 additions & 0 deletions (new file)

# Performance for Distributed vgg16

## Test Result

### Hardware Information

- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz: 2101.000
- cache size: 20480 KB

### Single Node Single Thread

- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec

| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
| TensorFlow | - | - | - | - |

### Different Batch Size

- PServer Count: 10
- Trainer Count: 20
- Per trainer CPU Core: 1
- Metrics: samples / sec

| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
| TensorFlow | - | - | - | - |

### Acceleration Rate

- PServer Count: 20
- Batch Size: 128
- Metrics: samples / sec

| Trainer Count | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
| TensorFlow | - | - | - | - |

### Different PServer Count

- Trainer Count: 60
- Batch Size: 128
- Metrics: samples / sec

| PServer Count | 3 | 6 | 10 | 20 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid (should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
| TensorFlow | - | - | - | - |

*The performance gap between Fluid and v2 comes from network interference.*

## Steps to Run the Performance Test

1. Re-compile PaddlePaddle with `-DWITH_DISTRIBUTE` enabled to build it with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory.
1. Run `docker build -t [image:tag] .` to build the docker image, then run `docker push [image:tag]` to push the image to a registry so Kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your Kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to list the running pods, and run `kubectl logs [podID]` to fetch the pod logs of pservers and trainers.

Check the logs for the distributed training progress and analyze the performance.

## Enable Verbose Logs

Edit `pserver.yaml` and `trainer.yaml` to add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
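A condensed, non-authoritative sketch of the `kubectl` steps in the README above; pod names are generated by Kubernetes, so the `grep` pattern is only a guess based on the `vgg16job-trainer` name used in the YAML below:

```bash
# Launch the pserver ReplicaSet and the trainer Job, then follow one trainer's log.
kubectl create -f pserver.yaml
kubectl create -f trainer.yaml
kubectl get po                                   # list running pserver and trainer pods
kubectl logs -f "$(kubectl get po -o name | grep vgg16job-trainer | head -n1)"
```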
Lines changed: 72 additions & 0 deletions (new file; name hidden in this view: the Fluid pserver ReplicaSet)

apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
  name: vgg16job-pserver
spec:
  replicas: 10
  template:
    metadata:
      labels:
        paddle-job-pserver: vgg16job
    spec:
      hostNetwork: true
      imagePullSecrets:
      - name: job-registry-secret
      containers:
      - name: pserver
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
        imagePullPolicy: Always
        ports:
        - name: jobport-30236
          containerPort: 30236
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16job
        - name: MKL_NUM_THREADS
          value: "1"
        - name: TRAINING_ROLE
          value: "PSERVER"
        - name: TRAINERS
          value: "20"
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        - name: ENTRY
          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "1"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        - name: LD_LIBRARY_PATH
          value: "/usr/local/lib:/usr/local/nvidia/lib64"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: "status.podIP"
        command: ["paddle_k8s", "start_fluid"]
        resources:
          requests:
            memory: 10Gi
            cpu: 4
          limits:
            memory: 10Gi
            cpu: 4
Lines changed: 69 additions & 0 deletions (new file; name hidden in this view: the Fluid trainer Job)

apiVersion: batch/v1
kind: Job
metadata:
  name: vgg16job-trainer
spec:
  parallelism: 20
  completions: 20
  template:
    metadata:
      labels:
        paddle-job: vgg16job
    spec:
      imagePullSecrets:
      - name: job-registry-secret
      hostNetwork: true
      containers:
      - name: trainer
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
        imagePullPolicy: Always
        command: ["paddle_k8s", "start_fluid"]
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16job
        - name: TRAINING_ROLE
          value: "TRAINER"
        - name: TRAINERS
          value: "20"
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        - name: ENTRY
          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "1"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        - name: LD_LIBRARY_PATH
          value: "/usr/local/lib:/usr/local/nvidia/lib64"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: "status.podIP"
        resources:
          requests:
            memory: 40Gi
            cpu: 2
          limits:
            memory: 40Gi
            cpu: 2
      restartPolicy: Never
Lines changed: 64 additions & 0 deletions (new file; name hidden in this view: the v2 pserver ReplicaSet)

apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
  name: vgg16v2job-pserver
spec:
  replicas: 10
  template:
    metadata:
      labels:
        paddle-job-pserver: vgg16v2job
    spec:
      hostNetwork: true
      imagePullSecrets:
      - name: job-registry-secret
      containers:
      - name: pserver
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
        imagePullPolicy: Always
        ports:
        - name: jobport-30236
          containerPort: 30236
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16v2job
        - name: TRAINERS
          value: "20"
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        - name: ENTRY
          value: "python train.py"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "1"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        - name: LD_LIBRARY_PATH
          value: "/usr/local/lib:/usr/local/nvidia/lib64"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        command: ["paddle_k8s", "start_pserver"]
        resources:
          requests:
            memory: 10Gi
            cpu: 4
          limits:
            memory: 10Gi
            cpu: 4
