Skip to content

Commit 101378c

Browse files
committed
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into remove_evaluator
2 parents d01318c + f9c1b6f commit 101378c

File tree

380 files changed

+5109
-4340
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

380 files changed

+5109
-4340
lines changed

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ third_party/
2727
cmake-build-*
2828

2929
# generated while compiling
30-
python/paddle/v2/fluid/core.so
3130
paddle/pybind/pybind.h
3231
CMakeFiles
3332
cmake_install.cmake

CMakeLists.txt

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
1919

2020
include(system)
2121

22-
project(paddle CXX C Go)
22+
project(paddle CXX C)
2323
message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
2424
"${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
2525
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
@@ -60,7 +60,7 @@ option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
6060
option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
6161
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
6262
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
63-
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" ON)
63+
option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
6464

6565
# CMAKE_BUILD_TYPE
6666
if(NOT CMAKE_BUILD_TYPE)
@@ -146,6 +146,7 @@ include(external/cares)
146146
include(external/grpc)
147147

148148
include(cudnn) # set cudnn libraries, must before configure
149+
include(cupti)
149150
include(configure) # add paddle env configuration
150151
include(generic) # simplify cmake module
151152
include(package) # set paddle packages
@@ -174,7 +175,7 @@ set(EXTERNAL_LIBS
174175
)
175176

176177
if(WITH_GPU)
177-
include(cuda)
178+
include(cuda)
178179
endif(WITH_GPU)
179180

180181
if(WITH_MKLML)
@@ -201,17 +202,18 @@ endif()
201202
# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
202203
# placed after this block, because they depend on it.
203204
if(WITH_GOLANG)
205+
enable_language(Go)
204206
add_subdirectory(go)
205207
endif(WITH_GOLANG)
206208

207209
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
208210

209-
SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
210-
SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
211+
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
212+
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
211213

212214
add_subdirectory(paddle)
213215
if(WITH_PYTHON)
214-
add_subdirectory(python)
216+
add_subdirectory(python)
215217
endif()
216218

217219
if(WITH_DOC)

Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ COPY ./paddle/scripts/docker/root/ /root/
2222

2323
RUN apt-get update && \
2424
apt-get install -y \
25-
git python-pip python-dev openssh-server bison libnccl-dev \
25+
git python-pip python-dev openssh-server bison \
26+
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
2627
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
2728
curl sed grep graphviz libjpeg-dev zlib1g-dev \
2829
python-matplotlib gcc-4.8 g++-4.8 \

Dockerfile.android

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,6 @@ RUN apt-get update && \
2121
wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
2222
apt-get clean -y
2323

24-
# Install Go and glide
25-
RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
26-
tar -xz -C /usr/local && \
27-
mkdir /root/gopath && \
28-
mkdir /root/gopath/bin && \
29-
mkdir /root/gopath/src
30-
ENV GOROOT=/usr/local/go GOPATH=/root/gopath
31-
# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
32-
ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
33-
3424
# git credential to skip password typing
3525
RUN git config --global credential.helper store
3626

benchmark/cluster/vgg16/Dockerfile

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,35 @@
1-
#FROM python:2.7.14
21
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
3-
RUN apt-get update && apt-get install -y python
4-
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
5-
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
6-
# so we must build one with distribute support to install in this image.
2+
3+
# you can get mirror list here:
4+
# https://launchpad.net/ubuntu/+archivemirrors
5+
ARG UBUNTU_MIRROR
6+
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
7+
8+
RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
9+
RUN pip install -U kubernetes opencv-python
10+
711
RUN pip install paddlepaddle
12+
# if the network is slow, you may need to add a proxy here.
13+
# ENV https_proxy=
814
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
915
RUN pip uninstall -y paddlepaddle
16+
# unset the proxy if it was set.
17+
# ENV https_proxy=""
18+
19+
# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
20+
# so we must build one with distribute support to install in this image.
21+
ADD *.whl /
22+
RUN pip install /*.whl && rm -f /*.whl
23+
ENV LD_LIBRARY_PATH=/usr/local/lib
24+
25+
# tf k8s
26+
RUN pip install tensorflow==1.4.0
27+
ADD tf_k8s /usr/bin
28+
RUN chmod +x /usr/bin/tf_k8s
29+
ADD vgg16_tf.py /workspace/
1030

1131
# below lines may change a lot for debugging
1232
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
1333
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
14-
ADD *.whl /
15-
RUN pip install /*.whl && rm -f /*.whl && \
16-
chmod +x /usr/bin/paddle_k8s
17-
ENV LD_LIBRARY_PATH=/usr/local/lib
34+
RUN chmod +x /usr/bin/paddle_k8s
1835
ADD vgg16_fluid.py vgg16_v2.py /workspace/

benchmark/cluster/vgg16/fluid_trainer.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ spec:
1111
paddle-job: vgg16job
1212
spec:
1313
imagePullSecrets:
14-
- name: job-registry-secret
14+
- name: job-registry-secret
1515
hostNetwork: true
1616
containers:
1717
- name: trainer

benchmark/cluster/vgg16/tf_k8s

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#!/bin/bash
2+
check_trainer_ret() {
3+
ret=$1
4+
stdbuf -oL echo "job returned $ret...setting pod return message..."
5+
stdbuf -oL echo "==============================="
6+
7+
if [ $ret -eq 136 ] ; then
8+
echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
9+
elif [ $ret -eq 139 ] ; then
10+
echo "Segmentation Fault" > /dev/termination-log
11+
elif [ $ret -eq 1 ] ; then
12+
echo "General Error" > /dev/termination-log
13+
elif [ $ret -eq 134 ] ; then
14+
echo "Program Abort" > /dev/termination-log
15+
fi
16+
stdbuf -oL echo "termination log wroted..."
17+
exit $ret
18+
}
19+
20+
g_pservers=""
21+
g_trainers=""
22+
23+
wait_running_pods(){
24+
pserver_label="tf-job-pserver=${JOB_NAME}"
25+
trainer_label="tf-job-trainer=${JOB_NAME}"
26+
27+
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
28+
stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
29+
30+
g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
31+
g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
32+
}
33+
34+
start_tf_pserver(){
35+
wait_running_pods
36+
37+
label="tf-job-pserver=${JOB_NAME}"
38+
pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
39+
40+
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
41+
--job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
42+
43+
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
44+
}
45+
46+
start_tf_trainer(){
47+
wait_running_pods
48+
49+
label="tf-job-trainer=${JOB_NAME}"
50+
trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
51+
52+
cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
53+
--job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
54+
55+
stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
56+
check_trainer_ret $?
57+
}
58+
59+
start_tf(){
60+
if [[ "${TF_JOB_NAME}" == "worker" ]]; then
61+
start_tf_trainer
62+
else
63+
start_tf_pserver
64+
fi
65+
}
66+
67+
usage() {
68+
echo "usage: tf_k8s [<args>]:"
69+
echo " start_tf Start tensorflow jobs"
70+
}
71+
72+
case "$1" in
73+
start_tf)
74+
start_tf
75+
;;
76+
--help)
77+
usage
78+
;;
79+
*)
80+
usage
81+
;;
82+
esac
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
apiVersion: extensions/v1beta1
2+
kind: ReplicaSet
3+
metadata:
4+
name: vgg16job-tf-pserver
5+
spec:
6+
replicas: 10
7+
template:
8+
metadata:
9+
labels:
10+
tf-job-pserver: vgg16job-tf
11+
spec:
12+
hostNetwork: true
13+
imagePullSecrets:
14+
- name: job-registry-secret
15+
containers:
16+
- name: pserver
17+
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
18+
imagePullPolicy: Always
19+
command: ["tf_k8s", "start_tf"]
20+
ports:
21+
- name: jobport-30236
22+
containerPort: 30236
23+
env:
24+
- name: PORT
25+
value: "32036"
26+
- name: ENTRY
27+
value: "python vgg16_tf.py"
28+
- name: JOB_NAME
29+
value: vgg16job-tf
30+
- name: PSERVERS_NUM
31+
value: "10"
32+
- name: TF_JOB_NAME
33+
value: "ps"
34+
- name: TRAINERS_NUM
35+
value: "20"
36+
- name: BATCH_SIZE
37+
value: "128"
38+
- name: TRAINER_PACKAGE
39+
value: "/workspace"
40+
- name: NUM_PASSES
41+
value: "1"
42+
- name: NAMESPACE
43+
valueFrom:
44+
fieldRef:
45+
fieldPath: "metadata.namespace"
46+
- name: POD_IP
47+
valueFrom:
48+
fieldRef:
49+
fieldPath: "status.podIP"
50+
resources:
51+
requests:
52+
memory: 10Gi
53+
cpu: 4
54+
limits:
55+
memory: 10Gi
56+
cpu: 4
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
apiVersion: batch/v1
2+
kind: Job
3+
metadata:
4+
name: vgg16job-tf-trainer
5+
spec:
6+
parallelism: 20
7+
completions: 20
8+
template:
9+
metadata:
10+
labels:
11+
tf-job-trainer: vgg16job-tf
12+
spec:
13+
imagePullSecrets:
14+
- name: job-registry-secret
15+
hostNetwork: true
16+
containers:
17+
- name: trainer
18+
image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
19+
imagePullPolicy: Always
20+
command: ["tf_k8s", "start_tf"]
21+
ports:
22+
- name: jobport-30236
23+
containerPort: 30236
24+
env:
25+
- name: PORT
26+
value: "32036"
27+
- name: JOB_NAME
28+
value: vgg16job-tf
29+
- name: TF_JOB_NAME
30+
value: "worker"
31+
- name: ENTRY
32+
value: "python vgg16_tf.py"
33+
- name: PSERVERS_NUM
34+
value: "10"
35+
- name: BATCH_SIZE
36+
value: "128"
37+
- name: TRAINERS_NUM
38+
value: "20"
39+
- name: TRAINER_PACKAGE
40+
value: "/workspace"
41+
- name: NUM_PASSES
42+
value: "1"
43+
- name: NAMESPACE
44+
valueFrom:
45+
fieldRef:
46+
fieldPath: "metadata.namespace"
47+
- name: POD_IP
48+
valueFrom:
49+
fieldRef:
50+
fieldPath: "status.podIP"
51+
resources:
52+
requests:
53+
memory: 40Gi
54+
cpu: 2
55+
limits:
56+
memory: 40Gi
57+
cpu: 2
58+
restartPolicy: Never

0 commit comments

Comments
 (0)