Skip to content

Commit 9c69fdf

Browse files
author
Yibing Liu
committed
Merge branch 'develop' of upstream into argsort_dev
2 parents e710d2c + 6d6996a commit 9c69fdf

File tree

181 files changed

+6151
-2716
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

181 files changed

+6151
-2716
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ repos:
2323
- id: clang-format-with-version-check
2424
name: clang-format
2525
description: Format files with ClangFormat.
26-
entry: bash ./.clang_format.hook -i
26+
entry: bash ./tools/codestyle/clang_format.hook -i
2727
language: system
2828
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
2929
- repo: local
@@ -52,7 +52,7 @@ repos:
5252
hooks:
5353
- id: copyright_checker
5454
name: copyright_checker
55-
entry: python ./.copyright.hook
55+
entry: python ./tools/codestyle/copyright.hook
5656
language: system
5757
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
5858
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$

Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ RUN easy_install -U pip && \
7676
pip install sphinx-rtd-theme==0.1.9 recommonmark
7777

7878
RUN pip install pre-commit 'ipython==5.3.0' && \
79-
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
79+
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
80+
pip install opencv-python
8081

8182
#For docstring checker
8283
RUN pip install pylint pytest astroid isort

benchmark/fluid/Dockerfile

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,18 @@
11
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
2+
3+
# Use UBUNTU_MIRROR can speed up apt-get speed.
4+
# ARG UBUNTU_MIRROR
5+
# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
6+
27
RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
38
RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
4-
RUN pip install -U pip
5-
RUN pip install -U kubernetes paddlepaddle
69

710
# IMPORTANT:
811
# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
12+
# example: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
13+
14+
RUN pip install -U pip
15+
RUN pip install -U kubernetes paddlepaddle
916

1017
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
1118
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
@@ -14,9 +21,11 @@ RUN pip uninstall -y paddlepaddle && mkdir /workspace
1421

1522
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
1623
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
24+
RUN chmod +x /usr/bin/paddle_k8s
1725

1826
ADD *.whl /
19-
RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
27+
RUN pip install /*.whl && rm -f /*.whl
2028

2129
ENV LD_LIBRARY_PATH=/usr/local/lib
22-
ADD fluid_benchmark.py recordio_converter.py models/ /workspace/
30+
ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
31+
ADD models/ /workspace/models/

benchmark/fluid/fluid_benchmark.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def dist_transpile(trainer_id, args):
9797
return train_program, fluid.default_startup_program()
9898
else:
9999
raise ValueError(
100-
'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
100+
'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
101101
)
102102

103103

@@ -264,8 +264,6 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
264264
break
265265
else:
266266
loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
267-
if args.update_method == "pserver":
268-
exe.bcast_params()
269267
if args.use_reader_op:
270268
num_samples += args.batch_size * args.gpus
271269
else:
@@ -301,9 +299,18 @@ def print_train_time(start_time, end_time, num_samples):
301299
(num_samples, train_elapsed, examples_per_sec))
302300

303301

302+
def print_paddle_envs():
303+
print('----------- Configuration envs -----------')
304+
for k in os.environ:
305+
if "PADDLE_" in k:
306+
print "ENV %s:%s" % (k, os.environ[k])
307+
print('------------------------------------------------')
308+
309+
304310
def main():
305311
args = parse_args()
306312
print_arguments(args)
313+
print_paddle_envs()
307314

308315
# the unique trainer id, starting from 0, needed by trainer
309316
# only

benchmark/fluid/kube_gen_job.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import argparse
1818
import random
1919
import os
20+
import copy
2021
from kube_templates import pserver, trainer, envs
2122

2223

@@ -108,10 +109,9 @@ def gen_job():
108109
tn_container["ports"][0]["containerPort"] = spreadport
109110

110111
envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
111-
envs.append({"name": "TRAINERS", "value": str(args.trainers)})
112-
envs.append({"name": "PSERVERS", "value": str(args.pservers)})
112+
envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
113+
envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
113114
envs.append({"name": "ENTRY", "value": args.entry})
114-
envs.append({"name": "PADDLE_INIT_PORT", "value": str(args.port)})
115115
envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
116116
# NOTE: these directories below are cluster specific, please modify
117117
# this settings before you run on your own cluster.
@@ -166,17 +166,23 @@ def gen_job():
166166
tn["spec"]["template"]["spec"]["volumes"] = volumes
167167
tn_container["volumeMounts"] = volumeMounts
168168

169-
ps_container["env"] = envs
170-
ps_container["env"].append({"name": "TRAINING_ROLE", "value": "PSERVER"})
169+
ps_container["env"] = copy.deepcopy(envs)
170+
ps_container["env"].append({
171+
"name": "PADDLE_TRAINING_ROLE",
172+
"value": "PSERVER"
173+
})
171174
tn_container["env"] = envs
172175
if args.disttype == "pserver":
173176
tn_container["env"].append({
174-
"name": "TRAINING_ROLE",
177+
"name": "PADDLE_TRAINING_ROLE",
175178
"value": "TRAINER"
176179
})
177180
elif args.disttype == "nccl2" or args.disttype == "local":
178181
# NCCL2 have no training role, set to plain WORKER
179-
tn_container["env"].append({"name": "TRAINING_ROLE", "value": "WORKER"})
182+
tn_container["env"].append({
183+
"name": "PADDLE_TRAINING_ROLE",
184+
"value": "WORKER"
185+
})
180186

181187
os.mkdir(args.jobname)
182188
if args.disttype == "pserver":

cmake/external/mkldnn.cmake

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,15 +45,16 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
4545
ELSE()
4646
MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
4747
ENDIF()
48-
SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-unused-result")
48+
SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result")
49+
SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
4950
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
5051
SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
5152
ExternalProject_Add(
5253
${MKLDNN_PROJECT}
5354
${EXTERNAL_PROJECT_LOG_ARGS}
5455
DEPENDS ${MKLDNN_DEPENDS}
5556
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
56-
GIT_TAG "db3424ad44901513c03a1ea31ccaacdf633fbe9f"
57+
GIT_TAG "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
5758
PREFIX ${MKLDNN_SOURCES_DIR}
5859
UPDATE_COMMAND ""
5960
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}

doc/fluid/api/gen_doc.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22
python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst
33

4-
for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
4+
for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler
55
do
66
python gen_doc.py ${module} > ${module}.rst
77
done

doc/fluid/api/transpiler.rst

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
2+
!DO NOT EDIT THIS FILE MANUALLY!
3+
4+
==========
5+
transpiler
6+
==========
7+
8+
DistributeTranspiler
9+
--------------------
10+
11+
.. autoclass:: paddle.fluid.transpiler.DistributeTranspiler
12+
:members:
13+
:noindex:
14+
15+
InferenceTranspiler
16+
-------------------
17+
18+
.. autoclass:: paddle.fluid.transpiler.InferenceTranspiler
19+
:members:
20+
:noindex:
21+
22+
memory_optimize
23+
---------------
24+
25+
.. autofunction:: paddle.fluid.transpiler.memory_optimize
26+
:noindex:
27+
28+
release_memory
29+
--------------
30+
31+
.. autofunction:: paddle.fluid.transpiler.release_memory
32+
:noindex:
33+
34+
HashName
35+
--------
36+
37+
.. autoclass:: paddle.fluid.transpiler.HashName
38+
:members:
39+
:noindex:
40+
41+
RoundRobin
42+
----------
43+
44+
.. autoclass:: paddle.fluid.transpiler.RoundRobin
45+
:members:
46+
:noindex:

doc/fluid/howto/cluster/fluid_cluster_train_cn.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,13 +168,13 @@ cd /paddle/python/paddle/fluid/tests/book
168168

169169
第二步,启动Parameter Server:
170170
```bash
171-
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py
171+
PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py
172172
```
173173
执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Parameter Server已经正常启动。
174174

175175
第三步,启动Trainer:
176176
```bash
177-
PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py
177+
PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py
178178
```
179179
由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。
180180

doc/fluid/howto/cluster/fluid_recordio.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,8 @@ def gen_train_list(file_pattern, trainers, trainer_id):
114114
ret_list.append(f)
115115
return ret_list
116116

117-
trainers = int(os.getenv("TRAINERS"))
118-
trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
117+
trainers = int(os.getenv("PADDLE_TRAINERS"))
118+
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
119119
data_file = fluid.layers.io.open_files(
120120
filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
121121
thread_num=1,

0 commit comments

Comments
 (0)