Commit 4444e79

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into overlap_memcpy_with_dist

2 parents: d5a88b9 + 19fd071

150 files changed: 3,675 additions, 1,147 deletions


CMakeLists.txt

Lines changed: 12 additions & 2 deletions
@@ -55,12 +55,13 @@ option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF)
 option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
 option(GLIDE_INSTALL "Download and install go dependencies " ON)
 option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
-option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF)
+option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
 option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
 option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
 option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
 option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
 option(WITH_CONTRIB "Compile the third-party contributation" OFF)
+option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -147,7 +148,16 @@ include(external/any) # download libn::any
 include(external/eigen) # download eigen3
 include(external/pybind11) # download pybind11
 include(external/cares)
-include(external/grpc)
+
+if(WITH_DISTRIBUTE)
+  if(WITH_GRPC)
+    include(external/grpc)
+  else()
+    include(external/leveldb)
+    include(external/brpc)
+  endif()
+endif()
+
 include(external/snappy) # download snappy
 include(external/snappystream)
 include(external/threadpool)

benchmark/fluid/README.md

Lines changed: 3 additions & 1 deletion
@@ -24,10 +24,12 @@ Currently supported `--model` argument include:
 
 * Run the following command to start a benchmark job locally:
 ```bash
-python fluid_benchmark.py --model mnist --device GPU
+python fluid_benchmark.py --model mnist --device GPU
 ```
 You can choose to use GPU/CPU training. With GPU training, you can specify
 `--gpus <gpu_num>` to run multi GPU training.
+You can set async mode parameter server. With async mode, you can specify
+`--async_mode` to train model asynchronous.
 * Run distributed training with parameter servers:
 * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
 * start parameter servers:
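As a rough illustration of how the flags mentioned in this README change are consumed, the sketch below feeds the documented local invocation (plus the new `--async_mode` switch) through the argument parser added in `benchmark/fluid/args.py` by this commit. It is a sketch only; the flag values are illustrative and `args.py` is assumed to be importable from the working directory.

```python
# Illustrative sketch: simulate the README's command line and inspect how the
# new parser (benchmark/fluid/args.py, added in this commit) interprets it.
import sys

from args import parse_args  # module added in this commit

# Rough equivalent of:
#   python fluid_benchmark.py --model mnist --device GPU \
#       --update_method pserver --async_mode
sys.argv = [
    "fluid_benchmark.py", "--model", "mnist", "--device", "GPU",
    "--update_method", "pserver", "--async_mode",
]

args = parse_args()
print(args.model)         # mnist
print(args.device)        # GPU
print(args.async_mode)    # True  -> parameter server runs in async (ASGD) mode
print(args.no_split_var)  # False -> variables are still sliced across pservers
```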

benchmark/fluid/args.py

Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+__all__ = ['parse_args', ]
+
+BENCHMARK_MODELS = [
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser('Fluid model benchmarks.')
+    parser.add_argument(
+        '--model',
+        type=str,
+        choices=BENCHMARK_MODELS,
+        default='resnet',
+        help='The model to run benchmark with.')
+    parser.add_argument(
+        '--batch_size', type=int, default=32, help='The minibatch size.')
+    # args related to learning rate
+    parser.add_argument(
+        '--learning_rate', type=float, default=0.001, help='The learning rate.')
+    # TODO(wuyi): add "--use_fake_data" option back.
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    parser.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=100, help='The number of passes.')
+    parser.add_argument(
+        '--data_format',
+        type=str,
+        default='NCHW',
+        choices=['NCHW', 'NHWC'],
+        help='The data data_format, now only support NCHW.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--gpus',
+        type=int,
+        default=1,
+        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+    # this option is available only for vgg and resnet.
+    parser.add_argument(
+        '--cpus',
+        type=int,
+        default=1,
+        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
+    parser.add_argument(
+        '--data_set',
+        type=str,
+        default='flowers',
+        choices=['cifar10', 'flowers'],
+        help='Optional dataset for benchmark.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--no_test',
+        action='store_true',
+        help='If set, do not test the testset during training.')
+    parser.add_argument(
+        '--memory_optimize',
+        action='store_true',
+        help='If set, optimize runtime memory before start.')
+    parser.add_argument(
+        '--use_fake_data',
+        action='store_true',
+        help='If set ommit the actual read data operators.')
+    parser.add_argument(
+        '--profile', action='store_true', help='If set, profile a few steps.')
+    parser.add_argument(
+        '--update_method',
+        type=str,
+        default='local',
+        choices=['local', 'pserver', 'nccl2'],
+        help='Choose parameter update method, can be local, pserver, nccl2.')
+    parser.add_argument(
+        '--no_split_var',
+        action='store_true',
+        default=False,
+        help='Whether split variables into blocks when update_method is pserver')
+    parser.add_argument(
+        '--async_mode',
+        action='store_true',
+        default=False,
+        help='Whether start pserver in async mode to support ASGD')
+    parser.add_argument(
+        '--use_reader_op',
+        action='store_true',
+        help='Whether to use reader op, and must specify the data path if set this to true.'
+    )
+    parser.add_argument(
+        '--data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the training recordio files.')
+    args = parser.parse_args()
+    return args
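One detail worth noting about this new module: because it sets `__all__ = ['parse_args', ]`, the star import that `fluid_benchmark.py` switches to (see the next file) only re-exports `parse_args`. A minimal sketch of that import surface, assuming `args.py` is on the import path:

```python
# Sketch of the import surface of the new args module (assumes args.py is importable).
from args import *   # __all__ restricts this to parse_args
import args as benchmark_args

print(parse_args)                       # <function parse_args at 0x...>
print(benchmark_args.BENCHMARK_MODELS)  # ['machine_translation', 'resnet', 'vgg', ...]
```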

benchmark/fluid/fluid_benchmark.py

Lines changed: 9 additions & 105 deletions
@@ -24,108 +24,7 @@
 import paddle.fluid.profiler as profiler
 import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
 
-BENCHMARK_MODELS = [
-    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
-]
-
-
-def parse_args():
-    parser = argparse.ArgumentParser('Fluid model benchmarks.')
-    parser.add_argument(
-        '--model',
-        type=str,
-        choices=BENCHMARK_MODELS,
-        default='resnet',
-        help='The model to run benchmark with.')
-    parser.add_argument(
-        '--batch_size',
-        type=int,
-        default=32,
-        help='The batch size on each gpu.')
-    parser.add_argument(
-        '--learning_rate', type=float, default=0.001, help='The learning rate.')
-    parser.add_argument(
-        '--skip_batch_num',
-        type=int,
-        default=5,
-        help='The first num of minibatch num to skip, for better performance test'
-    )
-    parser.add_argument(
-        '--iterations',
-        type=int,
-        default=80,
-        help='The number of minibatches, set to -1 to run all batches.')
-    parser.add_argument(
-        '--pass_num', type=int, default=100, help='The number of passes.')
-    parser.add_argument(
-        '--data_format',
-        type=str,
-        default='NCHW',
-        choices=['NCHW', 'NHWC'],
-        help='The data data_format, now only support NCHW.')
-    parser.add_argument(
-        '--device',
-        type=str,
-        default='GPU',
-        choices=['CPU', 'GPU'],
-        help='The device type.')
-    parser.add_argument(
-        '--gpus',
-        type=int,
-        default=1,
-        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
-    # this option is available only for vgg and resnet.
-    parser.add_argument(
-        '--cpus',
-        type=int,
-        default=1,
-        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
-    parser.add_argument(
-        '--data_set',
-        type=str,
-        default='flowers',
-        choices=['cifar10', 'flowers', 'imagenet'],
-        help='Optional dataset for benchmark.')
-    parser.add_argument(
-        '--infer_only', action='store_true', help='If set, run forward only.')
-    parser.add_argument(
-        '--use_cprof', action='store_true', help='If set, use cProfile.')
-    parser.add_argument(
-        '--use_nvprof',
-        action='store_true',
-        help='If set, use nvprof for CUDA.')
-    parser.add_argument(
-        '--no_test',
-        action='store_true',
-        help='If set, do not test the testset during training.')
-    parser.add_argument(
-        '--memory_optimize',
-        action='store_true',
-        help='If set, optimize runtime memory before start.')
-    parser.add_argument(
-        '--use_fake_data',
-        action='store_true',
-        help='If set ommit the actual read data operators.')
-    parser.add_argument(
-        '--profile', action='store_true', help='If set, profile a few steps.')
-    parser.add_argument(
-        '--update_method',
-        type=str,
-        default='local',
-        choices=['local', 'pserver', 'nccl2'],
-        help='Choose parameter update method, can be local, pserver, nccl2.')
-    parser.add_argument(
-        '--use_reader_op',
-        action='store_true',
-        help='Whether to use reader op, and must specify the data path if set this to true.'
-    )
-    parser.add_argument(
-        '--data_path',
-        type=str,
-        default="",
-        help='Directory that contains all the training recordio files.')
-    args = parser.parse_args()
-    return args
+from args import *
 
 
 def append_nccl2_prepare(trainer_id):
@@ -160,7 +59,7 @@ def append_nccl2_prepare(trainer_id):
             "nccl-based dist train.")
 
 
-def dist_transpile(trainer_id):
+def dist_transpile(trainer_id, args):
     if trainer_id < 0:
         return None, None
 
@@ -182,7 +81,12 @@ def dist_transpile(trainer_id):
     training_role = os.getenv("PADDLE_TRAINING_ROLE")
 
     t = distribute_transpiler.DistributeTranspiler()
-    t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
+    t.transpile(
+        trainer_id,
+        pservers=pserver_endpoints,
+        trainers=trainers,
+        sync_mode=not args.async_mode,
+        slice_var_up=not args.no_split_var)
     if training_role == "PSERVER":
         pserver_program = t.get_pserver_program(current_endpoint)
         pserver_startup_program = t.get_startup_program(current_endpoint,
@@ -417,7 +321,7 @@ def main():
         fluid.memory_optimize(fluid.default_main_program())
 
     if args.update_method == "pserver":
-        train_prog, startup_prog = dist_transpile(trainer_id)
+        train_prog, startup_prog = dist_transpile(trainer_id, args)
        if not train_prog:
            raise Exception(
                "Must configure correct environments to run dist train.")

benchmark/fluid/models/stacked_dynamic_lstm.py

Lines changed: 2 additions & 1 deletion
@@ -104,8 +104,9 @@ def gate_common(
     loss = fluid.layers.mean(x=loss)
 
     # add acc
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
     batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
-        shape=[1], dtype='int64'))
+        shape=[1], dtype='int64'), total=batch_size_tensor)
 
     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):

benchmark/fluid/models/vgg.py

Lines changed: 2 additions & 1 deletion
@@ -82,7 +82,8 @@ def get_model(args):
             data_file, batch_size=args.batch_size))
         images, label = fluid.layers.read_file(data_file)
     else:
-        images = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+        images = fluid.layers.data(
+            name='data', shape=data_shape, dtype='float32')
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
     # Train program

cmake/configure.cmake

Lines changed: 4 additions & 0 deletions
@@ -166,3 +166,7 @@ if(WITH_GOLANG)
   endif()
 
 endif(WITH_GOLANG)
+
+if(WITH_GRPC)
+  add_definitions(-DPADDLE_WITH_GRPC)
+endif(WITH_GRPC)
