Commit cc4a766

merge develop
2 parents: d40402f + 3fbfcd9

71 files changed (+2286, -987 lines)


benchmark/fluid/args.py

Lines changed: 6 additions & 0 deletions
@@ -140,5 +140,11 @@ def parse_args():
         '--use_lars',
         action='store_true',
         help='If set, use lars for optimizers, ONLY support resnet module.')
+    parser.add_argument(
+        '--reduce_strategy',
+        type=str,
+        choices=['reduce', 'all_reduce'],
+        default='all_reduce',
+        help='Specify the reduce strategy, can be reduce, all_reduce')
     args = parser.parse_args()
     return args
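
Note (not part of the diff): the new option surfaces on the parsed namespace as args.reduce_strategy. A minimal sketch of how downstream code sees it, assuming the benchmark is launched with the flag:

    # Minimal sketch: argparse rejects anything outside the choices list, and
    # the 'all_reduce' default keeps existing runs unchanged.
    args = parse_args()          # e.g. invoked with --reduce_strategy reduce
    assert args.reduce_strategy in ('reduce', 'all_reduce')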

benchmark/fluid/fluid_benchmark.py

Lines changed: 9 additions & 0 deletions
@@ -170,6 +170,14 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
     strategy = fluid.ExecutionStrategy()
     strategy.num_threads = args.cpus
     strategy.allow_op_delay = False
+    build_strategy = fluid.BuildStrategy()
+    if args.reduce_strategy == "reduce":
+        build_strategy.reduce_strategy = fluid.BuildStrategy(
+        ).ReduceStrategy.Reduce
+    else:
+        build_strategy.reduce_strategy = fluid.BuildStrategy(
+        ).ReduceStrategy.AllReduce
+
     avg_loss = train_args[0]

     if args.update_method == "pserver":
@@ -184,6 +192,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
             avg_loss.name,
             main_program=train_prog,
             exec_strategy=strategy,
+            build_strategy=build_strategy,
             num_trainers=num_trainers,
             trainer_id=trainer_id)
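
Note (not part of the diff): the hunk above constructs a throwaway fluid.BuildStrategy() just to reach the ReduceStrategy enum. Assuming the enum is also exposed as a class attribute of fluid.BuildStrategy (typical for pybind-exported nested enums, but an assumption here), an equivalent mapping without the extra instance would be:

    import paddle.fluid as fluid

    reduce_strategy = "reduce"  # would normally come from args.reduce_strategy

    build_strategy = fluid.BuildStrategy()
    # Assumption: the ReduceStrategy enum is reachable on the class itself,
    # so no extra BuildStrategy() instance is needed to name the value.
    if reduce_strategy == "reduce":
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
    else:
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce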

benchmark/fluid/models/mnist.py

Lines changed: 7 additions & 4 deletions
@@ -67,11 +67,14 @@ def cnn_model(data):
 
 def get_model(args, is_train, main_prog, startup_prog):
     # NOTE: mnist is small, we don't implement data sharding yet.
-    filelist = [
-        os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-    ]
+    opt = None
+    data_file_handle = None
     with fluid.program_guard(main_prog, startup_prog):
         if args.use_reader_op:
+            filelist = [
+                os.path.join(args.data_path, f)
+                for f in os.listdir(args.data_path)
+            ]
             data_file_handle = fluid.layers.open_files(
                 filenames=filelist,
                 shapes=[[-1, 1, 28, 28], (-1, 1)],
@@ -100,7 +103,7 @@ def get_model(args, is_train, main_prog, startup_prog):
         if is_train:
             opt = fluid.optimizer.AdamOptimizer(
                 learning_rate=0.001, beta1=0.9, beta2=0.999)
-            opt.minimize()
+            opt.minimize(avg_cost)
         if args.memory_optimize:
             fluid.memory_optimize(main_prog)
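
Note (not part of the diff): the second hunk fixes a call that passed no loss variable; Optimizer.minimize needs the loss in order to append the backward pass and the optimizer ops. A minimal, self-contained sketch of the corrected pattern, with a placeholder network rather than the benchmark's CNN:

    import paddle.fluid as fluid

    # Placeholder network: one FC layer over a dummy 784-d input.
    img = fluid.layers.data(name='img', shape=[784], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    prediction = fluid.layers.fc(input=img, size=10, act='softmax')
    avg_cost = fluid.layers.mean(
        fluid.layers.cross_entropy(input=prediction, label=label))

    opt = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001, beta1=0.9, beta2=0.999)
    opt.minimize(avg_cost)  # the loss variable must be passed explicitly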

benchmark/fluid/models/resnet.py

Lines changed: 1 addition & 1 deletion
@@ -207,7 +207,7 @@ def get_model(args, is_train, main_prog, startup_prog):
 
     total_images = 1281167 / trainer_count
 
-    step = int(total_images / args.batch_size + 1)
+    step = int(total_images / (args.batch_size * args.gpus) + 1)
     epochs = [30, 60, 90]
     bd = [step * e for e in epochs]
     base_lr = args.learning_rate
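
Note (not part of the diff): a quick worked example of why the divisor matters; the batch size and GPU count below are hypothetical:

    # Hypothetical numbers: 1 trainer, batch_size=32 per GPU, gpus=8.
    total_images = 1281167 / 1
    old_step = int(total_images / 32 + 1)        # 40037 "steps" per epoch
    new_step = int(total_images / (32 * 8) + 1)  # 5005 steps per epoch
    # Each step consumes one batch per GPU, so the corrected value matches the
    # real number of steps per epoch and the learning-rate boundaries
    # bd = [step * e for e in epochs] fall at the intended epochs.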

cmake/tensorrt.cmake

Lines changed: 2 additions & 0 deletions
@@ -16,7 +16,9 @@ find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
     DOC "Path to TensorRT library.")
 
 if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
+  if(WITH_DSO)
   set(TENSORRT_FOUND ON)
+  endif(WITH_DSO)
 else()
   set(TENSORRT_FOUND OFF)
 endif()

paddle/fluid/API.spec

Lines changed: 4 additions & 4 deletions
@@ -59,7 +59,7 @@ paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], vara
 paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
 paddle.fluid.InferenceTranspiler.__init__
 paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
@@ -305,9 +305,9 @@ paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'neg
 paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0))
 paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None))
 paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
-paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'anchor_var', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
+paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True))
 paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
-paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'gt_boxes', 'im_scales', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None))
+paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
 paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
 paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
@@ -346,7 +346,7 @@ paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'con
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
 paddle.fluid.transpiler.InferenceTranspiler.__init__
 paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
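
Note (not part of the diff): the transpiler-facing change recorded above is the new wait_port flag (default True) on get_trainer_program. A minimal sketch of a trainer opting out of waiting for the pserver ports, using only the argument names shown in the ArgSpec lines; the endpoint and IDs are placeholders:

    import paddle.fluid as fluid

    t = fluid.DistributeTranspiler()
    t.transpile(
        trainer_id=0,               # placeholder trainer id
        pservers="127.0.0.1:6174",  # placeholder pserver endpoint
        trainers=1)                 # placeholder trainer count
    # wait_port defaults to True; pass False to skip probing pserver ports.
    trainer_prog = t.get_trainer_program(wait_port=False)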

paddle/fluid/framework/details/all_reduce_op_handle.cc

Lines changed: 2 additions & 1 deletion
@@ -46,7 +46,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 void AllReduceOpHandle::RunImpl() {
-  platform::RecordEvent r("all_reduce", nullptr);
+  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+
   if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
   } else {

paddle/fluid/framework/details/broadcast_op_handle.cc

Lines changed: 3 additions & 0 deletions
@@ -15,12 +15,15 @@
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
 void BroadcastOpHandle::RunImpl() {
+  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+
   if (places_.size() == 1) return;
 
   // The input and output may have dummy vars.

paddle/fluid/framework/details/multi_devices_graph_pass.cc

Lines changed: 36 additions & 10 deletions
@@ -348,14 +348,31 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
 
   size_t cur_device_id = 0;
   bool is_forwarding = true;
+  bool is_dist_train = false;
 
   for (ir::Node *node : sorted_ops) {
     if (boost::get<int>(
             node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
         static_cast<int>(OpRole::kRPC)) {
-      CreateRPCOp(&result, node);
+      int op_dev_id = CreateRPCOp(&result, node);
+      PADDLE_ENFORCE(op_dev_id != -1,
+                     "Can not schedule the RPC operator to the right place.");
+      if (node->Op()->Type() == "recv") {
+        auto recv_vars_attr =
+            boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
+                OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+        PADDLE_ENFORCE(recv_vars_attr.size() == 2UL);  // [parameter, gradient]
+        if (recv_vars_attr[0].find(".block") == std::string::npos) {
+          bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]);
+        }
+      }
+      is_dist_train = true;
     } else if (IsDistTrainOp(node, send_vars, recv_vars)) {
-      CreateDistTrainOp(&result, node);
+      int op_dev_id = CreateDistTrainOp(&result, node);
+      if (node->Op()->Type() == "concat") {
+        auto origin_param_name = node->Op()->OutputArgumentNames()[0];
+        bcast_var_name_set[op_dev_id].emplace(origin_param_name);
+      }
     } else if (IsScaleLossOp(node)) {
       // user can customize loss@grad if not use_default_grad_scale_
       if (strategy_.gradient_scale_ !=
@@ -414,7 +431,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
           CreateReduceOp(&result, g_name, cur_device_id);
           graph->Get<ShardedVarDevice>(kShardedVarDevice)
               .emplace(g_name, cur_device_id);
-          bcast_var_name_set[cur_device_id].emplace(p_name);
+          if (!is_dist_train) {
+            bcast_var_name_set[cur_device_id].emplace(p_name);
+          }
           break;
         case BuildStrategy::ReduceStrategy::kAllReduce:
           if (IsSparseGradient(g_name)) {
@@ -436,14 +455,19 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
       }
     }
   }
-
   bool use_gpu = false;
 #ifdef PADDLE_WITH_CUDA
   use_gpu = nccl_ctxs_ != nullptr;
 #endif
 
-  if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
-    // Insert BCast Ops
+  // Insert broadcast operators principle:
+  // 1. Broadcast optimized parameters in Reduce strategy;
+  // 2. No need broadcast optimized parameters in AllReduce strategy because of
+  //    the optimization sub-graph would be run on every GPU;
+  // 3. Allways broadcast received parameters in Distribute Training.
+  if ((use_gpu &&
+       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
+      is_dist_train) {
     for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
       auto &to_bcast_set = bcast_var_name_set[dev_id];
       for (auto &bcast_name : to_bcast_set) {
@@ -675,8 +699,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
   return var;
 }
 
-void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
-                                                ir::Node *node) const {
+int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
+                                               ir::Node *node) const {
   int op_dev_id = -1;
   std::vector<std::string> input_var_names;
   std::vector<std::string> output_var_names;
@@ -719,6 +743,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
                  node->Op()->Type());
 
   CreateComputationalOp(result, node, op_dev_id);
+  return op_dev_id;
 }
 
 void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
@@ -737,8 +762,8 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
 }
 
 // Create RPC related op handles that connects its in ops and out ops.
-void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
-                                          ir::Node *node) const {
+int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
+                                         ir::Node *node) const {
   int op_dev_id = -1;
   if (node->Op()->Type() == "send") {
     // TODO(paddle-dev): getting the first var is not safe.
@@ -824,6 +849,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
       CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
     }
   }
+  return op_dev_id;
 }
 
 bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {

paddle/fluid/framework/details/multi_devices_graph_pass.h

Lines changed: 2 additions & 2 deletions
@@ -54,8 +54,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 
   bool IsScaleLossOp(ir::Node *node) const;
 
-  void CreateRPCOp(ir::Graph *result, ir::Node *node) const;
-  void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
+  int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
+  int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
 
   /**
   * Is this operator as the end-point operator before/after send operator.
