Commit 928418a

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into gen_nccl_id_op
2 parents: 5ae0c66 + 9923be5

40 files changed: +588 -227 lines

doc/fluid/design/concepts/lod_tensor.md
Lines changed: 1 addition & 1 deletion

@@ -155,7 +155,7 @@ into offsets
 3 2+3 4+5 1+9 2+10 3+12
 ```
 
-so we know that the first sentence is from word 0 to word 3, and the second sentence from work 3 to word 5.
+so we know that the first sentence is from word 0 to word 3, and the second sentence from word 3 to word 5.
 
 Similarly, the lengths in the top level LoD
 

paddle/fluid/framework/details/multi_devices_graph_builder.cc
Lines changed: 64 additions & 9 deletions

@@ -37,20 +37,26 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
     const std::vector<Scope *> &local_scopes,
-    platform::NCCLContextMap *nccl_ctxs, bool use_default_grad_scale)
+    platform::NCCLContextMap *nccl_ctxs, bool use_default_grad_scale,
+    bool balance_parameter_opt_between_cards)
     : loss_var_name_(loss_var_name),
       places_(places),
       local_scopes_(local_scopes),
-      nccl_ctxs_(nccl_ctxs) {
+      nccl_ctxs_(nccl_ctxs),
+      balance_parameter_opt_between_cards_(
+          balance_parameter_opt_between_cards) {
 #else
 MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale)
+    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale,
+    bool balance_parameter_opt_between_cards)
     : loss_var_name_(loss_var_name),
       places_(places),
-      local_scopes_(local_scopes) {
+      local_scopes_(local_scopes),
+      balance_parameter_opt_between_cards_(
+          balance_parameter_opt_between_cards) {
 #endif
   for (auto &p : params) {
     grad_names_.insert(GradVarName(p));

@@ -124,6 +130,12 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
   // Find "send" op first for split is in front of send.
   OpDesc *send_op = GetSendOpDesc(program);
 
+  size_t cur_device_id = 0;
+  std::vector<std::unordered_set<std::string>> var_name_on_devices;
+  std::vector<std::unordered_set<std::string>> bcast_var_name_set;
+  var_name_on_devices.resize(places_.size());
+  bcast_var_name_set.resize(places_.size());
+
   bool is_forwarding = true;
   for (auto *op : program.Block(0).AllOps()) {
     if (op->Type() == "send") {

@@ -139,24 +151,47 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
       }
       is_forwarding = false;
     } else {
-      CreateComputationalOps(&result, *op, places_.size());
+      int op_dev_id = GetOpDeviceID(var_name_on_devices, *op);
+      if (op_dev_id == -1) {  // var on all device
+        CreateComputationalOps(&result, *op, places_.size());
+      } else {
+        CreateComputationalOp(&result, *op, op_dev_id);
+        for (auto &var_name : op->OutputArgumentNames()) {
+          var_name_on_devices[op_dev_id].emplace(var_name);
+        }
+      }
       if (!is_forwarding && places_.size() > 1) {
         // Currently, we assume that once gradient is generated, it can be
         // broadcast, and each gradient is only broadcast once.
         for (auto &og : op->OutputArgumentNames()) {
           if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
-            if (IsSparseGradient(var_types, og)) {
-              CreateReduceOp(&result, og, 0);
-              CreateBroadcastOp(&result, og, 0);
+            if (balance_parameter_opt_between_cards_) {
+              CreateReduceOp(&result, og, cur_device_id);
+              var_name_on_devices[cur_device_id].emplace(og);
+              bcast_var_name_set[cur_device_id].emplace(
+                  og.substr(0, og.size() - strlen(kGradVarSuffix)));
+              cur_device_id = (cur_device_id + 1) % places_.size();
             } else {
-              InsertNCCLAllReduceOp(&result, og);
+              if (IsSparseGradient(var_types, og)) {
+                CreateReduceOp(&result, og, 0);
+                CreateBroadcastOp(&result, og, 0);
+              } else {
+                InsertNCCLAllReduceOp(&result, og);
+              }
             }
           }
         }
       }
     }
   }
 
+  // Insert BCast Ops
+  for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
+    auto &to_bcast_set = bcast_var_name_set[dev_id];
+    for (auto &bcast_name : to_bcast_set) {
+      CreateBroadcastOp(&result, bcast_name, dev_id);
+    }
+  }
   /*
   Dependency graph has been constructed. However, there are still data
   harzaeds need to be handled.

@@ -265,6 +300,26 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
   return is_pg_once;
 }
 
+int MultiDevSSAGraphBuilder::GetOpDeviceID(
+    const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
+    const OpDesc &op) const {
+  if (!balance_parameter_opt_between_cards_) {
+    return -1;
+  }
+
+  int var_dev_id = -1;
+  for (auto &var_name : op.InputArgumentNames()) {
+    if (var_dev_id != -1) break;
+    for (size_t i = 0; i < var_name_on_devices.size(); ++i) {
+      if (var_name_on_devices[i].count(var_name)) {
+        var_dev_id = static_cast<int>(i);
+        break;
+      }
+    }
+  }
+  return var_dev_id;
+}
+
 void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
   for (size_t i = 0; i < places_.size(); ++i) {
     // Insert ScaleCost OpHandle
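The algorithmic core of this merge is the optional round-robin placement shown above. As a reading aid, here is a minimal standalone C++ sketch (not Paddle code; the helper get_device and the w*@GRAD names are made up for illustration) of the two ideas: parameter gradients are reduced on devices picked in round-robin order, and a later op is placed on the device that already owns one of its inputs, with -1 meaning "run on every device", as GetOpDeviceID returns.

```cpp
// Standalone sketch of round-robin reduce placement plus device lookup.
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

int main() {
  const size_t num_devices = 4;
  std::vector<std::unordered_set<std::string>> var_name_on_devices(num_devices);

  // Round-robin reduce placement for parameter gradients (names illustrative).
  const std::vector<std::string> grads = {"w1@GRAD", "w2@GRAD", "w3@GRAD",
                                          "w4@GRAD", "w5@GRAD"};
  size_t cur_device_id = 0;
  for (const auto &g : grads) {
    var_name_on_devices[cur_device_id].emplace(g);
    std::cout << g << " -> reduce on device " << cur_device_id << "\n";
    cur_device_id = (cur_device_id + 1) % num_devices;
  }

  // Device lookup: return the device that already owns one of the op's
  // inputs, or -1 if none is pinned (i.e. run the op on all devices).
  auto get_device = [&](const std::vector<std::string> &inputs) {
    for (const auto &name : inputs) {
      for (size_t i = 0; i < var_name_on_devices.size(); ++i) {
        if (var_name_on_devices[i].count(name)) return static_cast<int>(i);
      }
    }
    return -1;
  };

  std::cout << "op reading w5@GRAD -> device " << get_device({"w5@GRAD"}) << "\n";
  std::cout << "op with unpinned inputs -> device " << get_device({"x"}) << "\n";
  return 0;
}
```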

paddle/fluid/framework/details/multi_devices_graph_builder.h
Lines changed: 9 additions & 2 deletions

@@ -36,13 +36,15 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                           const std::unordered_set<std::string> &params,
                           const std::vector<Scope *> &local_scopes,
                           platform::NCCLContextMap *nccl_ctxs,
-                          bool use_default_grad_scale);
+                          bool use_default_grad_scale,
+                          bool balance_parameter_opt_between_cards);
 #else
   MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
                           const std::string &loss_var_name,
                           const std::unordered_set<std::string> &params,
                           const std::vector<Scope *> &local_scopes,
-                          bool use_default_grad_scale);
+                          bool use_default_grad_scale,
+                          bool balance_parameter_opt_between_cards);
 #endif
 
   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;

@@ -60,6 +62,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #ifdef PADDLE_WITH_CUDA
   platform::NCCLContextMap *nccl_ctxs_;
 #endif
+  bool balance_parameter_opt_between_cards_;
   bool use_default_grad_scale_;
 
   bool IsScaleLossOp(const OpDesc &op) const;

@@ -84,6 +87,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
       const std::string &og,
       std::unordered_set<std::string> *og_has_been_broadcast) const;
 
+  int GetOpDeviceID(
+      const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
+      const OpDesc &op) const;
+
   void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const;
 
   void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,

paddle/fluid/framework/parallel_executor.cc
Lines changed: 7 additions & 5 deletions

@@ -58,7 +58,8 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
     Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay,
-    bool use_default_grad_scale, size_t num_trainers, size_t trainer_id)
+    bool use_default_grad_scale, bool balance_parameter_opt_between_cards,
+    size_t num_trainers, size_t trainer_id)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
 
@@ -99,11 +100,12 @@ ParallelExecutor::ParallelExecutor(
 #ifdef PADDLE_WITH_CUDA
   details::MultiDevSSAGraphBuilder builder(
       member_->places_, loss_var_name, params, member_->local_scopes_,
-      member_->nccl_ctxs_.get(), use_default_grad_scale);
+      member_->nccl_ctxs_.get(), use_default_grad_scale,
+      balance_parameter_opt_between_cards);
 #else
-  details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
-                                           params, member_->local_scopes_,
-                                           use_default_grad_scale);
+  details::MultiDevSSAGraphBuilder builder(
+      member_->places_, loss_var_name, params, member_->local_scopes_,
+      use_default_grad_scale, balance_parameter_opt_between_cards);
 #endif
   auto graph = builder.Build(main_program);
 

paddle/fluid/framework/parallel_executor.h
Lines changed: 1 addition & 0 deletions

@@ -41,6 +41,7 @@ class ParallelExecutor {
                    const std::string& loss_var_name, Scope* scope,
                    const std::vector<Scope*>& local_scopes,
                    bool allow_op_delay, bool use_default_grad_scale,
+                   bool balance_parameter_opt_between_cards,
                    size_t num_trainers = 0, size_t trainer_id = 0);
 
   ~ParallelExecutor();

paddle/fluid/operators/CMakeLists.txt
Lines changed: 5 additions & 0 deletions

@@ -276,6 +276,11 @@ foreach(src ${READER_LIBRARY})
   set(OP_LIBRARY ${src} ${OP_LIBRARY})
 endforeach()
 
+add_subdirectory(detection)
+foreach(src ${DETECTION_LIBRARY})
+  set(OP_LIBRARY ${src} ${OP_LIBRARY})
+endforeach()
+
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
paddle/fluid/operators/detection/CMakeLists.txt (new file)
Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+set(LOCAL_DETECTION_LIBS)
+
+function(detection_library TARGET_NAME)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  set(options "")
+  set(common_deps op_registry)
+  set(pybind_flag 0)
+  cmake_parse_arguments(detection_library "${options}" "${oneValueArgs}"
+          "${multiValueArgs}" ${ARGN})
+  op_library(${TARGET_NAME} SRCS ${detection_library_SRCS} DEPS ${common_deps} ${detection_library_DEPS})
+  set(LOCAL_DETECTION_LIBS
+          ${TARGET_NAME}
+          ${LOCAL_DETECTION_LIBS}
+          PARENT_SCOPE)
+endfunction()
+
+detection_library(bipartite_match_op SRCS bipartite_match_op.cc)
+detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
+detection_library(iou_similarity_op SRCS iou_similarity_op.cc
+iou_similarity_op.cu)
+detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
+detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc)
+detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
+detection_library(target_assign_op SRCS target_assign_op.cc
+target_assign_op.cu)
+
+# Export local libraries to parent
+set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
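The detection_library helper above wraps op_library so that each detection operator is registered as usual and also collected into LOCAL_DETECTION_LIBS, which the final line exports as DETECTION_LIBRARY for the parent paddle/fluid/operators/CMakeLists.txt to fold into OP_LIBRARY. Adding another operator to this directory should then amount to a single call such as detection_library(my_new_op SRCS my_new_op.cc), where the target and source names here are hypothetical.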

paddle/fluid/operators/box_coder_op.cc renamed to paddle/fluid/operators/detection/box_coder_op.cc
Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/box_coder_op.h"
+#include "paddle/fluid/operators/detection/box_coder_op.h"
 
 namespace paddle {
 namespace operators {

paddle/fluid/operators/box_coder_op.cu renamed to paddle/fluid/operators/detection/box_coder_op.cu
Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/box_coder_op.h"
+#include "paddle/fluid/operators/detection/box_coder_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
