@@ -30,16 +30,13 @@ static constexpr char kParallelScopes[] = "parallel_scopes";
30
30
31
31
static constexpr char kParallelBlock [] = " sub_block" ;
32
32
33
- // using ParallelScopeVar = std::vector<framework::Scope *>;
34
33
using LoDTensor = framework::LoDTensor;
35
- using OperatorBase = framework::OperatorBase;
36
34
37
- void SplitTensorAndMoveTensorToScopes (
38
- const framework::Scope &scope,
39
- const std::vector<framework::Scope *> &sub_scopes,
35
+ static void SplitTensorAndMoveTensorToScopes (
36
+ const framework::Scope &scope, std::vector<framework::Scope *> *sub_scopes,
40
37
const std::vector<platform::Place> &places,
41
38
const std::vector<std::string> &names) {
42
- PADDLE_ENFORCE_EQ (sub_scopes. size (), places. size ()) ;
39
+ size_t num_sub_scopes = 0 ;
43
40
for (auto &argu : names) {
44
41
auto *var = scope.FindVar (argu);
45
42
const auto &tensor = var->Get <LoDTensor>();
@@ -48,9 +45,21 @@ void SplitTensorAndMoveTensorToScopes(
48
45
for (auto &lod : lod_tensors) {
49
46
VLOG (3 ) << lod.dims ();
50
47
}
48
+ if (num_sub_scopes == 0 ) {
49
+ num_sub_scopes = lod_tensors.size ();
50
+ } else {
51
+ PADDLE_ENFORCE_EQ (num_sub_scopes, lod_tensors.size ());
52
+ }
53
+ PADDLE_ENFORCE_NE (num_sub_scopes, 0 );
54
+ if (sub_scopes->size () == 0 ) {
55
+ sub_scopes->reserve (num_sub_scopes);
56
+ for (size_t i = 0 ; i < num_sub_scopes; ++i) {
57
+ sub_scopes->emplace_back (&scope.NewScope ());
58
+ }
59
+ }
51
60
52
- for (size_t i = 0 ; i < sub_scopes .size (); ++i) {
53
- *sub_scopes[i]->Var (argu)->GetMutable <LoDTensor>() = lod_tensors[i];
61
+ for (size_t i = 0 ; i < lod_tensors .size (); ++i) {
62
+ *(* sub_scopes) [i]->Var (argu)->GetMutable <LoDTensor>() = lod_tensors[i];
54
63
}
55
64
}
56
65
}
@@ -70,7 +79,7 @@ class ParallelDoOp : public framework::OperatorBase {
70
79
const framework::VariableNameMap &inputs,
71
80
const framework::VariableNameMap &outputs,
72
81
const framework::AttributeMap &attrs)
73
- : OperatorBase(type, inputs, outputs, attrs) {}
82
+ : framework:: OperatorBase(type, inputs, outputs, attrs) {}
74
83
75
84
void Run (const framework::Scope &scope,
76
85
const platform::Place &place) const override {
@@ -85,19 +94,17 @@ class ParallelDoOp : public framework::OperatorBase {
85
94
86
95
auto &sub_scopes = *scope.FindVar (Output (kParallelScopes ))
87
96
->GetMutable <std::vector<framework::Scope *>>();
88
- for (size_t place_idx = 0 ; place_idx < places.size (); ++place_idx) {
89
- sub_scopes.push_back (&scope.NewScope ());
90
- }
91
97
92
98
// split input
93
- SplitTensorAndMoveTensorToScopes (scope, sub_scopes, places,
99
+ SplitTensorAndMoveTensorToScopes (scope, & sub_scopes, places,
94
100
Inputs (kInputs ));
101
+
95
102
// copy parameter
96
103
for (auto &param : Inputs (kParameters )) {
97
104
PADDLE_ENFORCE (scope.FindVar (param)->IsType <LoDTensor>(),
98
105
" Only support parameter type as LoDTensor" );
99
106
auto &src = scope.FindVar (param)->Get <LoDTensor>();
100
- for (size_t i = 0 ; i < places .size (); ++i) {
107
+ for (size_t i = 0 ; i < sub_scopes .size (); ++i) {
101
108
auto &place = places[i];
102
109
auto *sub_scope = sub_scopes[i];
103
110
auto *dst = sub_scope->Var (param)->GetMutable <LoDTensor>();
@@ -108,9 +115,7 @@ class ParallelDoOp : public framework::OperatorBase {
108
115
109
116
std::vector<std::future<void >> workers;
110
117
workers.reserve (places.size ());
111
- for (size_t place_idx = 0 ; place_idx < places.size (); ++place_idx) {
112
- VLOG (3 ) << " Run " << place_idx;
113
-
118
+ for (size_t place_idx = 0 ; place_idx < sub_scopes.size (); ++place_idx) {
114
119
auto &place = places[place_idx];
115
120
auto *cur_scope = sub_scopes[place_idx];
116
121
@@ -157,21 +162,16 @@ ParallelDo Operator.
157
162
}
158
163
};
159
164
160
- class ParallelDoGradOp : public OperatorBase {
165
+ class ParallelDoGradOp : public framework :: OperatorBase {
161
166
public:
162
167
ParallelDoGradOp (const std::string &type,
163
168
const framework::VariableNameMap &inputs,
164
169
const framework::VariableNameMap &outputs,
165
170
const framework::AttributeMap &attrs)
166
- : OperatorBase(type, inputs, outputs, attrs) {}
171
+ : framework:: OperatorBase(type, inputs, outputs, attrs) {}
167
172
168
173
void Run (const framework::Scope &scope,
169
174
const platform::Place &place) const override {
170
- // // get device context from pool
171
- // platform::DeviceContextPool &pool =
172
- // platform::DeviceContextPool::Instance();
173
- // auto &dev_ctx = *pool.Get(place);
174
-
175
175
auto *block = Attr<framework::BlockDesc *>(kParallelBlock );
176
176
auto *program = block->Program ();
177
177
@@ -181,26 +181,16 @@ class ParallelDoGradOp : public OperatorBase {
181
181
auto &places = scope.FindVar (Input (kPlaces ))->Get <platform::PlaceList>();
182
182
183
183
// feed output@grad
184
- SplitTensorAndMoveTensorToScopes (scope, sub_scopes, places,
185
- Inputs (framework::GradVarName (kOutputs )));
184
+ SplitTensorAndMoveTensorToScopes (
185
+ scope, const_cast <std::vector<framework::Scope *> *>(&sub_scopes),
186
+ places, Inputs (framework::GradVarName (kOutputs )));
186
187
WaitOnPlaces (places);
187
188
188
- // for debugging
189
- for (auto &s : Inputs (framework::GradVarName (kOutputs ))) {
190
- VLOG (3 ) << s;
191
- VLOG (3 ) << scope.FindVar (s)->Get <LoDTensor>();
192
- for (auto *sub_scope : sub_scopes) {
193
- VLOG (3 ) << sub_scope->FindVar (s)->Get <LoDTensor>();
194
- }
195
- }
196
-
197
189
// exe run
198
190
std::vector<std::future<void >> workers;
199
- for (size_t place_idx = 0 ; place_idx < places.size (); ++place_idx) {
200
- VLOG (3 ) << " Run " << place_idx;
201
-
202
- auto &place = places[place_idx];
203
- auto *cur_scope = sub_scopes[place_idx];
191
+ for (size_t i = 0 ; i < sub_scopes.size (); ++i) {
192
+ auto &place = places[i];
193
+ auto *cur_scope = sub_scopes[i];
204
194
205
195
// execute
206
196
workers.emplace_back (framework::Async ([program, cur_scope, place, block] {
@@ -216,33 +206,38 @@ class ParallelDoGradOp : public OperatorBase {
216
206
217
207
// merge grad
218
208
for (auto &s : Outputs (framework::GradVarName (kParameters ))) {
219
- VLOG (3 ) << " merge grad " << s;
220
-
221
- auto &t = sub_scopes[0 ]->FindVar (s)->Get <LoDTensor>();
222
- VLOG (3 ) << t;
223
-
224
- std::string s_buf = s + " @BUF" ;
225
- auto *t_buf = sub_scopes[0 ]->Var (s_buf)->GetMutable <LoDTensor>();
226
-
227
- for (size_t place_idx = 1 ; place_idx < places.size (); ++place_idx) {
228
- auto &tt = sub_scopes[place_idx]->FindVar (s)->Get <LoDTensor>();
229
- VLOG (3 ) << place_idx;
230
- VLOG (3 ) << tt;
231
- framework::Copy (tt, places[0 ], t_buf);
209
+ auto &result = sub_scopes[0 ]->FindVar (s)->Get <LoDTensor>();
210
+ std::string tmp_name;
211
+ auto *tmp = sub_scopes[0 ]->Var (&tmp_name)->GetMutable <LoDTensor>();
212
+
213
+ for (size_t i = 1 ; i < sub_scopes.size (); ++i) {
214
+ auto &tensor_to_merge = sub_scopes[i]->FindVar (s)->Get <LoDTensor>();
215
+ if (!(places[i] == places[0 ])) {
216
+ framework::Copy (tensor_to_merge, places[0 ], tmp);
217
+ } else {
218
+ tmp->ShareDataWith (tensor_to_merge);
219
+ }
232
220
233
221
auto sum_op = framework::OpRegistry::CreateOp (
234
- " sum" , {{" X" , {s, s_buf }}}, {{" Out" , {s}}},
222
+ " sum" , {{" X" , {s, tmp_name }}}, {{" Out" , {s}}},
235
223
framework::AttributeMap{});
236
224
sum_op->Run (*sub_scopes[0 ], places[0 ]);
237
225
WaitOnPlaces (places);
238
226
}
239
227
240
- VLOG (3 ) << t ;
241
- framework::Copy (t , place, scope.FindVar (s)->GetMutable <LoDTensor>());
228
+ VLOG (3 ) << result ;
229
+ framework::Copy (result , place, scope.FindVar (s)->GetMutable <LoDTensor>());
242
230
}
243
231
}
244
232
};
245
233
234
+ std::ostream &operator <<(std::ostream &sout,  // Stream-insertion overload: writes every string in strs to sout.
235
+ const std::vector<std::string> &strs) {
236
+ std::copy (strs.begin (), strs.end (),
237
+ std::ostream_iterator<std::string>(sout, " ," ));  // the delimiter is emitted after each element, so the output ends with a trailing separator
238
+ return sout;  // returns the same stream so insertions can be chained
239
+ }
240
+
246
241
class ParallelDoGradOpDescMaker : public framework ::SingleGradOpDescMaker {
247
242
public:
248
243
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
@@ -283,18 +278,30 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
283
278
void operator ()(framework::InferShapeContext *ctx) const override {
284
279
std::vector<std::string> input{kParameters , kInputs };
285
280
std::vector<std::string> output{kOutputs };
286
- for (auto &s : input) {
287
- PADDLE_ENFORCE (ctx->HasInputs (s));
288
- PADDLE_ENFORCE (ctx->HasOutputs (framework::GradVarName (s)),
289
- " Cannot find the gradient variable %s" ,
290
- framework::GradVarName (s));
291
- }
281
+
282
+ PADDLE_ENFORCE (ctx->HasInputs (kParameters ));
283
+ PADDLE_ENFORCE (ctx->HasOutputs (framework::GradVarName (kParameters )));
284
+ PADDLE_ENFORCE (ctx->HasInput (kInputs ));
285
+
292
286
for (auto &s : output) {
293
287
PADDLE_ENFORCE (ctx->HasInputs (s));
294
288
}
295
- for (auto &s : input) {
296
- ctx->SetOutputsDim (framework::GradVarName (s), ctx->GetInputsDim (s));
289
+
290
+ ctx->SetOutputsDim (framework::GradVarName (kParameters ),
291
+ ctx->GetInputsDim (kParameters ));
292
+
293
+ auto i_dims = ctx->GetInputsDim (kInputs );
294
+ auto ig_names = ctx->Outputs (framework::GradVarName (kInputs ));
295
+
296
+ for (size_t i = 0 ; i < ig_names.size (); ++i) {
297
+ auto &ig_name = ig_names[i];
298
+ if (ig_name == framework::kEmptyVarName ) {
299
+ continue ;
300
+ }
301
+
302
+ ctx->SetDims ({ig_name}, {i_dims[i]});
297
303
}
304
+
298
305
if (ctx->HasInputs (kParameters )) {
299
306
PADDLE_ENFORCE (ctx->HasOutputs (framework::GradVarName (kParameters )));
300
307
ctx->SetOutputsDim (framework::GradVarName (kParameters ),
0 commit comments