@@ -14,19 +14,15 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
-#include <algorithm>
 #include <memory>
 #include <mutex>
 #include <thread>
-#include <utility>
 #include <vector>
 
-#include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -41,26 +37,35 @@ USE_CUDA_ONLY_OP(ncclBcast);
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 
-static std::vector<int> gpu_list;
-
 // test data amount
-const f::DDim kDims = {100, 100};
+const f::DDim kDims = {20, 20};
 
 // nccl op common tester, init communicator.
 class NCCLTester : public ::testing::Test {
  public:
   virtual void SetUp() override {
+    int count = p::GetCUDADeviceCount();
+    if (count <= 1) {
+      LOG(WARNING)
+          << "Cannot test multi-gpu nccl, because the CUDA device count is "
+          << count;
+      exit(0);
+    }
+    for (int i = 0; i < count; ++i) {
+      gpu_list_.emplace_back(i);
+    }
+
     paddle::platform::CPUPlace cpu_place;
-    for (size_t i = 0; i < gpu_list.size(); ++i) {
+    for (size_t i = 0; i < gpu_list_.size(); ++i) {
       p::CUDAPlace place(i);
-      dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
+      dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
     }
 
     NCCLInitOp();
   }
 
   virtual void TearDown() override {
-    for (auto &device_context : dev_ctxs) {
+    for (auto &device_context : dev_ctxs_) {
       delete device_context;
     }
   }
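The hunk above moves the multi-GPU guard out of the old custom main() (deleted in the final hunk) and into the fixture, so every TEST_F self-skips on single-GPU machines. Below is a minimal self-contained sketch of that pattern, not code from this PR; DeviceCount() is a hypothetical stub standing in for p::GetCUDADeviceCount():

// Sketch: the skip-on-single-GPU pattern used by NCCLTester::SetUp,
// reduced to a self-contained gtest fixture.
#include <cstdlib>
#include <vector>
#include <gtest/gtest.h>

class MultiGPUTest : public ::testing::Test {
 protected:
  static int DeviceCount() { return 2; }  // hypothetical stub

  void SetUp() override {
    int count = DeviceCount();
    if (count <= 1) {
      // exit(0) reports success, so CI on single-GPU machines passes
      // instead of failing every multi-GPU test.
      std::exit(0);
    }
    for (int i = 0; i < count; ++i) gpu_ids_.push_back(i);
  }

  std::vector<int> gpu_ids_;
};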
@@ -70,36 +75,40 @@ class NCCLTester : public ::testing::Test {
     std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
 
     op1->SetType("ncclInit");
+    op1->SetInput("parallel_scopes", {"p_scopes"});
     op1->SetOutput("Communicator", {"comm"});
-    op1->SetAttr("gpus", {gpu_list});
 
-    auto *var = g_scope.Var("comm");
+    auto *var = g_scope_.Var("comm");
     var->GetMutable<p::Communicator>();
 
+    auto *scope_var = g_scope_.Var("p_scopes");
+    auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
+    (*p_scopes).resize(gpu_list_.size());
+
     auto op = f::OpRegistry::CreateOp(*op1);
     VLOG(1) << "invoke NCCLInitOp.";
-    op->Run(g_scope, cpu_place);
+    op->Run(g_scope_, cpu_place);
     VLOG(1) << "NCCLInitOp finished.";
   }
 
+  int GetGPUData(int gpu_id) { return gpu_id + 42; }
+
   template <class T>
   void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
-    std::unique_lock<std::mutex> lk(mu);
+    std::unique_lock<std::mutex> lk(mu_);
     const f::OpDesc *op1 = &op_desc;
 
     p::CUDAPlace place(gpu_id);
-    auto &ctx = dev_ctxs.at(gpu_id);
+    auto &ctx = dev_ctxs_.at(gpu_id);
 
     auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
     auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();
 
     if (!send_tensor->numel()) {
-      send_tensor->Resize(kDims);
       send_tensor->mutable_data<T>(kDims, place);
 
-      std::vector<T> send_vector(f::product(kDims), gpu_id);
+      std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
       paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
-      ctx->Wait();
       VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
     }
 
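GetGPUData seeds each GPU's send buffer with gpu_id + 42 instead of the bare gpu_id, so every rank contributes a distinct nonzero value; with the old seed, GPU 0 sent all zeros, and a reduction that silently dropped it could still pass. A sketch of the resulting expectation, mirroring (not copied from) the tests below:

// Sketch: expected sum-reduce value for num_gpus devices when each
// device id sends GetGPUData(id) == id + 42. With 2 GPUs: 42 + 43 == 85;
// under the old seed (id itself) it would be 0 + 1 == 1, with GPU 0
// contributing nothing.
int ExpectedAllReduceSum(int num_gpus) {
  int sum = 0;
  for (int id = 0; id < num_gpus; ++id) {
    sum += id + 42;  // mirrors NCCLTester::GetGPUData
  }
  return sum;
}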
@@ -118,30 +127,14 @@ class NCCLTester : public ::testing::Test {
   }
 
  public:
-  std::vector<p::DeviceContext *> dev_ctxs;
-  f::Scope g_scope;
-  std::mutex mu;
+  std::vector<p::DeviceContext *> dev_ctxs_;
+  f::Scope g_scope_;
+  std::mutex mu_;
+  std::vector<int> gpu_list_;
 };
 
 // ncclInitOp with desc
-TEST(NCCL, ncclInitOp) {
-  std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);
-
-  op_desc->SetType("ncclInit");
-  op_desc->SetOutput("Communicator", {"x1"});
-  op_desc->SetAttr("gpus", {gpu_list});
-
-  f::Scope g_scope;
-  paddle::platform::CPUPlace cpu_place;
-
-  auto *var = g_scope.Var("x1");
-  var->GetMutable<p::Communicator>();
-
-  auto op = f::OpRegistry::CreateOp(*op_desc);
-  VLOG(1) << "invoke NCCLInitOp.";
-  op->Run(g_scope, cpu_place);
-  VLOG(1) << "NCCLInitOp finished.";
-}
+TEST_F(NCCLTester, ncclInitOp) {}
 
 // ncclAllReduceOp with desc
 TEST_F(NCCLTester, ncclAllReduceOp) {
@@ -155,23 +148,25 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
 
   std::vector<std::thread> ths;
 
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope.NewScope());
-    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope_.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                    *op2.get(), dev_scopes[i]);
     ths.emplace_back(std::move(th));
   }
 
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
    ths[i].join();
   }
 
-  // check results
-  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
+  float expected_result = 0.0;
+  for (int gpu_id : gpu_list_) {
+    expected_result = expected_result + GetGPUData(gpu_id);
+  }
 
   for (size_t i = 0; i < dev_scopes.size(); ++i) {
     p::CPUPlace cpu_place;
-    p::CUDAPlace gpu_place(gpu_list[i]);
+    p::CUDAPlace gpu_place(gpu_list_[i]);
 
     auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
     auto *rt = recv_tensor.data<float>();
@@ -180,12 +175,12 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
     auto *ct = result_tensor->mutable_data<float>(cpu_place);
 
     paddle::memory::Copy(
-        cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt,
+        cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt,
         recv_tensor.numel() * sizeof(float),
-        static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
+        static_cast<p::CUDADeviceContext *>(dev_ctxs_[i])->stream());
 
     for (int64_t j = 0; j < f::product(kDims); ++j) {
-      ASSERT_NEAR(ct[j], result, 1e-5);
+      ASSERT_NEAR(ct[j], expected_result, 1e-5);
     }
   }
 }
@@ -204,22 +199,24 @@ TEST_F(NCCLTester, ncclReduceOp) {
 
   std::vector<std::thread> ths;
 
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope.NewScope());
-    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope_.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                    *op2.get(), dev_scopes[i]);
     ths.emplace_back(std::move(th));
   }
 
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
     ths[i].join();
   }
 
-  // check results on
-  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
+  float expected_result = 0.0;
+  for (int gpu_id : gpu_list_) {
+    expected_result = expected_result + GetGPUData(gpu_id);
+  }
 
   p::CPUPlace cpu_place;
-  p::CUDAPlace gpu_place(gpu_list[kRoot]);
+  p::CUDAPlace gpu_place(gpu_list_[kRoot]);
 
   auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
   auto *rt = recv_tensor.data<float>();
@@ -229,12 +226,12 @@ TEST_F(NCCLTester, ncclReduceOp) {
   auto *ct = result_tensor->mutable_data<float>(cpu_place);
 
   paddle::memory::Copy(
-      cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt,
+      cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
       recv_tensor.numel() * sizeof(float),
-      static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
+      static_cast<p::CUDADeviceContext *>(dev_ctxs_[kRoot])->stream());
 
   for (int64_t j = 0; j < f::product(kDims); ++j) {
-    ASSERT_NEAR(ct[j], result, 1e-5);
+    ASSERT_NEAR(ct[j], expected_result, 1e-5);
   }
 }
 
@@ -252,23 +249,22 @@ TEST_F(NCCLTester, ncclBcastOp) {
 
   std::vector<std::thread> ths;
 
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
-    dev_scopes.emplace_back(&g_scope.NewScope());
-    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope_.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
                    *op2.get(), dev_scopes[i]);
     ths.emplace_back(std::move(th));
   }
 
-  for (size_t i = 0; i < gpu_list.size(); ++i) {
+  for (size_t i = 0; i < gpu_list_.size(); ++i) {
     ths[i].join();
   }
 
   const int idx = 1;
-  // check results on
-  float result = kRoot;
+  float result = GetGPUData(kRoot);
 
   p::CPUPlace cpu_place;
-  p::CUDAPlace gpu_place(gpu_list[idx]);
+  p::CUDAPlace gpu_place(gpu_list_[idx]);
 
   auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
   auto *rt = recv_tensor.data<float>();
@@ -277,42 +273,11 @@ TEST_F(NCCLTester, ncclBcastOp) {
   auto *ct = result_tensor->mutable_data<float>(cpu_place);
 
   paddle::memory::Copy(
-      cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt,
+      cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
       recv_tensor.numel() * sizeof(float),
-      static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
+      static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());
 
   for (int64_t j = 0; j < f::product(kDims); ++j) {
     ASSERT_NEAR(ct[j], result, 1e-5);
   }
 }
-
-int main(int argc, char **argv) {
-  // FIXME(tonyyang-svail):
-  // Due to the driver issue on our CI, disable for now
-  return 0;
-  const int dev_count = p::GetCUDADeviceCount();
-  if (dev_count <= 1) {
-    LOG(WARNING)
-        << "Cannot test multi-gpu nccl, because the CUDA device count is "
-        << dev_count;
-    return 0;
-  }
-
-  std::vector<paddle::platform::Place> places;
-
-  places.emplace_back(paddle::platform::CPUPlace());
-  int count = paddle::platform::GetCUDADeviceCount();
-  for (int i = 0; i < count; ++i) {
-    places.emplace_back(paddle::platform::CUDAPlace(i));
-    gpu_list.emplace_back(i);
-  }
-
-  VLOG(0) << "DeviceCount " << count;
-  paddle::platform::DeviceContextPool::Init(places);
-
-  testing::InitGoogleTest(&argc, argv);
-
-  // device context should be release before scope.
-  // otherwise driver will down.
-  return RUN_ALL_TESTS();
-}
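With the FIXME'd custom main() deleted, the binary presumably falls back to a stock gtest entry point (for example, by linking gtest_main); the single-GPU early-exit it used to perform now lives in NCCLTester::SetUp. A minimal equivalent, offered as an assumption rather than code from this PR:

// Assumed replacement entry point (e.g., what linking gtest_main
// provides); device-context setup is presumed to happen in the fixture
// or in framework initialization, not here.
int main(int argc, char **argv) {
  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}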