@@ -236,49 +236,51 @@ TEST_F(NCCLTester, ncclReduceOp) {
236
236
}
237
237
238
238
// ncclBcastOp with desc
239
- // TODO(helin): enable the test for ncclBcastOp
240
- // TEST_F(NCCLTester, ncclBcastOp) {
241
- // std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
242
- // const int kRoot = 0;
243
- // op2->SetType("ncclBcast");
244
- // op2->SetInput("X", {"st"});
245
- // op2->SetInput("Communicator", {"comm"});
246
- // op2->SetOutput("Out", {"rt"});
247
- // op2->SetAttr("root", kRoot);
248
-
249
- // std::vector<f::Scope *> dev_scopes;
250
-
251
- // std::vector<std::thread> ths;
252
-
253
- // for (size_t i = 0; i < gpu_list_.size(); ++i) {
254
- // dev_scopes.emplace_back(&g_scope_.NewScope());
255
- // std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
256
- // *op2.get(), dev_scopes[i]);
257
- // ths.emplace_back(std::move(th));
258
- // }
259
-
260
- // for (size_t i = 0; i < gpu_list_.size(); ++i) {
261
- // ths[i].join();
262
- // }
263
-
264
- // const int idx = 1;
265
- // float result = GetGPUData(kRoot);
266
-
267
- // p::CPUPlace cpu_place;
268
- // p::CUDAPlace gpu_place(gpu_list_[idx]);
269
-
270
- // auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
271
- // auto *rt = recv_tensor.data<float>();
272
- // auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
273
- // result_tensor->Resize(kDims);
274
- // auto *ct = result_tensor->mutable_data<float>(cpu_place);
275
-
276
- // paddle::memory::Copy(
277
- // cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
278
- // recv_tensor.numel() * sizeof(float),
279
- // static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());
280
-
281
- // for (int64_t j = 0; j < f::product(kDims); ++j) {
282
- // ASSERT_NEAR(ct[j], result, 1e-5);
283
- // }
284
- // }
239
+ // TODO(helin): enable the test for ncclBcastOp, see https://github.com/PaddlePaddle/Paddle/issues/9540
240
+ /*
241
+ TEST_F(NCCLTester, ncclBcastOp) {
242
+ std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
243
+ const int kRoot = 0;
244
+ op2->SetType("ncclBcast");
245
+ op2->SetInput("X", {"st"});
246
+ op2->SetInput("Communicator", {"comm"});
247
+ op2->SetOutput("Out", {"rt"});
248
+ op2->SetAttr("root", kRoot);
249
+
250
+ std::vector<f::Scope *> dev_scopes;
251
+
252
+ std::vector<std::thread> ths;
253
+
254
+ for (size_t i = 0; i < gpu_list_.size(); ++i) {
255
+ dev_scopes.emplace_back(&g_scope_.NewScope());
256
+ std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
257
+ *op2.get(), dev_scopes[i]);
258
+ ths.emplace_back(std::move(th));
259
+ }
260
+
261
+ for (size_t i = 0; i < gpu_list_.size(); ++i) {
262
+ ths[i].join();
263
+ }
264
+
265
+ const int idx = 1;
266
+ float result = GetGPUData(kRoot);
267
+
268
+ p::CPUPlace cpu_place;
269
+ p::CUDAPlace gpu_place(gpu_list_[idx]);
270
+
271
+ auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
272
+ auto *rt = recv_tensor.data<float>();
273
+ auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
274
+ result_tensor->Resize(kDims);
275
+ auto *ct = result_tensor->mutable_data<float>(cpu_place);
276
+
277
+ paddle::memory::Copy(
278
+ cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
279
+ recv_tensor.numel() * sizeof(float),
280
+ static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());
281
+
282
+ for (int64_t j = 0; j < f::product(kDims); ++j) {
283
+ ASSERT_NEAR(ct[j], result, 1e-5);
284
+ }
285
+ }
286
+ */
0 commit comments