@@ -236,48 +236,49 @@ TEST_F(NCCLTester, ncclReduceOp) {
236
236
}
237
237
238
238
// ncclBcastOp with desc
239
- TEST_F (NCCLTester, ncclBcastOp) {
240
- std::unique_ptr<f::OpDesc> op2 (new f::OpDesc);
241
- const int kRoot = 0 ;
242
- op2->SetType("ncclBcast");
243
- op2->SetInput("X", {"st"});
244
- op2->SetInput("Communicator", {"comm"});
245
- op2->SetOutput("Out", {"rt"});
246
- op2->SetAttr("root", kRoot);
247
-
248
- std::vector<f::Scope *> dev_scopes;
249
-
250
- std::vector<std::thread> ths;
251
-
252
- for (size_t i = 0 ; i < gpu_list_.size (); ++i) {
253
- dev_scopes.emplace_back (&g_scope_.NewScope ());
254
- std::thread th (&NCCLTester::PerThreadProgram<float >, this , gpu_list_[i],
255
- *op2.get (), dev_scopes[i]);
256
- ths.emplace_back (std::move (th));
257
- }
258
-
259
- for (size_t i = 0 ; i < gpu_list_.size (); ++i) {
260
- ths[i].join ();
261
- }
262
-
263
- const int idx = 1 ;
264
- float result = GetGPUData (kRoot );
265
-
266
- p::CPUPlace cpu_place;
267
- p::CUDAPlace gpu_place (gpu_list_[idx]);
268
-
269
- auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
270
- auto *rt = recv_tensor.data <float >();
271
- auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
272
- result_tensor->Resize (kDims );
273
- auto *ct = result_tensor->mutable_data <float >(cpu_place);
274
-
275
- paddle::memory::Copy (
276
- cpu_place, ct, p::CUDAPlace (gpu_list_[idx]), rt,
277
- recv_tensor.numel () * sizeof (float ),
278
- static_cast <p::CUDADeviceContext *>(dev_ctxs_[idx])->stream ());
279
-
280
- for (int64_t j = 0 ; j < f::product (kDims ); ++j) {
281
- ASSERT_NEAR (ct[j], result, 1e-5 );
282
- }
283
- }
239
+ // TODO(helin): enable the test for ncclBcastOp
240
+ // TEST_F(NCCLTester, ncclBcastOp) {
241
+ // std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
242
+ // const int kRoot = 0;
243
+ // op2->SetType("ncclBcast");
244
+ // op2->SetInput("X", {"st"});
245
+ // op2->SetInput("Communicator", {"comm"});
246
+ // op2->SetOutput("Out", {"rt"});
247
+ // op2->SetAttr("root", kRoot);
248
+
249
+ // std::vector<f::Scope *> dev_scopes;
250
+
251
+ // std::vector<std::thread> ths;
252
+
253
+ // for (size_t i = 0; i < gpu_list_.size(); ++i) {
254
+ // dev_scopes.emplace_back(&g_scope_.NewScope());
255
+ // std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
256
+ // *op2.get(), dev_scopes[i]);
257
+ // ths.emplace_back(std::move(th));
258
+ // }
259
+
260
+ // for (size_t i = 0; i < gpu_list_.size(); ++i) {
261
+ // ths[i].join();
262
+ // }
263
+
264
+ // const int idx = 1;
265
+ // float result = GetGPUData(kRoot);
266
+
267
+ // p::CPUPlace cpu_place;
268
+ // p::CUDAPlace gpu_place(gpu_list_[idx]);
269
+
270
+ // auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
271
+ // auto *rt = recv_tensor.data<float>();
272
+ // auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
273
+ // result_tensor->Resize(kDims);
274
+ // auto *ct = result_tensor->mutable_data<float>(cpu_place);
275
+
276
+ // paddle::memory::Copy(
277
+ // cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
278
+ // recv_tensor.numel() * sizeof(float),
279
+ // static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());
280
+
281
+ // for (int64_t j = 0; j < f::product(kDims); ++j) {
282
+ // ASSERT_NEAR(ct[j], result, 1e-5);
283
+ // }
284
+ // }
0 commit comments