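Fix P2P communication memory release logic: call memory::RecordStream on the tensors after the NCCL group calls are issued instead of before (#47497) (#47517)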

FeixLiu · web-flow · commit 0201ccc41255 · 2022-11-01T11:13:39.000+08:00
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -448,7 +448,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
 
   platform::CUDADeviceGuard cuda_guard;
 
-  if (FLAGS_use_stream_safe_cuda_allocator) {
+  {
+    platform::NCCLGroupGuard nccl_guard;
     for (size_t i = 0; i < tensors.size(); ++i) {
       cuda_guard.SetDevice(places[i]);
       gpuStream_t nccl_stream;
@@ -460,12 +461,11 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
       } else {
         nccl_stream = places_to_ctx_[key][i]->stream();
       }
-      memory::RecordStream(tensors[i].Holder(), nccl_stream);
+      fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
     }
   }
 
-  {
-    platform::NCCLGroupGuard nccl_guard;
+  if (FLAGS_use_stream_safe_cuda_allocator) {
     for (size_t i = 0; i < tensors.size(); ++i) {
       cuda_guard.SetDevice(places[i]);
       gpuStream_t nccl_stream;
@@ -477,7 +477,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
       } else {
         nccl_stream = places_to_ctx_[key][i]->stream();
       }
-      fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
+      memory::RecordStream(tensors[i].Holder(), nccl_stream);
     }
   }
 
@@ -516,20 +516,20 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
   // construct uninitialize guard for device
   platform::CUDADeviceGuard cuda_guard;
 
-  if (FLAGS_use_stream_safe_cuda_allocator) {
+  {
+    platform::NCCLGroupGuard nccl_guard;
     for (size_t i = 0; i < tensors.size(); ++i) {
       cuda_guard.SetDevice(places[i]);
-      memory::RecordStream(tensors[i].Holder(),
-                           places_to_ctx_[key][i]->stream());
+      const auto& nccl_stream = places_to_ctx_[key][i]->stream();
+      fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
     }
   }
 
-  {
-    platform::NCCLGroupGuard nccl_guard;
+  if (FLAGS_use_stream_safe_cuda_allocator) {
     for (size_t i = 0; i < tensors.size(); ++i) {
       cuda_guard.SetDevice(places[i]);
-      const auto& nccl_stream = places_to_ctx_[key][i]->stream();
-      fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
+      memory::RecordStream(tensors[i].Holder(),
+                           places_to_ctx_[key][i]->stream());
     }
   }
 

Original file line number	Diff line number	Diff line change
`@@ -448,7 +448,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(`
`448`	`448`
`449`	`449`	`platform::CUDADeviceGuard cuda_guard;`
`450`	`450`
`451`		`- if (FLAGS_use_stream_safe_cuda_allocator) {`
	`451`	`+ {`
	`452`	`+ platform::NCCLGroupGuard nccl_guard;`
`452`	`453`	`for (size_t i = 0; i < tensors.size(); ++i) {`
`453`	`454`	`cuda_guard.SetDevice(places[i]);`
`454`	`455`	`gpuStream_t nccl_stream;`
`@@ -460,12 +461,11 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(`
`460`	`461`	`} else {`
`461`	`462`	`nccl_stream = places_to_ctx_[key][i]->stream();`
`462`	`463`	`}`
`463`		`- memory::RecordStream(tensors[i].Holder(), nccl_stream);`
	`464`	`+ fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);`
`464`	`465`	`}`
`465`	`466`	`}`
`466`	`467`
`467`		`- {`
`468`		`- platform::NCCLGroupGuard nccl_guard;`
	`468`	`+ if (FLAGS_use_stream_safe_cuda_allocator) {`
`469`	`469`	`for (size_t i = 0; i < tensors.size(); ++i) {`
`470`	`470`	`cuda_guard.SetDevice(places[i]);`
`471`	`471`	`gpuStream_t nccl_stream;`
`@@ -477,7 +477,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(`
`477`	`477`	`} else {`
`478`	`478`	`nccl_stream = places_to_ctx_[key][i]->stream();`
`479`	`479`	`}`
`480`		`- fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);`
	`480`	`+ memory::RecordStream(tensors[i].Holder(), nccl_stream);`
`481`	`481`	`}`
`482`	`482`	`}`
`483`	`483`
`@@ -516,20 +516,20 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(`
`516`	`516`	`// construct uninitialize guard for device`
`517`	`517`	`platform::CUDADeviceGuard cuda_guard;`
`518`	`518`
`519`		`- if (FLAGS_use_stream_safe_cuda_allocator) {`
	`519`	`+ {`
	`520`	`+ platform::NCCLGroupGuard nccl_guard;`
`520`	`521`	`for (size_t i = 0; i < tensors.size(); ++i) {`
`521`	`522`	`cuda_guard.SetDevice(places[i]);`
`522`		`- memory::RecordStream(tensors[i].Holder(),`
`523`		`- places_to_ctx_[key][i]->stream());`
	`523`	`+ const auto& nccl_stream = places_to_ctx_[key][i]->stream();`
	`524`	`+ fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);`
`524`	`525`	`}`
`525`	`526`	`}`
`526`	`527`
`527`		`- {`
`528`		`- platform::NCCLGroupGuard nccl_guard;`
	`528`	`+ if (FLAGS_use_stream_safe_cuda_allocator) {`
`529`	`529`	`for (size_t i = 0; i < tensors.size(); ++i) {`
`530`	`530`	`cuda_guard.SetDevice(places[i]);`
`531`		`- const auto& nccl_stream = places_to_ctx_[key][i]->stream();`
`532`		`- fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);`
	`531`	`+ memory::RecordStream(tensors[i].Holder(),`
	`532`	`+ places_to_ctx_[key][i]->stream());`
`533`	`533`	`}`
`534`	`534`	`}`
`535`	`535`