Commit 74878ac

kwen2501 authored and pytorchmergebot committed
[PGNCCL] Make sure we do not use split for P2P comm creation (pytorch#139013)
Resolves comment pytorch#138527 (comment).

There was a split-vs-P2P bug: when P2P comm creation invokes `getNCCLComm`, it may see a `split_from` option that was meant for the previous PG creation. The P2P comm creation may then use `ncclCommSplit` and hang, because not all ranks join that call.

The bug has gone unnoticed until now because there is no CI test covering the following recipe: eager init + new group + P2P in that new group.

Pull Request resolved: pytorch#139013
Approved by: https://github.com/shuqiangzhang
1 parent fb2c750 commit 74878ac
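For context, the failing recipe named in the commit message (eager init + new group + P2P in that new group) looks roughly like the sketch below. This is a minimal illustration, assuming a 2-rank job launched with torchrun and one CUDA device per rank; it is not part of the commit, and the regression test added below exercises the same pattern through the test harness.

# Hedged sketch of the recipe that triggered the hang. Assumes a 2-rank job
# launched with torchrun (RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT set) and
# one CUDA device per rank.
import os
import torch
import torch.distributed as dist

rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
device = torch.device(f"cuda:{rank}")

# Eager init: passing device_id creates the default-PG NCCL communicator up
# front, which can leave a `split_from` option set for later group creations.
dist.init_process_group("nccl", rank=rank, world_size=world_size, device_id=device)

# New group over all ranks.
group = dist.new_group()

# P2P in that new group: before this fix, creating the communicator for the
# send/recv pair could pick up the stale `split_from` option and call
# ncclCommSplit, which is collective and can hang because not all ranks join it.
t = torch.ones(4, device=device)
if rank == 0:
    dist.send(t, dst=1, group=group)
elif rank == 1:
    dist.recv(t, src=0, group=group)

dist.destroy_process_group()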

2 files changed: +28 −1 lines

test/distributed/test_c10d_nccl.py

Lines changed: 22 additions & 0 deletions
@@ -982,6 +982,28 @@ def test_non_blocking_p2p(self):
             self.assertEqual(send_tensor, recv_tensor)
         dist.destroy_process_group()
 
+    @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
+    @parametrize("eager_init", [True, False])
+    def test_subgroup_p2p(self, eager_init: bool):
+        store = c10d.FileStore(self.file_name, self.world_size)
+        device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}")
+        c10d.init_process_group(
+            "nccl",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+            device_id=device if eager_init else None,
+        )
+        send_tensor = torch.ones(10, 10, device=device)
+        group = dist.new_group()
+        if self.rank == 0:
+            dist.send(send_tensor, 1, group=group)
+        if self.rank == 1:
+            recv_tensor = torch.rand(10, 10, device=device)
+            dist.recv(recv_tensor, 0, group=group)
+            self.assertEqual(send_tensor, recv_tensor)
+        dist.destroy_process_group()
+
     @requires_nccl()
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
     def test_get_uid(self):
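After applying the commit, the new test can be run on a machine with at least two GPUs, for example with `python test/distributed/test_c10d_nccl.py -k test_subgroup_p2p` (the exact invocation depends on the local test setup); both the eager_init=True and eager_init=False parametrizations are expected to pass with the fix in place.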

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

Lines changed: 6 additions & 1 deletion
@@ -2401,7 +2401,12 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::getNCCLComm(
 #endif
 
 #ifdef NCCL_HAS_COMM_SPLIT
-  if (options_->split_from) {
+  // Use split to create a new communicator only if:
+  // 1. The parent comm is known; AND
+  // 2. The new comm is not for a point-to-point operation.
+  // ncclCommSplit() is a collective call, so it does not work for P2P
+  // operations.
+  if (options_->split_from && !singleP2POp) {
     // Find a valid, healthy communicator to split from if possible.
     std::lock_guard<std::mutex> lock(options_->split_from->mutex_);
     auto& other_comms = options_->split_from->devNCCLCommMap_;
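With the added guard, a P2P communicator request no longer enters the split branch even when `split_from` is set; it falls through to the regular (non-split) communicator creation further down in `getNCCLComm`, which only the ranks involved in the P2P pair need to perform.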
