
Commit 71eec6a

ngimel authored and pytorchmergebot committed
[dist] handle discontiguous allgather/reducescatter inputs (pytorch#163712)
Fixes pytorch#163483

Pull Request resolved: pytorch#163712
Approved by: https://github.com/ezyang, https://github.com/kwen2501
1 parent 0456b23 commit 71eec6a
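
For context: a "discontiguous" input here is a dense tensor whose strides do not match the default contiguous layout, most commonly a channels_last tensor like the one used in the new test. A minimal illustration (plain PyTorch, no process group needed):

    import torch

    # A channels_last tensor is memory-dense but reports is_contiguous() == False,
    # because its strides follow NHWC order rather than the default NCHW order.
    t = torch.arange(0, 16).view(2, 2, 2, 2).to(memory_format=torch.channels_last)
    print(t.is_contiguous())                                   # False
    print(t.is_contiguous(memory_format=torch.channels_last))  # True
    print(t.stride())                                          # (8, 1, 4, 2)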

3 files changed: +47 −8 lines changed

test/distributed/test_c10d_nccl.py

Lines changed: 21 additions & 0 deletions
@@ -3821,6 +3821,27 @@ def test_allgather_base(self):
         dist.all_gather_into_tensor(output_tensor, tensor)
         self.assertEqual(output_tensor, tensor)
 
+    @requires_nccl()
+    @skip_if_lt_x_gpu(2)
+    def test_allgather_noncontig(self):
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            "nccl",
+            world_size=self.world_size,
+            rank=self.rank,
+            store=store,
+        )
+        device = "cuda"
+        tensor = (
+            torch.arange(0, 16, device=torch.device(device))
+            .view(2, 2, 2, 2)
+            .to(memory_format=torch.channels_last)
+        )
+        tensor_list = [torch.empty_like(tensor) for _ in range(self.world_size)]
+        dist.all_gather(tensor_list, tensor)
+        for o in tensor_list:
+            self.assertEqual(o, tensor)
+
     @requires_nccl()
     @skip_if_lt_x_gpu(1)
     @parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])

torch/csrc/distributed/c10d/ProcessGroupGloo.cpp

Lines changed: 9 additions & 6 deletions
@@ -1381,7 +1381,8 @@ class AsyncAllgatherWork : public ProcessGroupGloo::AsyncWork {
     // Use single flat output tensor.
     // The first dimension corresponds to the index into outputs[N],
     // so copying into the actual output later is easy.
-    at::Tensor flatOutputTensor = newLikeFlat(outputs[0]);
+    at::Tensor flatOutputTensor =
+        newLikeFlat(outputs[0], /*preserve_strides*/ false);
     GENERATE_ALL_TYPES(scalarType, setOutput, opts, flatOutputTensor);
     gloo::allgather(opts);

@@ -1398,7 +1399,7 @@ class AsyncAllgatherWork : public ProcessGroupGloo::AsyncWork {
   }
 
   const std::vector<at::Tensor> getOutputTensors() override {
-    return {newLikeFlat(outputs[0])};
+    return {newLikeFlat(outputs[0], /*preserve_strides*/ false)};
   }
 
   void run() override {

@@ -1694,7 +1695,7 @@ class AsyncAllgatherCoalescedWork : public ProcessGroupGloo::AsyncWork {
   }
 
   const std::vector<at::Tensor> getOutputTensors() override {
-    return {newLikeFlat(output_lists[0])};
+    return {newLikeFlat(output_lists[0], /*preserve_strides*/ false)};
   }
 
   void run() override {

@@ -1818,7 +1819,7 @@ class AsyncGatherWork : public ProcessGroupGloo::AsyncWork {
     // This is later scattered to the separate output tensors.
     at::Tensor flatOutputTensor;
     if (context_->rank == root) {
-      flatOutputTensor = newLikeFlat(outputs[0]);
+      flatOutputTensor = newLikeFlat(outputs[0], /*preserve_strides*/ false);
       GENERATE_ALL_TYPES(scalarType, setOutput, opts, flatOutputTensor);
     }

@@ -1841,7 +1842,8 @@ class AsyncGatherWork : public ProcessGroupGloo::AsyncWork {
 
   const std::vector<at::Tensor> getOutputTensors() override {
     return outputs.empty() ? std::vector<at::Tensor>{}
-                           : std::vector<at::Tensor>{newLikeFlat(outputs[0])};
+                           : std::vector<at::Tensor>{newLikeFlat(
+                                 outputs[0], /*preserve_strides*/ false)};
   }
 
   void run() override {

@@ -2057,7 +2059,8 @@ class AsyncScatterWork : public ProcessGroupGloo::AsyncWork {
 
   const std::vector<at::Tensor> getInputTensors() override {
     return inputs.empty() ? std::vector<at::Tensor>{}
-                          : std::vector<at::Tensor>{newLikeFlat(inputs[0])};
+                          : std::vector<at::Tensor>{newLikeFlat(
+                                inputs[0], /*preserve_strides*/ false)};
   }
 
  const std::vector<at::Tensor> getOutputTensors() override {
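
These Gloo work classes stage the collective into a single flat tensor and later copy each slice back into the user-provided outputs, so the staging buffer itself does not need to mirror the outputs' strides; passing /*preserve_strides*/ false keeps it plainly contiguous. A rough Python sketch of that staging pattern (gather_via_flat_buffer and its arguments are illustrative, not the Gloo API):

    import torch

    def gather_via_flat_buffer(outputs, contributions):
        # Stage into one contiguous buffer with a leading "rank" dimension,
        # then copy each slice back into the (possibly channels_last) outputs;
        # copy_ handles the stride mismatch between source and destination.
        flat = torch.empty(
            (len(contributions),) + tuple(contributions[0].shape),
            dtype=contributions[0].dtype,
        )
        for i, c in enumerate(contributions):
            flat[i].copy_(c)  # in the real path this is filled by communication
        for o, slice_ in zip(outputs, flat):
            o.copy_(slice_)
        return outputs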

torch/csrc/distributed/c10d/Utils.hpp

Lines changed: 17 additions & 2 deletions
@@ -444,15 +444,30 @@ inline at::Tensor newLikeFlat(
       sizes, strides, t.options().memory_format(std::nullopt));
 }
 
-inline at::Tensor newLikeFlat(std::vector<at::Tensor>& tensors) {
+inline at::Tensor newLikeFlat(
+    std::vector<at::Tensor>& tensors,
+    bool preserve_strides = true) {
   if (tensors.empty()) {
     TORCH_CHECK(false, "Received an empty list");
   }
   auto& t = tensors[0];
   at::DeviceGuard gpuGuard(t.device());
   std::vector<int64_t> sizes{static_cast<int64_t>(tensors.size())};
   sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end());
-  return at::empty(sizes, t.options());
+  if (t.is_contiguous() ||
+      !preserve_strides) { // we are checking for memory format, so tensor might
+                           // not be contiguous
+    // TODO handle all non-overlapping-and-dense, although if the strides
+    // disagree in ranks we are opening a door for more bugs than currently
+    // where channels-last might disagree between ranks
+    // fast path, don't call empty_strided
+    return at::empty(sizes, t.options());
+  } else {
+    // memory-dense, but not necessarily contiguous tensor
+    std::vector<int64_t> strides{t.numel()};
+    strides.insert(strides.end(), t.strides().begin(), t.strides().end());
+    return at::empty_strided(sizes, strides, t.options());
+  }
 }
 
 inline std::vector<std::vector<int64_t>> getSizes(
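
A rough Python analogue of the updated helper, for intuition only (new_like_flat is an illustrative stand-in, not the C++ function): when the template tensor is dense but not contiguous and strides should be preserved, the stacked allocation reuses the template's strides, with the new leading dimension strided by numel().

    import torch

    def new_like_flat(tensors, preserve_strides=True):
        t = tensors[0]
        sizes = [len(tensors)] + list(t.shape)
        if t.is_contiguous() or not preserve_strides:
            # fast path: plain contiguous allocation
            return torch.empty(sizes, dtype=t.dtype, device=t.device)
        # memory-dense but not contiguous (e.g. channels_last): keep t's strides,
        # and stride the new leading dimension by t.numel()
        strides = [t.numel()] + list(t.stride())
        return torch.empty_strided(sizes, strides, dtype=t.dtype, device=t.device)

    t = torch.randn(2, 2, 2, 2).to(memory_format=torch.channels_last)
    flat = new_like_flat([t, t])
    print(flat[0].stride() == t.stride())  # True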
