8 changes: 7 additions & 1 deletion src/xccl/ProcessGroupXCCL.cpp
@@ -99,9 +99,15 @@ void checkSingleTensor(
     C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense");
   }
 
-  // Skip the following requirements for P2P operations
+  // Check memory format
   if (!tensor.is_contiguous(tensor.suggest_memory_format())) {
+    // P2P is a bit relaxed, supporting transfer of a transposed tensor
     if (p2p) {
+      // But must be dense still
+      if (!tensor.is_non_overlapping_and_dense()) {
+        C10_THROW_ERROR(
+            ValueError, "Tensors for P2P must be non-overlapping and dense");
+      }
       TORCH_WARN_ONCE(
           "Detected non-contiguous tensor in P2P operations. It is user "
           "responsibility to guarantee that source and destination tensors have "
15 changes: 15 additions & 0 deletions test/xpu/distributed/test_c10d_xccl.py
@@ -248,6 +248,21 @@ def rank_to_GPU(self):
         # return rank to GPU map
         return init_multigpu_helper(self.world_size, "xccl")
 
+    @requires_xccl()
+    @skip_if_lt_x_gpu(2)
+    def test_send_recv_non_dense_tensor(self):
+        pg = self._create_process_group_xccl()
+        device = self.rank_to_GPU[self.rank][0]
+        full = torch.empty((64, 64), device=device).fill_(self.rank)
+        # Take a slice in col dimension, making it non-dense
Copilot AI Oct 13, 2025

The comment mentions 'col dimension' but should be more precise. The slice [:, 16:32] creates a non-contiguous view by selecting columns 16-31, which results in a non-dense tensor due to the stride pattern in memory.

Suggested change:
-        # Take a slice in col dimension, making it non-dense
+        # Take a slice along columns 16 to 31 (inclusive), resulting in a non-contiguous (non-dense) tensor due to the stride pattern in memory

+        block = full[:, 16:32]
+        if self.rank == 0:
+            with self.assertRaises(ValueError):
+                dist.send(block, dst=1)
+        elif self.rank == 1:
+            with self.assertRaises(ValueError):
+                dist.recv(block, src=0)
+
     @requires_xccl()
     @skip_but_pass_in_sandcastle_if(
         torch.xpu.device_count() < 2, "XCCL test requires 2+ GPUs"
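Both the relaxed check in `ProcessGroupXCCL.cpp` and the new test hinge on the difference between contiguity and density. Below is a minimal layout sketch (an illustration only, assuming plain CPU tensors are enough to show the stride properties; no XPU device or process group is involved):

```python
import torch

# Illustration of the tensor layouts the P2P check distinguishes (CPU tensors only).
full = torch.zeros(64, 64)
block = full[:, 16:32]        # column slice: shape (64, 16), strides (64, 1)

print(block.is_contiguous())  # False: a contiguous (64, 16) tensor would have strides (16, 1)
print(block.stride())         # (64, 1): each row of `block` skips 48 elements of `full`'s storage

# `block` is non-overlapping (each element has its own storage offset) but not dense:
# its 64 * 16 elements leave gaps between rows, so the new check rejects it for P2P.
# A transpose, by contrast, is non-contiguous yet still non-overlapping and dense,
# since it covers the whole storage without gaps; that is the case the relaxed
# P2P path is meant to allow.
t = full.t()
print(t.is_contiguous())      # False
print(t.stride())             # (1, 64)
```

The dense requirement presumably exists because P2P send/recv transfers a flat buffer of `numel()` elements, which only lines up with the tensor's memory when its elements occupy a gap-free region of storage.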