8 changes: 7 additions & 1 deletion src/xccl/ProcessGroupXCCL.cpp
@@ -99,9 +99,15 @@ void checkSingleTensor(
     C10_THROW_ERROR(ValueError, "Tensors must be XPU and dense");
   }
 
-  // Skip the following requirements for P2P operations
+  // Check memory format
   if (!tensor.is_contiguous(tensor.suggest_memory_format())) {
+    // P2P is a bit relaxed, supporting transfer of a transposed tensor
     if (p2p) {
+      // But must be dense still
+      if (!tensor.is_non_overlapping_and_dense()) {
+        C10_THROW_ERROR(
+            ValueError, "Tensors for P2P must be non-overlapping and dense");
+      }
       TORCH_WARN_ONCE(
           "Detected non-contiguous tensor in P2P operations. It is user "
           "responsibility to guarantee that source and destination tensors have "
15 changes: 15 additions & 0 deletions test/xpu/distributed/test_c10d_xccl.py
@@ -248,6 +248,21 @@ def rank_to_GPU(self):
         # return rank to GPU map
         return init_multigpu_helper(self.world_size, "xccl")
 
+    @requires_xccl()
+    @skip_if_lt_x_gpu(2)
+    def test_send_recv_non_dense_tensor(self):
+        pg = self._create_process_group_xccl()
+        device = self.rank_to_GPU[self.rank][0]
+        full = torch.empty((64, 64), device=device).fill_(self.rank)
+        # Take a slice in col dimension, making it non-dense
Copilot AI Oct 13, 2025

The comment mentions 'col dimension' but should be more precise. The slice [:, 16:32] creates a non-contiguous view by selecting columns 16-31, which results in a non-dense tensor due to the stride pattern in memory.

Suggested change:
-        # Take a slice in col dimension, making it non-dense
+        # Take a slice along columns 16 to 31 (inclusive), resulting in a non-contiguous (non-dense) tensor due to the stride pattern in memory

+        block = full[:, 16:32]
+        if self.rank == 0:
+            with self.assertRaises(ValueError):
+                dist.send(block, dst=1)
+        elif self.rank == 1:
+            with self.assertRaises(ValueError):
+                dist.recv(block, src=0)
+
     @requires_xccl()
     @skip_but_pass_in_sandcastle_if(
         torch.xpu.device_count() < 2, "XCCL test requires 2+ GPUs"
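Both the relaxed check in `ProcessGroupXCCL.cpp` and the new test hinge on the difference between contiguity and density. Below is a minimal layout sketch (an illustration only, assuming plain CPU tensors are enough to show the stride properties; no XPU device or process group is involved):

```python
import torch

# Illustration of the tensor layouts the P2P check distinguishes (CPU tensors only).
full = torch.zeros(64, 64)
block = full[:, 16:32]        # column slice: shape (64, 16), strides (64, 1)

print(block.is_contiguous())  # False: a contiguous (64, 16) tensor would have strides (16, 1)
print(block.stride())         # (64, 1): each row of `block` skips 48 elements of `full`'s storage

# `block` is non-overlapping (each element has its own storage offset) but not dense:
# its 64 * 16 elements leave gaps between rows, so the new check rejects it for P2P.
# A transpose, by contrast, is non-contiguous yet still non-overlapping and dense,
# since it covers the whole storage without gaps; that is the case the relaxed
# P2P path is meant to allow.
t = full.t()
print(t.is_contiguous())      # False
print(t.stride())             # (1, 64)
```

The dense requirement presumably exists because P2P send/recv transfers a flat buffer of `numel()` elements, which only lines up with the tensor's memory when its elements occupy a gap-free region of storage.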