Skip to content

Commit 11a231e

Browse files
kwen2501 authored and pytorchmergebot committed
[c10d] P2P tensors must be dense (pytorch#163719)
Fixes pytorch#161324 by adding `is_non_overlapping_and_dense` check. Pull Request resolved: pytorch#163719 Approved by: https://github.com/ngimel
1 parent dad54ca commit 11a231e

File tree

2 files changed

+26
-1
lines changed

2 files changed

+26
-1
lines changed

test/distributed/test_c10d_nccl.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2893,6 +2893,25 @@ def _reduce_timeout(self):
28932893
os.environ["TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC"] = "4"
28942894
os.environ["TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC"] = "1000"
28952895

2896+
@requires_nccl()
@skip_if_lt_x_gpu(3)
@skip_if_rocm_multiprocess
def test_send_recv_non_dense_tensor(self):
    """P2P send/recv must reject non-dense CUDA tensors with a ValueError."""
    store = c10d.FileStore(self.file_name, self.world_size)
    device = torch.device("cuda", self.rank % torch.cuda.device_count())
    dist.init_process_group(
        rank=self.rank, world_size=self.world_size, store=store, device_id=device
    )
    source = torch.empty((64, 64), device=device).fill_(self.rank)
    # A column slice of a contiguous 2-D tensor leaves gaps between rows,
    # making the view non-dense (non-overlapping-and-dense check fails).
    block = source[:, 16:32]
    if self.rank == 0:
        with self.assertRaises(ValueError):
            dist.send(block, dst=1)
    elif self.rank == 1:
        with self.assertRaises(ValueError):
            dist.recv(block, src=0)
2914+
28962915
@requires_nccl()
28972916
@requires_nccl_version((2, 4, 0), "Need NCCL 2.4+ for error checking")
28982917
@skip_if_lt_x_gpu(3)

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3217,9 +3217,15 @@ void check_gpu_single_tensor(
32173217
if (!tensor.is_cuda() || tensor.is_sparse()) {
32183218
C10_THROW_ERROR(ValueError, "Tensors must be CUDA and dense");
32193219
}
3220-
// Skip the following requirements for P2P operations
3220+
// Check memory format
32213221
if (!tensor.is_contiguous(tensor.suggest_memory_format())) {
3222+
// P2P is a bit relaxed, supporting transfer of a transposed tensor
32223223
if (p2p) {
3224+
// But must be dense still
3225+
if (!tensor.is_non_overlapping_and_dense()) {
3226+
C10_THROW_ERROR(
3227+
ValueError, "Tensors for P2P must be non-overlapping and dense");
3228+
}
32233229
TORCH_WARN_ONCE(
32243230
"Detected non-contiguous tensor in P2P operations. It is user "
32253231
"responsibility to guarantee that source and destination tensors have "

0 commit comments

Comments
 (0)