6 | 6 | from torch_xla._internal import rendezvous
7 | 7 | import logging
8 | 8 | import os
9 |   | -from torch._C._distributed_c10d import ProcessGroup, ScatterOptions, ReduceScatterOptions, AllgatherOptions, ReduceOptions
  | 9 | +from torch._C._distributed_c10d import ProcessGroup, ScatterOptions, ReduceScatterOptions, AllgatherOptions, AllToAllOptions, ReduceOptions
10 | 10 |
11 | 11 |
12 | 12 | def _create_xla_process_group(prefix_store, rank, size, timeout):
@@ -247,8 +247,24 @@ def reduce(self, tensors: list[torch.Tensor], opts: ReduceOptions):
247 | 247 |   def allreduce_coalesced(self, *args):
248 | 248 |     raise NotImplementedError
249 | 249 |
250 |     | -  def alltoall(self, *args):
251 |     | -    raise NotImplementedError
    | 250 | +  # Called by torch.distributed.all_to_all. Call site example:
    | 251 | +  # https://github.com/pytorch/pytorch/blob/v2.7.1/torch/distributed/distributed_c10d.py#L4577
    | 252 | +  # The difference between this and all_to_all_single is that this works
    | 253 | +  # on a list of tensors while all_to_all_single works on a single tensor
    | 254 | +  # and splits/concats along dimension 0.
    | 255 | +  def alltoall(self, output_tensor_list: list[torch.Tensor],
    | 256 | +               input_tensor_list: list[torch.Tensor], opts: AllToAllOptions):
    | 257 | +    stacked_inputs = torch.stack(input_tensor_list, dim=0)
    | 258 | +    split_count = len(input_tensor_list)
    | 259 | +    stacked_results = xm.all_to_all(
    | 260 | +        stacked_inputs,
    | 261 | +        split_dimension=0,
    | 262 | +        concat_dimension=0,
    | 263 | +        split_count=split_count)
    | 264 | +    results = torch.chunk(stacked_results, split_count, dim=0)
    | 265 | +    for result, output_tensor in zip(results, output_tensor_list):
    | 266 | +      output_tensor.copy_(result.squeeze(dim=0))
    | 267 | +    return _ret_work(output_tensor_list)
252 | 268 |
253 | 269 |   # handle the nondynamo path when call torch.distributed.all_to_all_single
254 | 270 |   # call from https://github.com/pytorch/pytorch/blob/758d78790164bfb041555daed380de96e06f78a3/torch/distributed/distributed_c10d.py#L3996
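For context, here is a minimal usage sketch (not part of the diff) of how the new `alltoall` hook could be exercised end to end through `torch.distributed.all_to_all` with the `xla` backend. The `demo_all_to_all` helper, tensor shapes, fill values, `init_method='xla://'` choice, and the launcher (e.g. `torchrun`) are illustrative assumptions, not code from this PR. Because the hook stacks the inputs with `torch.stack`, the sketch uses identically shaped tensors on every rank.

```python
# Hypothetical end-to-end sketch; assumes one XLA device per process and a
# torchrun-style launcher. Names below (demo_all_to_all, shapes, fill values)
# are illustrative, not part of the PR.
import torch
import torch.distributed as dist
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_backend  # registers the 'xla' c10d backend


def demo_all_to_all():
  dist.init_process_group('xla', init_method='xla://')
  device = xm.xla_device()
  world_size = dist.get_world_size()
  rank = dist.get_rank()

  # Rank r contributes input_tensors[j] to rank j; after the collective,
  # output_tensors[s] holds the tensor that rank s addressed to this rank.
  input_tensors = [
      torch.full((2, 2), float(rank * world_size + j), device=device)
      for j in range(world_size)
  ]
  output_tensors = [
      torch.empty((2, 2), device=device) for _ in range(world_size)
  ]

  # List form of all_to_all, which dispatches to the alltoall hook added
  # in this diff.
  dist.all_to_all(output_tensors, input_tensors)
  xm.mark_step()  # flush the lazily traced XLA graph

  for s, received in enumerate(output_tensors):
    expected = float(s * world_size + rank)
    assert torch.all(received.cpu() == expected), (s, received)


if __name__ == '__main__':
  demo_all_to_all()
```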