Commit 43589c0

Implement XLAShardedTensor.redistribute and test (#9529)
1 parent 2889f69 commit 43589c0

File tree: 5 files changed, +307 -1 lines


test/neuron/run_tests.sh

Lines changed: 1 addition & 0 deletions
@@ -256,6 +256,7 @@ function run_xla_op_tests3 {
   #run_test "$_TEST_DIR/spmd/test_dtensor_integration2.py"
   run_test_multi_device "$_TEST_DIR/spmd/test_dtensor_convert_mesh.py"
   run_test_multi_device "$_TEST_DIR/spmd/test_xla_dtensor_spec_conv.py"
+  run_test_multi_device "$_TEST_DIR/spmd/test_dtensor_redistribute.py"
   run_test "$_TEST_DIR/spmd/test_xla_auto_sharding.py"
   #run_test "$_TEST_DIR/spmd/test_spmd_parameter_wrapping.py"
   run_test "$_TEST_DIR/spmd/test_train_spmd_linear_model.py"

test/run_tests.sh

Lines changed: 1 addition & 0 deletions
@@ -256,6 +256,7 @@ function run_xla_op_tests3 {
   run_test_multi_devices_without_func "$_TEST_DIR/spmd/test_dtensor_integration3.py"
   run_test_multi_devices "$_TEST_DIR/spmd/test_dtensor_convert_mesh.py"
   run_test_multi_devices "$_TEST_DIR/spmd/test_xla_dtensor_spec_conversion.py"
+  run_test_multi_devices "$_TEST_DIR/spmd/test_dtensor_redistribute.py"
   run_test "$_TEST_DIR/spmd/test_xla_auto_sharding.py"
   run_test "$_TEST_DIR/spmd/test_spmd_parameter_wrapping.py"
   run_test "$_TEST_DIR/spmd/test_mp_input_sharding.py"
test/spmd/test_dtensor_redistribute.py

Lines changed: 269 additions & 0 deletions
@@ -0,0 +1,269 @@
import sys
import unittest
import torch
from torch.distributed.tensor.placement_types import Shard, Replicate, Partial
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs
import torch_xla
import numpy as np
import test_xla_sharding_base
from absl.testing import parameterized


class DTensorRedistributeTest(test_xla_sharding_base.XlaShardingTest,
                              parameterized.TestCase):

  @classmethod
  def setUpClass(cls):
    super().setUpClass()
    xr.use_spmd()

  def _verify_sharding_spec(self, tensor, expected_devices=None):
    """Verify tensor sharding spec after mark_step"""
    torch_xla.sync()
    sharding_spec = torch_xla._XLAC._get_xla_sharding_spec(tensor)
    if expected_devices:
      self.assertIn(expected_devices, sharding_spec)
    return sharding_spec

  # Test tensor shapes: 0D, 1D, 2D, 3D
  @parameterized.parameters(
      ((), ()),  # 0D scalar
      ((8,), (0,)),  # 1D
      ((8, 16), (0, None)),  # 2D
      ((4, 8, 16), (0, None, None))  # 3D
  )
  @unittest.skipIf(xr.global_runtime_device_count() < 2, "Need ≥2 devices")
  def test_tensor_shapes(self, shape, partition_spec):
    device_count = xr.global_runtime_device_count()
    mesh = xs.Mesh(np.arange(device_count), (device_count,))

    if len(shape) == 0:
      tensor = torch.tensor(1.0).to('xla')
      placements = [Replicate()]
      expected_spec = ()
    else:
      tensor = torch.randn(shape).to('xla')
      sharded_tensor = xs.mark_sharding(tensor, mesh, partition_spec)
      placements = [Shard(0)]
      expected_spec = partition_spec

    redistributed = sharded_tensor.redistribute(mesh, placements)
    self.assertEqual(redistributed.partition_spec, expected_spec)

    # Convert partition spec to expected devices pattern
    devices_pattern = [
        str(device_count) if spec == 0 else '1' for spec in expected_spec
    ]
    expected_devices = f"devices=[{','.join(devices_pattern)}]"

    # Skip HLO verification for 4D tensors due to XLA optimization issues
    if len(shape) < 4:
      self._verify_sharding_spec(redistributed.global_tensor,
                                 expected_devices)

  # Test tensor dtypes: bf16, f32, int32
  @parameterized.parameters(torch.bfloat16, torch.float32, torch.int32)
  @unittest.skipIf(xr.global_runtime_device_count() < 2, "Need ≥2 devices")
  def test_tensor_dtypes(self, dtype):
    device_count = xr.global_runtime_device_count()
    mesh = xs.Mesh(np.arange(device_count), (device_count,))

    if dtype == torch.int32:
      tensor = torch.randint(0, 100, (8, 16), dtype=dtype).to('xla')
    else:
      tensor = torch.randn(8, 16, dtype=dtype).to('xla')

    sharded_tensor = xs.mark_sharding(tensor, mesh, (0, None))
    placements = [Shard(0)]

    redistributed = sharded_tensor.redistribute(mesh, placements)
    self.assertEqual(redistributed.partition_spec, (0, None))
    self.assertEqual(redistributed.global_tensor.dtype, dtype)

    # Verify HLO sharding
    expected_devices = f"devices=[{device_count},1]"
    self._verify_sharding_spec(redistributed.global_tensor, expected_devices)

  # Test device mesh dimensions: 1D, 2D
  @unittest.skipIf(xr.global_runtime_device_count() < 4, "Need ≥4 devices")
  def test_device_mesh_dimensions(self):
    device_count = xr.global_runtime_device_count()

    # 1D mesh
    mesh_1d = xs.Mesh(np.arange(device_count), (device_count,))
    tensor = torch.randn(8, 16).to('xla')
    sharded_tensor = xs.mark_sharding(tensor, mesh_1d, (0, None))

    redistributed = sharded_tensor.redistribute(mesh_1d, [Shard(1)])
    self.assertEqual(redistributed.partition_spec, (None, 0))

    # Verify HLO sharding for 1D mesh
    expected_devices = f"devices=[1,{device_count}]"
    self._verify_sharding_spec(redistributed.global_tensor, expected_devices)

    # 2D mesh
    if device_count >= 4 and device_count % 2 == 0:
      mesh_2d = xs.Mesh(np.arange(device_count), (2, device_count // 2))
      tensor_2d = torch.randn(8, 16).to('xla')
      sharded_tensor = xs.mark_sharding(tensor_2d, mesh_2d, (0, None))

      redistributed = sharded_tensor.redistribute(
          mesh_2d, [Replicate(), Shard(1)])
      self.assertEqual(redistributed.partition_spec, (None, 1))

      # Verify HLO sharding for 2D mesh
      expected_devices = f"devices=[1,{device_count // 2},{device_count // 2}]"
      self._verify_sharding_spec(redistributed.global_tensor, expected_devices)

  # Test placement types: Replicate, Shard
  @unittest.skipIf(xr.global_runtime_device_count() < 2, "Need ≥2 devices")
  def test_placement_types(self):
    device_count = xr.global_runtime_device_count()
    mesh = xs.Mesh(np.arange(device_count), (device_count,))
    tensor = torch.randn(8, 16).to('xla')

    # Test Replicate
    sharded_tensor = xs.mark_sharding(tensor, mesh, (0, None))
    redistributed = sharded_tensor.redistribute(mesh, [Replicate()])
    self.assertEqual(redistributed.partition_spec, (None, None))

    # Verify HLO sharding for replicated
    self._verify_sharding_spec(redistributed.global_tensor, "replicated")

    # Test Shard on different dimensions
    for dim in [0, 1]:
      with self.subTest(shard_dim=dim):
        redistributed = sharded_tensor.redistribute(mesh, [Shard(dim)])
        expected_spec = [None, None]
        expected_spec[dim] = 0
        self.assertEqual(redistributed.partition_spec, tuple(expected_spec))

        # Verify HLO sharding
        devices_pattern = [
            str(device_count) if i == dim else '1' for i in range(2)
        ]
        expected_devices = f"devices=[{','.join(devices_pattern)}]"
        self._verify_sharding_spec(redistributed.global_tensor,
                                   expected_devices)

  # Test error cases with invalid inputs
  @unittest.skipIf(xr.global_runtime_device_count() < 2, "Need ≥2 devices")
  def test_invalid_inputs(self):
    device_count = xr.global_runtime_device_count()
    mesh = xs.Mesh(np.arange(device_count), (device_count,))
    tensor = torch.randn(8, 16).to('xla')
    sharded_tensor = xs.mark_sharding(tensor, mesh, (0, None))

    # Test invalid shard dimension (tensor only has dims 0,1 but asking for dim 2)
    with self.assertRaises(IndexError):
      sharded_tensor.redistribute(mesh, [Shard(2)])

    # Test mismatched placements length (1D mesh expects 1 placement, not 2)
    with self.assertRaises(ValueError):
      sharded_tensor.redistribute(mesh, [Shard(0), Shard(1)])

    # Test Partial placement - should raise error about not being implemented
    with self.assertRaises(NotImplementedError):
      sharded_tensor.redistribute(mesh, [Partial()])

  # Test sharding propagation through operations
  @unittest.skipIf(xr.global_runtime_device_count() < 2, "Need ≥2 devices")
  def test_sharding_propagation(self):
    device_count = xr.global_runtime_device_count()
    mesh = xs.Mesh(np.arange(device_count), (device_count,))

    # Unary ops
    tensor = torch.randn(8, 16).to('xla')
    sharded_tensor = xs.mark_sharding(tensor, mesh, (0, None))
    redistributed = sharded_tensor.redistribute(mesh, [Shard(0)])

    relu_result = torch.relu(redistributed.global_tensor)
    self.assertEqual(relu_result.shape, (8, 16))
    self.assertTrue(torch.all(relu_result >= 0))

    # Binary ops
    tensor2 = torch.randn(8, 16).to('xla')
    sharded_tensor2 = xs.mark_sharding(tensor2, mesh, (0, None))
    redistributed2 = sharded_tensor2.redistribute(mesh, [Shard(0)])

    add_result = redistributed.global_tensor + redistributed2.global_tensor
    mul_result = redistributed.global_tensor * redistributed2.global_tensor

    # Verify operation results
    self.assertEqual(add_result.shape, (8, 16))
    self.assertEqual(mul_result.shape, (8, 16))

    # Verify operations work correctly
    self.assertTrue(
        torch.allclose(
            add_result,
            redistributed.global_tensor + redistributed2.global_tensor))
    self.assertTrue(
        torch.allclose(
            mul_result,
            redistributed.global_tensor * redistributed2.global_tensor))

  # Test comprehensive redistribute scenarios
  @unittest.skipIf(xr.global_runtime_device_count() < 2, "Need ≥2 devices")
  def test_comprehensive_redistribute(self):
    device_count = xr.global_runtime_device_count()
    mesh = xs.Mesh(np.arange(device_count), (device_count,))

    tensor = torch.randn(8, 16).to('xla')
    sharded_tensor = xs.mark_sharding(tensor, mesh, (0, None))

    # Test all placement combinations for 1D mesh
    placement_types = [Replicate(), Shard(0), Shard(1)]

    for placement in placement_types:
      with self.subTest(placement=placement):
        placements = [placement]

        if isinstance(placement, Shard):
          expected_spec = [None] * 2
          expected_spec[placement.dim] = 0
          expected_spec = tuple(expected_spec)
        else:
          expected_spec = (None, None)

        redistributed = sharded_tensor.redistribute(mesh, placements)
        self.assertEqual(redistributed.partition_spec, expected_spec)

        # Verify HLO sharding
        if isinstance(placement, Shard):
          devices_pattern = [
              str(device_count) if i == placement.dim else '1' for i in range(2)
          ]
          expected_devices = f"devices=[{','.join(devices_pattern)}]"
        else:
          expected_devices = "replicated"
        self._verify_sharding_spec(redistributed.global_tensor,
                                   expected_devices)

  # Test async redistribute
  @unittest.skipIf(xr.global_runtime_device_count() < 4, "Need ≥4 devices")
  def test_async_redistribute(self):
    device_count = xr.global_runtime_device_count()
    mesh_shape = (2, device_count // 2)
    mesh = xs.Mesh(np.arange(device_count), mesh_shape)

    tensor = torch.randn(8, 16).to('xla')
    sharded_tensor = xs.mark_sharding(tensor, mesh, (0, None))

    # Test async redistribute
    placements = [Replicate(), Shard(0)]
    redistributed = sharded_tensor.redistribute(mesh, placements, async_op=True)
    self.assertEqual(redistributed.partition_spec, (1, None))

    # Verify async operation creates different tensor object
    self.assertIsNot(redistributed.global_tensor, sharded_tensor.global_tensor)

    # Verify HLO sharding for async redistribute (XLA generates more complex pattern)
    expected_devices = f"devices=[2,1,{device_count // 2}]"
    self._verify_sharding_spec(redistributed.global_tensor, expected_devices)


if __name__ == '__main__':
  test = unittest.main()
  sys.exit(0 if test.result.wasSuccessful() else 1)
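
The assertions above reduce to a fixed mapping from DTensor placements to an XLA partition_spec, which is the conversion loop added to XLAShardedTensor.redistribute in the xla_sharded_tensor.py diff further below. A minimal standalone sketch of that mapping follows; the helper name placements_to_partition_spec is hypothetical and not part of this commit.

from torch.distributed.tensor.placement_types import Shard, Replicate

def placements_to_partition_spec(placements, tensor_rank):
  # Mirrors the conversion in the new redistribute method: each Shard(dim)
  # maps that tensor dimension to the index of the mesh axis it is placed on;
  # all other tensor dimensions stay None (replicated).
  spec = [None] * tensor_rank
  for mesh_dim, placement in enumerate(placements):
    if isinstance(placement, Shard):
      spec[placement.dim] = mesh_dim
  return tuple(spec)

# Cases exercised by the tests above for a rank-2 tensor on a 1-D mesh:
assert placements_to_partition_spec([Replicate()], 2) == (None, None)
assert placements_to_partition_spec([Shard(0)], 2) == (0, None)
assert placements_to_partition_spec([Shard(1)], 2) == (None, 0)
# 2-D mesh cases (test_device_mesh_dimensions and test_async_redistribute):
assert placements_to_partition_spec([Replicate(), Shard(1)], 2) == (None, 1)
assert placements_to_partition_spec([Replicate(), Shard(0)], 2) == (1, None)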

test/tpu/run_tests.sh

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ run_test "$_TEST_DIR/spmd/test_xla_auto_sharding.py"
 run_test "$_TEST_DIR/spmd/test_fsdp_v2.py"
 run_test "$_TEST_DIR/spmd/test_dtensor_convert_mesh.py"
 run_test "$_TEST_DIR/spmd/test_xla_dtensor_spec_conversion.py"
+run_test "$_TEST_DIR/spmd/test_dtensor_redistribute.py"
 run_test "$_TEST_DIR/test_gradient_accumulation.py"
 XLA_EXPERIMENTAL=nonzero:masked_select:nms run_test "$_TEST_DIR/ds/test_dynamic_shape_models.py" -v
 run_test "$_TEST_DIR/test_autocast.py"

torch_xla/distributed/spmd/xla_sharded_tensor.py

Lines changed: 35 additions & 1 deletion
@@ -9,7 +9,7 @@
 import torch_xla.runtime as xr
 from torch.distributed.tensor._dtensor_spec import DTensorSpec, TensorMeta
 from torch.distributed.device_mesh import DeviceMesh
-from torch.distributed.tensor.placement_types import Placement, Shard, Replicate
+from torch.distributed.tensor.placement_types import Placement, Shard, Replicate, Partial
 from torch.utils._pytree import tree_map_only

@@ -264,6 +264,40 @@ def invalidate_spec_cache(self):
     """Invalidate the cached DTensorSpec."""
     self._cached_spec = None
 
+  def redistribute(self, device_mesh, placements, *, async_op: bool = False):
+    # Validate inputs
+    if len(placements) != len(device_mesh.mesh_shape):
+      raise ValueError(
+          f"Number of placements ({len(placements)}) must match mesh dimensions ({len(device_mesh.mesh_shape)})"
+      )
+
+    # Check for unsupported placement types
+    for placement in placements:
+      if isinstance(placement, Partial):
+        raise NotImplementedError(
+            "Partial placement is not yet implemented and may have unexpected behavior. "
+            "Use Shard or Replicate placements instead.")
+
+    # Convert placements to partition spec
+    partition_spec = [None] * len(self.global_tensor.shape)
+    for mesh_dim, placement in enumerate(placements):
+      if isinstance(placement, Shard):
+        if placement.dim >= len(self.global_tensor.shape):
+          raise IndexError(
+              f"Shard dimension {placement.dim} is out of bounds for tensor with {len(self.global_tensor.shape)} dimensions"
+          )
+        partition_spec[placement.dim] = mesh_dim
+
+    result_tensor = self.global_tensor.clone(
+    ) if async_op else self.global_tensor
+    op_sharding = device_mesh.get_op_sharding(tuple(partition_spec))
+    torch_xla._XLAC._xla_annotate_custom_sharding(result_tensor, op_sharding)
+
+    return XLAShardedTensor(
+        result_tensor,
+        mesh_shape=device_mesh.mesh_shape,
+        partition_spec=tuple(partition_spec))
+
   @classmethod
   def __torch_function__(cls, func, types, args=(), kwargs=None):
     return super().__torch_function__(func, types, args, kwargs)
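
For reference, a minimal end-to-end usage sketch of the method added above, assembled from calls that appear in this commit's test file. It is a sketch rather than part of the commit, and it assumes an SPMD-capable XLA runtime with at least two devices.

import numpy as np
import torch
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs
from torch.distributed.tensor.placement_types import Shard, Replicate

xr.use_spmd()  # enable SPMD mode, as the test's setUpClass does

device_count = xr.global_runtime_device_count()  # assumed >= 2
mesh = xs.Mesh(np.arange(device_count), (device_count,))

# Shard dim 0 of an (8, 16) tensor across the 1-D mesh.
t = torch.randn(8, 16).to('xla')
sharded = xs.mark_sharding(t, mesh, (0, None))

# Move to a fully replicated layout.
replicated = sharded.redistribute(mesh, [Replicate()])
print(replicated.partition_spec)  # (None, None)

# Reshard along dim 1 instead; async_op=True clones global_tensor first.
resharded = sharded.redistribute(mesh, [Shard(1)], async_op=True)
print(resharded.partition_spec)  # (None, 0)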
