 10 |  10 | from torch.distributed.fsdp import FSDPModule
 11 |  11 | from torch.distributed.tensor import DTensor
 12 |  12 | from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 13 |     | -from torch.testing._internal.common_fsdp import FSDPTestMultiThread, get_devtype, MLP
 14 |     | -from torch.testing._internal.common_utils import run_tests
     |  13 | +from torch.testing._internal.common_fsdp import (
     |  14 | +    check_sharded_parity,
     |  15 | +    FSDPTestMultiThread,
     |  16 | +    get_devtype,
     |  17 | +    MLP,
     |  18 | +)
     |  19 | +from torch.testing._internal.common_utils import run_tests, wrapSwapTensorsTest
 15 |  20 |
 16 |  21 |
 17 |  22 | c10d_ops = torch.ops.c10d

@@ -169,5 +174,61 @@ def _assert_same_params(
169 | 174 |             self.assertEqual(param, ref_param)
170 | 175 |
171 | 176 |
    | 177 | +class TestReplicateCastAfterInit(FSDPTestMultiThread):
    | 178 | +    @property
    | 179 | +    def world_size(self) -> int:
    | 180 | +        return 2
    | 181 | +
    | 182 | +    @skip_if_lt_x_gpu(1)
    | 183 | +    @wrapSwapTensorsTest(True)
    | 184 | +    def test_to_float64_after_init(self):
    | 185 | +        """Tests that the user can cast the module to float64 after init."""
    | 186 | +        # NOTE: Test fp64 instead of a lower precision dtype like bf16 for
    | 187 | +        # better numerics. The important part is changing the dtype.
    | 188 | +
    | 189 | +        torch.manual_seed(42)
    | 190 | +        mlp_dim, device, dtype = 4, device_type, torch.float64
    | 191 | +        model = MLP(mlp_dim, device=device)
    | 192 | +        for param in model.parameters():
    | 193 | +            dist.broadcast(param, src=0)
    | 194 | +        ref_model = copy.deepcopy(model).to(dtype)
    | 195 | +
    | 196 | +        ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
    | 197 | +        for module in (model.in_proj, model.out_proj, model):
    | 198 | +            replicate(module)
    | 199 | +        model.to(dtype)
    | 200 | +        for param in model.parameters():
    | 201 | +            self.assertEqual(param.dtype, dtype)
    | 202 | +            self.assertEqual(param.to_local().dtype, dtype)
    | 203 | +            self.assertEqual(param._spec.tensor_meta.dtype, dtype)
    | 204 | +        optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=True)
    | 205 | +        check_sharded_parity(self, ref_model, model)
    | 206 | +        torch.manual_seed(42 + self.rank + 1)
    | 207 | +        inp = torch.randn((2, mlp_dim), device=device_type.type, dtype=dtype)
    | 208 | +        for iter_idx in range(10):
    | 209 | +            losses: list[torch.Tensor] = []
    | 210 | +            for _model in (ref_model, model):
    | 211 | +                losses.append(_model(inp).sum())
    | 212 | +                losses[-1].backward()
    | 213 | +
    | 214 | +            for param in ref_model.parameters():
    | 215 | +                if param.grad is not None:
    | 216 | +                    dist.all_reduce(param.grad)
    | 217 | +                    param.grad.div_(self.world_size)
    | 218 | +
    | 219 | +            self.assertEqual(losses[0], losses[1])
    | 220 | +            check_sharded_parity(self, ref_model, model)
    | 221 | +            for param in model.parameters():
    | 222 | +                self.assertEqual(param.dtype, dtype)
    | 223 | +                self.assertEqual(param.to_local().dtype, dtype)
    | 224 | +                self.assertEqual(param._spec.tensor_meta.dtype, dtype)
    | 225 | +                self.assertEqual(param.grad.dtype, dtype)
    | 226 | +                self.assertEqual(param.grad.to_local().dtype, dtype)
    | 227 | +                self.assertEqual(param.grad._spec.tensor_meta.dtype, dtype)
    | 228 | +            for _optim in (ref_optim, optim):
    | 229 | +                _optim.step()
    | 230 | +                _optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
    | 231 | +
    | 232 | +
172 | 233 | if __name__ == "__main__":
173 | 234 |     run_tests()
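
For readers skimming the diff, here is a minimal sketch of the user-facing pattern the new test exercises: apply replicate() to each submodule and then to the root, and only afterwards cast the module with .to(). The import path for replicate, the model definition, and the process-group setup are assumptions for illustration and are not taken from this diff.

# Minimal sketch of the cast-after-init pattern exercised by the test above.
# Assumptions: a default process group is already initialized (e.g. launched
# via torchrun), and `replicate` is importable from torch.distributed._composable.
import torch
import torch.nn as nn
from torch.distributed._composable import replicate  # assumed import path

model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 4))
for submodule in model:
    replicate(submodule)  # install replicate on each submodule first
replicate(model)  # then on the root module
model.to(torch.float64)  # cast after init; parameters follow the new dtype
assert next(model.parameters()).dtype == torch.float64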