
Commit 025c21b

Check torch.compile numerics in simpleFSDP tests (#1925)
Added checks for the numerics of `torch.compile()` with the `aot_eager` backend against eager mode in unit tests, to guard against regressions.

```
torchrun --nproc-per-node=8 -m pytest torchtitan/experiments/simple_fsdp/tests/test_numerics.py
```
1 parent 755ce8f commit 025c21b
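For context, the pattern these tests follow is to train the same model twice, once in eager mode and once under `torch.compile`, and require the per-step losses to match. Below is a minimal single-process sketch of that pattern; the toy model, data, and hyperparameters are illustrative assumptions, not the test's actual setup:

```python
import copy

import torch
import torch.nn as nn


def collect_losses(model, inputs, labels, steps=20):
    # Train for a few steps and record the loss at each step.
    optim = torch.optim.Adam(model.parameters(), lr=1e-4)
    loss_fn = nn.MSELoss()
    losses = []
    for _ in range(steps):
        optim.zero_grad()
        loss = loss_fn(model(inputs), labels)
        loss.backward()
        optim.step()
        losses.append(loss.detach())
    return losses


torch.manual_seed(0)
model = nn.Linear(8, 8)
inputs, labels = torch.randn(4, 8), torch.randn(4, 8)

eager_losses = collect_losses(copy.deepcopy(model), inputs, labels)

# aot_eager traces the graph but runs the same eager kernels underneath,
# so its losses are expected to match eager bitwise.
compiled_model = torch.compile(
    copy.deepcopy(model), backend="aot_eager", fullgraph=True
)
compiled_losses = collect_losses(compiled_model, inputs, labels)

for eager_loss, compiled_loss in zip(eager_losses, compiled_losses):
    assert torch.equal(eager_loss, compiled_loss)
```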

2 files changed: +44 −13 lines

torchtitan/experiments/simple_fsdp/README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 ## SimpleFSDP
 
-[![integration tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_simple_fsdp.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_simple_fsdp.yaml?query=branch%3Amain)
+[![integration and numerics tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_simple_fsdp.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_simple_fsdp.yaml?query=branch%3Amain)
 [![arXiv](https://img.shields.io/badge/arXiv-2411.00284-b31b1b.svg)](https://arxiv.org/abs/2411.00284)
 
 💡 **Note**: SimpleFSDP's composability with Mixed Precision Training and Tensor Parallel requires updates from latest PyTorch, which can be installed (e.g., for CUDA 12.6) via
```
torchtitan/experiments/simple_fsdp/tests/test_numerics.py

Lines changed: 43 additions & 12 deletions

```diff
@@ -79,21 +79,44 @@ def run_simple_fsdp(self, model, inputs, labels, epoch=20):
             losses.append(loss)
         return losses
 
+    def run_simple_fsdp_compiled_aot_eager(self, model, inputs, labels, epoch=20):
+        model = data_parallel(
+            model,
+            device_mesh=self.device_mesh[tuple(self.dp_mesh_dim_names)],
+            mode=self.mode,
+        )
+        # TODO: Add "inductor" backend when its numerical issues are fixed
+        model = torch.compile(model, backend="aot_eager", fullgraph=True)
+        optim = self.optimizer(model.parameters(), lr=1e-4)
+        losses = []
+        for _ in range(epoch):
+            optim.zero_grad()
+            out = model(inputs)
+            loss = self.loss_fn(out, labels)
+            loss.backward()
+            optim.step()
+            losses.append(loss)
+        return losses
+
     def test_replicate_convergence(self):
         # unit test for replicate mode
         self.mode = "replicate"
         self.init_test()
         model, inputs, labels = self.get_input()
 
         fsdp2_losses = self.run_fsdp2(copy.deepcopy(model), inputs, labels)
-        simple_fsdp_replicate_losses = self.run_simple_fsdp(
+        simple_fsdp_losses = self.run_simple_fsdp(copy.deepcopy(model), inputs, labels)
+        simple_fsdp_compiled_aot_eager_losses = self.run_simple_fsdp_compiled_aot_eager(
             copy.deepcopy(model), inputs, labels
         )
 
-        for fsdp2_loss, simple_fsdp_replicate_loss in zip(
-            fsdp2_losses, simple_fsdp_replicate_losses
+        for (fsdp2_loss, simple_fsdp_loss, simple_fsdp_compiled_aot_eager_loss,) in zip(
+            fsdp2_losses,
+            simple_fsdp_losses,
+            simple_fsdp_compiled_aot_eager_losses,
         ):
-            assert torch.equal(fsdp2_loss, simple_fsdp_replicate_loss)
+            assert torch.equal(fsdp2_loss, simple_fsdp_loss)
+            assert torch.equal(fsdp2_loss, simple_fsdp_compiled_aot_eager_loss)
 
     def test_fullyshard_convergence(self):
         # unit test for fully_shard mode
@@ -102,14 +125,18 @@ def test_fullyshard_convergence(self):
         model, inputs, labels = self.get_input()
 
         fsdp2_losses = self.run_fsdp2(copy.deepcopy(model), inputs, labels)
-        simple_fsdp_fullyshard_losses = self.run_simple_fsdp(
+        simple_fsdp_losses = self.run_simple_fsdp(copy.deepcopy(model), inputs, labels)
+        simple_fsdp_compiled_aot_eager_losses = self.run_simple_fsdp_compiled_aot_eager(
             copy.deepcopy(model), inputs, labels
         )
 
-        for fsdp2_loss, simple_fsdp_fullyshard_loss in zip(
-            fsdp2_losses, simple_fsdp_fullyshard_losses
+        for (fsdp2_loss, simple_fsdp_loss, simple_fsdp_compiled_aot_eager_loss,) in zip(
+            fsdp2_losses,
+            simple_fsdp_losses,
+            simple_fsdp_compiled_aot_eager_losses,
         ):
-            assert torch.equal(fsdp2_loss, simple_fsdp_fullyshard_loss)
+            assert torch.equal(fsdp2_loss, simple_fsdp_loss)
+            assert torch.equal(fsdp2_loss, simple_fsdp_compiled_aot_eager_loss)
 
     def test_hybridshard_convergence(self):
         # unit test for hybrid_shard mode
@@ -118,11 +145,15 @@ def test_hybridshard_convergence(self):
         model, inputs, labels = self.get_input()
 
         fsdp2_losses = self.run_fsdp2(copy.deepcopy(model), inputs, labels)
-        simple_fsdp_hybridshard_losses = self.run_simple_fsdp(
+        simple_fsdp_losses = self.run_simple_fsdp(copy.deepcopy(model), inputs, labels)
+        simple_fsdp_compiled_aot_eager_losses = self.run_simple_fsdp_compiled_aot_eager(
             copy.deepcopy(model), inputs, labels
         )
 
-        for fsdp2_loss, simple_fsdp_hybridshard_loss in zip(
-            fsdp2_losses, simple_fsdp_hybridshard_losses
+        for (fsdp2_loss, simple_fsdp_loss, simple_fsdp_compiled_aot_eager_loss,) in zip(
+            fsdp2_losses,
+            simple_fsdp_losses,
+            simple_fsdp_compiled_aot_eager_losses,
         ):
-            assert torch.equal(fsdp2_loss, simple_fsdp_hybridshard_loss)
+            assert torch.equal(fsdp2_loss, simple_fsdp_loss)
+            assert torch.equal(fsdp2_loss, simple_fsdp_compiled_aot_eager_loss)
```
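The new helper deliberately pins the backend to `aot_eager`, which reuses eager kernels and is therefore expected to match eager bitwise; the TODO in the diff anticipates also covering `inductor`, whose kernel fusion and reordering generally make exact equality too strict. A hypothetical sketch of how that future comparison might be loosened — the `simple_fsdp_compiled_inductor_losses` name is an assumption, not part of this commit:

```python
# Hypothetical follow-up for the TODO: inductor-generated kernels reorder
# floating-point math, so compare within tolerances instead of bitwise.
# Assumes inductor losses were collected the same way as the aot_eager ones.
for fsdp2_loss, inductor_loss in zip(
    fsdp2_losses, simple_fsdp_compiled_inductor_losses
):
    torch.testing.assert_close(fsdp2_loss, inductor_loss)
```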
