Commit 193f704

Add tolerance for numeric test of checkpointing (#9404)
There was initially a concern that the numerics should be exact between runs with and without activation rematerialization. The rematerialized activations themselves should be exact; however, the XLA compiler may reorder the ops, so the final weight update may deviate slightly, and the final loss of the model can vary even more than that.
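
This is a general floating-point property rather than anything specific to checkpointing: reordering a reduction changes which rounding errors accumulate. A minimal, self-contained sketch (an illustration, not code from this commit) of the effect:

import torch

torch.manual_seed(0)
x = torch.randn(100000)

# The same values, reduced in two different orders.
forward = x.sum()
backward = x.flip(0).sum()

# Floating-point addition is not associative, so the two sums typically
# differ in the low-order bits. Op reordering by the XLA compiler can
# introduce the same kind of drift, which is why the test below compares
# with a tolerance instead of exact equality.
print((forward - backward).abs().item())  # small, but usually nonzero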
1 parent: 7e3efc5

File tree: 1 file changed (+2, -2 lines)
test/spmd/test_train_spmd_linear_model.py

Lines changed: 2 additions & 2 deletions

@@ -50,9 +50,9 @@ def test_basic(self):
     with extended_argv(['--use_gradient_checkpointing']):
       checkpointing_losses, checkpointing_result = train_and_evaluate()
       # Verify that the runs match with and without checkpointing.
-      assert torch.allclose(baseline_result, checkpointing_result)
+      assert torch.allclose(baseline_result, checkpointing_result, atol=0.005)
       assert all(
-          torch.allclose(baseline_loss, checkpointing_loss)
+          torch.allclose(baseline_loss, checkpointing_loss, atol=0.00002)
           for baseline_loss, checkpointing_loss in zip(
               baseline_losses, checkpointing_losses))
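
For reference, torch.allclose(input, other, rtol=1e-05, atol=1e-08) passes when |input - other| <= atol + rtol * |other| elementwise, so the new atol values bound the absolute drift the test tolerates: 0.005 on the final result and 0.00002 on each per-step loss. A small sketch with hypothetical stand-in values (not numbers from the test):

import torch

baseline_result = torch.tensor([1.0000, 2.0000])
checkpointing_result = torch.tensor([1.0030, 1.9980])  # hypothetical drift

# A default-tolerance comparison flags the drift, while the commit's
# atol=0.005 accepts it.
assert not torch.allclose(baseline_result, checkpointing_result)
assert torch.allclose(baseline_result, checkpointing_result, atol=0.005)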
