
Commit b2ef2e3

baskrahmer, pre-commit-ci[bot], and Borda committed
Fix: no_grad with AMP bug (#20921)
* Disable cache for torch.autocast in amp
* Add a test
* Only test for bf16-mixed
* Implement test to reproduce the issue

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec <[email protected]>

(cherry picked from commit 216f9ec)
1 parent 257db2b · commit b2ef2e3
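For context, the behavior this commit works around can be reproduced with plain PyTorch, without Lightning. The sketch below is illustrative only and not part of the commit (the `layer`, `x`, and `out` names are made up for the example); it assumes a CPU bf16 autocast region and shows why a forward pass under `torch.no_grad()` inside a long-lived autocast context can break gradient tracking for later forwards, and why passing `cache_enabled=False`, as the patched `autocast_context_manager` does, avoids it.

import torch
from torch import nn

layer = nn.Linear(2, 1)
x = torch.randn(1, 2)

# With the autocast weight cache enabled (the default), the forward pass under
# `torch.no_grad()` caches bf16 copies of the weights that carry no autograd
# history; the later forward in the same autocast region reuses those copies,
# so its output can come out detached from the graph.
with torch.autocast("cpu", dtype=torch.bfloat16, cache_enabled=True):
    with torch.no_grad():
        _ = layer(x)
    out = layer(x)
    print(out.grad_fn)  # None on affected setups, so backward() would fail

# Disabling the cache forces autocast to re-cast the weights on every forward,
# which keeps the autograd history intact.
with torch.autocast("cpu", dtype=torch.bfloat16, cache_enabled=False):
    with torch.no_grad():
        _ = layer(x)
    out = layer(x)
    print(out.grad_fn)  # a valid grad_fn; out.mean().backward() succeeds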

File tree

  • src/lightning/pytorch/plugins/precision/amp.py
  • tests/tests_pytorch/plugins/precision/test_amp.py

2 files changed: +20 -1 lines


src/lightning/pytorch/plugins/precision/amp.py

Lines changed: 2 additions & 1 deletion
@@ -112,7 +112,8 @@ def clip_gradients(
         super().clip_gradients(optimizer=optimizer, clip_val=clip_val, gradient_clip_algorithm=gradient_clip_algorithm)

     def autocast_context_manager(self) -> torch.autocast:
-        return torch.autocast(self.device, dtype=(torch.bfloat16 if self.precision == "bf16-mixed" else torch.half))
+        dtype = torch.bfloat16 if self.precision == "bf16-mixed" else torch.half
+        return torch.autocast(self.device, dtype=dtype, cache_enabled=False)

     @override
     @contextmanager

tests/tests_pytorch/plugins/precision/test_amp.py

Lines changed: 18 additions & 0 deletions
@@ -14,6 +14,8 @@
 from unittest.mock import Mock

 import pytest
+import torch
+from torch import nn
 from torch.optim import Optimizer

 from lightning.pytorch.plugins import MixedPrecision
@@ -51,3 +53,19 @@ def test_optimizer_amp_scaling_support_in_step_method():

     with pytest.raises(RuntimeError, match="The current optimizer.*does not allow for gradient clipping"):
         precision.clip_gradients(optimizer, clip_val=1.0)
+
+
+def test_amp_with_no_grad():
+    """Test that asserts using `no_grad` context wrapper with a persistent AMP context wrapper does not break gradient
+    tracking."""
+    layer = nn.Linear(2, 1)
+    x = torch.randn(1, 2)
+    amp = MixedPrecision(precision="bf16-mixed", device="cpu")
+
+    with amp.autocast_context_manager():
+        with torch.no_grad():
+            _ = layer(x)
+
+        loss = layer(x).mean()
+        loss.backward()
+        assert loss.grad_fn is not None
