Commit 28e0bdd

0x404 authored and awaelchli committed

Explicitly enable grad in closure (#18268)

Co-authored-by: awaelchli <[email protected]> (cherry picked from commit b88b8b3)

1 parent cfefd09 · commit 28e0bdd

3 files changed: +32 −0 lines changed


src/lightning/pytorch/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed an issue that would prevent the user to set the multiprocessing start method after importing lightning ([#18177](https://github.com/Lightning-AI/lightning/pull/18177))


+- Ensure that the closure running inside the optimizer step has gradients enabled, even if the optimizer step has it disabled ([#18268](https://github.com/Lightning-AI/lightning/pull/18268))
+
+
 ## [2.0.5] - 2023-07-07

 ### Fixed
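
For context on this entry: when a third-party optimizer decorates its `step()` with `@torch.no_grad()` (Hugging Face Transformers' AdamW does this), any closure it calls runs with grad mode off, so the forward pass builds no autograd graph and `backward()` raises a RuntimeError. A minimal sketch of that failure mode, independent of Lightning (the names here are illustrative, not from the commit):

```python
import torch

model = torch.nn.Linear(4, 1)

def closure():
    # Forward + backward, roughly what an automatic-optimization closure does.
    loss = model(torch.randn(2, 4)).sum()
    loss.backward()
    return loss

@torch.no_grad()  # mimics an optimizer whose step() disables grad
def step(closure):
    return closure()

try:
    step(closure)
except RuntimeError as err:
    # With grad mode off, the loss has no grad_fn, so backward() cannot run.
    print("closure failed inside no_grad step:", err)
```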

src/lightning/pytorch/loops/optimization/automatic.py

Lines changed: 1 addition & 0 deletions
@@ -123,6 +123,7 @@ def __init__(
         self._backward_fn = backward_fn
         self._zero_grad_fn = zero_grad_fn

+    @torch.enable_grad()
     def closure(self, *args: Any, **kwargs: Any) -> ClosureResult:
         step_output = self._step_fn()
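
The fix is the single `@torch.enable_grad()` decorator above: used as a decorator, it turns grad mode back on for the body of `closure`, even when the caller has disabled it. A minimal sketch of that mechanism using only stock PyTorch (again, names are illustrative):

```python
import torch

model = torch.nn.Linear(4, 1)

@torch.enable_grad()  # the same decorator this commit adds to the closure
def closure():
    loss = model(torch.randn(2, 4)).sum()
    assert torch.is_grad_enabled()  # grad mode is back on inside the closure
    loss.backward()
    return loss

@torch.no_grad()  # mimics a third-party optimizer's step()
def step(closure):
    return closure()

step(closure)
print(model.weight.grad is not None)  # True: backward() succeeded
```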

tests/tests_pytorch/loops/optimization/test_closure.py

Lines changed: 28 additions & 0 deletions
@@ -43,3 +43,31 @@ def step(self, closure=None):
     trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1)
     with pytest.raises(MisconfigurationException, match="The closure hasn't been executed"):
         trainer.fit(model)
+
+
+def test_closure_with_no_grad_optimizer(tmpdir):
+    """Test that the closure is guaranteed to run with grad enabled.
+
+    There are certain third-party library optimizers
+    (such as Hugging Face Transformers' AdamW) that set `no_grad` during the `step` operation.
+
+    """
+
+    class NoGradAdamW(torch.optim.AdamW):
+        @torch.no_grad()
+        def step(self, closure):
+            if closure is not None:
+                closure()
+            return super().step()
+
+    class TestModel(BoringModel):
+        def training_step(self, batch, batch_idx):
+            assert torch.is_grad_enabled()
+            return super().training_step(batch, batch_idx)
+
+        def configure_optimizers(self):
+            return NoGradAdamW(self.parameters(), lr=0.1)
+
+    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=1)
+    model = TestModel()
+    trainer.fit(model)
