Commit 6b9e0a5

awaelchli and pre-commit-ci[bot] authored and committed
Support skipping training step when using mixed precision training (#18267)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
(cherry picked from commit 97020bf)

1 parent 6d63651 · commit 6b9e0a5
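
For orientation, this is the user-facing behavior the commit addresses: in automatic optimization, `training_step` may return `None` to skip the backward pass and the optimizer step for that batch. Below is a minimal illustrative sketch of that pattern under mixed precision; the model class and skip condition are made up here, mirroring the new test added in this commit rather than code shipped in it.

```python
# Illustrative sketch only; `SometimesSkip` and its skip condition are hypothetical,
# modeled on the test added in this commit.
from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel


class SometimesSkip(BoringModel):
    def training_step(self, batch, batch_idx):
        if batch_idx % 2:
            # Returning None skips backward; with "16-mixed" the grad-scaler
            # unscale_/step/update calls are now skipped for this batch as well.
            return None
        return super().training_step(batch, batch_idx)


if __name__ == "__main__":
    trainer = Trainer(accelerator="cuda", devices=1, precision="16-mixed", max_steps=4)
    trainer.fit(SometimesSkip())
```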

File tree: 3 files changed (+40, -3 lines changed)


src/lightning/pytorch/CHANGELOG.md (3 additions, 0 deletions)

@@ -31,6 +31,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed an issue that would prevent the user to set the multiprocessing start method after importing lightning ([#18177](https://github.com/Lightning-AI/lightning/pull/18177))


+- Fixed the gradient unscaling logic if the training step skipped backward (by returning `None`) ([#18267](https://github.com/Lightning-AI/lightning/pull/18267))
+
+
 - Ensure that the closure running inside the optimizer step has gradients enabled, even if the optimizer step has it disabled ([#18268](https://github.com/Lightning-AI/lightning/pull/18268))

src/lightning/pytorch/plugins/precision/amp.py (6 additions, 3 deletions)

@@ -75,16 +75,19 @@ def optimizer_step(  # type: ignore[override]
             raise MisconfigurationException("AMP and the LBFGS optimizer are not compatible.")
         closure_result = closure()

-        if not _optimizer_handles_unscaling(optimizer):
+        # If backward was skipped in automatic optimization (return None), unscaling is not needed
+        skip_unscaling = closure_result is None and model.automatic_optimization
+
+        if not _optimizer_handles_unscaling(optimizer) and not skip_unscaling:
             # Unscaling needs to be performed here in case we are going to apply gradient clipping.
             # Optimizers that perform unscaling in their `.step()` method are not supported (e.g., fused Adam).
             # Note: `unscale` happens after the closure is executed, but before the `on_before_optimizer_step` hook.
             self.scaler.unscale_(optimizer)

         self._after_closure(model, optimizer)
-        skipped_backward = closure_result is None
+
         # in manual optimization, the closure does not return a value
-        if not model.automatic_optimization or not skipped_backward:
+        if not skip_unscaling:
             # note: the scaler will skip the `optimizer.step` if nonfinite gradients are found
             step_output = self.scaler.step(optimizer, **kwargs)
             self.scaler.update()
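
For background on why unscaling must be skipped: the plugin drives the standard `torch.cuda.amp.GradScaler` protocol around each optimizer step. A plain-PyTorch sketch of that protocol follows (independent of Lightning; `model`, `optimizer`, `loss_fn`, and `batch` are placeholders, not names from this commit). When the closure returns `None`, `scaler.scale(loss).backward()` never ran, so there are no freshly scaled gradients to unscale, and the change above guards the `unscale_`, `step`, and `update` calls accordingly.

```python
# Plain-PyTorch sketch of the GradScaler sequence the plugin wraps per step.
# `model`, `optimizer`, `loss_fn`, `batch` are placeholders for illustration.
import torch

scaler = torch.cuda.amp.GradScaler()


def amp_step(model, optimizer, loss_fn, batch):
    optimizer.zero_grad()
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        loss = loss_fn(model(batch))
    scaler.scale(loss).backward()  # gradients are computed in scaled form
    scaler.unscale_(optimizer)     # unscale before gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
    scaler.step(optimizer)         # skips optimizer.step() if grads are non-finite
    scaler.update()                # adjusts the scale factor for the next step
```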

tests/tests_pytorch/plugins/precision/test_amp_integration.py (31 additions, 0 deletions)

@@ -11,11 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from unittest.mock import Mock
+
 import torch

 from lightning.fabric import seed_everything
 from lightning.pytorch import Trainer
 from lightning.pytorch.demos.boring_classes import BoringModel
+from lightning.pytorch.plugins.precision import MixedPrecisionPlugin
 from tests_pytorch.helpers.runif import RunIf


@@ -54,3 +57,31 @@ def run(fused=False):
     # Both the regular and the fused version of Adam produce the same losses and model weights
     for p, q in zip(params, params_fused):
         torch.testing.assert_close(p, q)
+
+
+@RunIf(min_cuda_gpus=1)
+def test_skip_training_step_with_grad_scaler():
+    """Test that the grad scaler gets skipped when skipping a training step."""
+
+    class TestModel(BoringModel):
+        def training_step(self, batch, batch_idx):
+            if batch_idx % 2:
+                return None  # skipping the backward should skip the grad scaler too
+            return super().training_step(batch, batch_idx)
+
+    trainer = Trainer(
+        accelerator="cuda",
+        devices=1,
+        precision="16-mixed",
+        barebones=True,
+        max_steps=5,
+        gradient_clip_val=0.5,
+    )
+    assert isinstance(trainer.precision_plugin, MixedPrecisionPlugin)
+    assert trainer.precision_plugin.scaler is not None
+    trainer.precision_plugin.scaler = Mock(wraps=trainer.precision_plugin.scaler)
+    model = TestModel()
+    trainer.fit(model)
+    assert trainer.precision_plugin.scaler.unscale_.call_count == 3
+    assert trainer.precision_plugin.scaler.step.call_count == 3
+    assert trainer.precision_plugin.scaler.update.call_count == 3
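
The expected call counts follow directly from the test model: with `max_steps=5`, batches with `batch_idx` 1 and 3 return `None`, so only batches 0, 2, and 4 run backward, and the wrapped scaler's `unscale_`, `step`, and `update` should each be invoked exactly 3 times.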
