From 9d92a4a28e2c6cd208f832efd0a015d7c8cd58ff Mon Sep 17 00:00:00 2001
From: milesial
Date: Tue, 3 Jan 2023 15:48:31 +0100
Subject: [PATCH 01/10] Fix LR scheduler behaviour with AMP

---
 src/pytorch_lightning/core/module.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py
index 878313b675bdc..9c5875d931939 100644
--- a/src/pytorch_lightning/core/module.py
+++ b/src/pytorch_lightning/core/module.py
@@ -1649,6 +1649,10 @@ def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
             scheduler.step(epoch=self.current_epoch)
 
         """
+        optimizer = self.trainer.optimizers[optimizer_idx]
+        if hasattr(optimizer, '_step_count') and optimizer._step_count <= 0:
+            return
+
         if metric is None:
             scheduler.step()  # type: ignore[call-arg]
         else:

From 2c2b138342360e313d6c0ae2f856a29ca5f1e22a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 3 Jan 2023 14:55:56 +0000
Subject: [PATCH 02/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/pytorch_lightning/core/module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py
index 9c5875d931939..417332f317b53 100644
--- a/src/pytorch_lightning/core/module.py
+++ b/src/pytorch_lightning/core/module.py
@@ -1650,7 +1650,7 @@ def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
 
         """
         optimizer = self.trainer.optimizers[optimizer_idx]
-        if hasattr(optimizer, '_step_count') and optimizer._step_count <= 0:
+        if hasattr(optimizer, "_step_count") and optimizer._step_count <= 0:
             return
 
         if metric is None:

From 1b3365ead62379dce92a1d7b0280c610d26afdc1 Mon Sep 17 00:00:00 2001
From: milesial
Date: Mon, 9 Jan 2023 00:38:58 -0800
Subject: [PATCH 03/10] Fix LR schedulers when optimizers with frequencies

---
 src/pytorch_lightning/loops/epoch/training_epoch_loop.py    | 2 +-
 tests/tests_pytorch/trainer/optimization/test_optimizers.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
index 5bdec1b55226b..b44ed64d659f8 100644
--- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
+++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
@@ -390,7 +390,7 @@ def update_lr_schedulers(self, interval: str, update_plateau_schedulers: bool) -
         if interval == "step" and self._should_accumulate():
             return
         active_optimizers = _get_active_optimizers(
-            self.trainer.optimizers, self.trainer.optimizer_frequencies, self.total_batch_idx
+            self.trainer.optimizers, self.trainer.optimizer_frequencies, self.batch_idx
         )
         self._update_learning_rates(
             interval=interval,
diff --git a/tests/tests_pytorch/trainer/optimization/test_optimizers.py b/tests/tests_pytorch/trainer/optimization/test_optimizers.py
index ed821b0d6ff4c..155cc175bcf80 100644
--- a/tests/tests_pytorch/trainer/optimization/test_optimizers.py
+++ b/tests/tests_pytorch/trainer/optimization/test_optimizers.py
@@ -286,7 +286,7 @@ def configure_optimizers(self):
             (dict(step_size=5), dict(T_max=2)),
             ("epoch", "epoch"),
             (5, 10),
-            (2, 3),
+            (4, 1),
             3,
         ),
     ],
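Context for patches 01-03: with native AMP, `torch.cuda.amp.GradScaler.step(optimizer)` silently skips the wrapped `optimizer.step()` whenever it finds inf/NaN gradients (which is expected while the loss scale calibrates), so an unconditional `scheduler.step()` afterwards both triggers PyTorch's "Detected call of `lr_scheduler.step()` before `optimizer.step()`" UserWarning and lets the schedule run ahead of the optimizer. The guard above relies on the optimizer's private `_step_count` attribute, which PyTorch's LR schedulers attach to the optimizer and which only increments when `optimizer.step()` actually runs. The following is a minimal, illustrative sketch of the problem and of an equivalent guard in plain PyTorch (not Lightning code; assumes a CUDA device, model and sizes are arbitrary):

    import torch

    model = torch.nn.Linear(8, 1).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)  # also sets optimizer._step_count = 0
    scaler = torch.cuda.amp.GradScaler()

    for batch in range(20):
        optimizer.zero_grad()
        with torch.autocast("cuda"):
            loss = model(torch.randn(4, 8, device="cuda")).sum()
        scaler.scale(loss).backward()
        scaler.step(optimizer)  # skips optimizer.step() when non-finite gradients are found
        scaler.update()
        # Guard equivalent to patches 01-03: only step the scheduler once the optimizer
        # has really stepped at least once, otherwise PyTorch warns and the schedule drifts.
        if getattr(optimizer, "_step_count", 1) > 0:
            scheduler.step()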
From a237283d446ccb1ba81f036416eaca6dbe361695 Mon Sep 17 00:00:00 2001
From: milesial
Date: Wed, 4 Jan 2023 11:49:55 +0100
Subject: [PATCH 04/10] Move implementation to scales comparison

---
 src/lightning_fabric/plugins/precision/native_amp.py     | 2 ++
 src/pytorch_lightning/core/module.py                     | 4 ----
 src/pytorch_lightning/loops/epoch/training_epoch_loop.py | 5 +++--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/lightning_fabric/plugins/precision/native_amp.py b/src/lightning_fabric/plugins/precision/native_amp.py
index e86a61c490a24..372afdcc395be 100644
--- a/src/lightning_fabric/plugins/precision/native_amp.py
+++ b/src/lightning_fabric/plugins/precision/native_amp.py
@@ -73,8 +73,10 @@ def optimizer_step(
             return super().optimizer_step(optimizer, **kwargs)
         if isinstance(optimizer, LBFGS):
             raise TypeError("Native AMP and the LBFGS optimizer are not compatible.")
+        previous_scale = self.scaler.get_scale()
         # note: the scaler will skip the `optimizer.step` if nonfinite gradients are found
         step_output = self.scaler.step(optimizer, **kwargs)
+        model._skip_next_scheduler_step = self.scaler.get_scale() < previous_scale
         self.scaler.update()
         return step_output
 
diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py
index 417332f317b53..878313b675bdc 100644
--- a/src/pytorch_lightning/core/module.py
+++ b/src/pytorch_lightning/core/module.py
@@ -1649,10 +1649,6 @@ def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
             scheduler.step(epoch=self.current_epoch)
 
         """
-        optimizer = self.trainer.optimizers[optimizer_idx]
-        if hasattr(optimizer, "_step_count") and optimizer._step_count <= 0:
-            return
-
         if metric is None:
             scheduler.step()  # type: ignore[call-arg]
         else:
diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
index b44ed64d659f8..b8509e082dafb 100644
--- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
+++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
@@ -216,8 +216,9 @@ def advance(self, data_fetcher: AbstractDataFetcher) -> None:
 
         # update non-plateau LR schedulers
         # update epoch-interval ones only when we are at the end of training epoch
-        self.update_lr_schedulers("step", update_plateau_schedulers=False)
-        if self._num_ready_batches_reached():
+        if not getattr(self.trainer.lightning_module, "_skip_next_scheduler_step", False):
+            self.update_lr_schedulers("step", update_plateau_schedulers=False)
+        elif self._num_ready_batches_reached():
             self.update_lr_schedulers("epoch", update_plateau_schedulers=False)
 
         batch_end_outputs = self._prepare_outputs_training_batch_end(

From 183a6a6318d159a508d07c81bb7b1b42a5f0d1bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Tue, 10 Jan 2023 17:13:11 +0100
Subject: [PATCH 05/10] Catch warnings

---
 setup.cfg | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.cfg b/setup.cfg
index f2609c80d0803..90220afd0741c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,6 +34,7 @@ markers =
     cloud:Run the cloud tests for example
 filterwarnings =
     error::FutureWarning
+    error:Detected call of `lr_scheduler.step\(\)` before `optimizer.step\(\)`:UserWarning
 xfail_strict = true
 junit_duration_report = call
 
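Patch 04 above switches the detection from `_step_count` to a scale comparison: `GradScaler` lowers its scale by `backoff_factor` whenever it had to skip `optimizer.step()` because of non-finite gradients, so the scale decreasing across a step is a reliable "this step was skipped" signal, and patch 05 turns the corresponding PyTorch UserWarning into a test failure so regressions are caught. Note that the scale is only adjusted inside `scaler.update()`, which is why patch 06 below moves the comparison to after the `update()` call. A standalone sketch of the heuristic in plain PyTorch (illustrative names, not the Lightning implementation):

    import torch

    def scaler_step_was_skipped(scaler: torch.cuda.amp.GradScaler, optimizer: torch.optim.Optimizer) -> bool:
        """Run scaler.step()/update() and report whether the optimizer step was skipped."""
        previous_scale = scaler.get_scale()
        scaler.step(optimizer)  # internally skips optimizer.step() on inf/NaN gradients
        scaler.update()         # on a skipped step, the scale is reduced by backoff_factor here
        return scaler.get_scale() < previous_scale

Lightning stores the result of this comparison as a `_skip_next_scheduler_step` flag (first on the LightningModule, and on the optimizer object from patch 06 onwards) and the training loop consults it before stepping step-interval schedulers.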
From 0406a5561979f4cff723321592aeccce197084ec Mon Sep 17 00:00:00 2001
From: milesial
Date: Tue, 10 Jan 2023 20:05:28 +0100
Subject: [PATCH 06/10] Fix implementation

---
 src/lightning_fabric/plugins/precision/native_amp.py     | 2 +-
 src/pytorch_lightning/loops/epoch/training_epoch_loop.py | 8 +++++---
 src/pytorch_lightning/plugins/precision/native_amp.py    | 2 ++
 tests/tests_fabric/plugins/precision/test_native_amp.py  | 1 +
 tests/tests_pytorch/models/test_hooks.py                 | 3 ++-
 5 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/lightning_fabric/plugins/precision/native_amp.py b/src/lightning_fabric/plugins/precision/native_amp.py
index 372afdcc395be..c165e89ae4ee9 100644
--- a/src/lightning_fabric/plugins/precision/native_amp.py
+++ b/src/lightning_fabric/plugins/precision/native_amp.py
@@ -76,8 +76,8 @@ def optimizer_step(
         previous_scale = self.scaler.get_scale()
         # note: the scaler will skip the `optimizer.step` if nonfinite gradients are found
         step_output = self.scaler.step(optimizer, **kwargs)
-        model._skip_next_scheduler_step = self.scaler.get_scale() < previous_scale
         self.scaler.update()
+        optimizer._skip_next_scheduler_step = self.scaler.get_scale() < previous_scale
         return step_output
 
     def state_dict(self) -> Dict[str, Any]:
diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
index b8509e082dafb..0bd0cd138d802 100644
--- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
+++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
@@ -216,9 +216,8 @@ def advance(self, data_fetcher: AbstractDataFetcher) -> None:
 
         # update non-plateau LR schedulers
        # update epoch-interval ones only when we are at the end of training epoch
-        if not getattr(self.trainer.lightning_module, "_skip_next_scheduler_step", False):
-            self.update_lr_schedulers("step", update_plateau_schedulers=False)
-        elif self._num_ready_batches_reached():
+        self.update_lr_schedulers("step", update_plateau_schedulers=False)
+        if self._num_ready_batches_reached():
             self.update_lr_schedulers("epoch", update_plateau_schedulers=False)
 
         batch_end_outputs = self._prepare_outputs_training_batch_end(
@@ -451,6 +450,9 @@ def _update_learning_rates(
                     )
                     continue
 
+                if getattr(self.trainer.optimizers[config.opt_idx], "_skip_next_scheduler_step", False):
+                    continue
+
                 self.scheduler_progress.increment_ready()
 
                 # update LR
diff --git a/src/pytorch_lightning/plugins/precision/native_amp.py b/src/pytorch_lightning/plugins/precision/native_amp.py
index 9d1393871fc1d..a02412bd2d108 100644
--- a/src/pytorch_lightning/plugins/precision/native_amp.py
+++ b/src/pytorch_lightning/plugins/precision/native_amp.py
@@ -85,8 +85,10 @@ def optimizer_step(  # type: ignore[override]
         # in manual optimization, the closure does not return a value
         if not model.automatic_optimization or not skipped_backward:
             # note: the scaler will skip the `optimizer.step` if nonfinite gradients are found
+            previous_scale = self.scaler.get_scale()
             step_output = self.scaler.step(optimizer, **kwargs)
             self.scaler.update()
+            optimizer._skip_next_scheduler_step = self.scaler.get_scale() < previous_scale
             return step_output
         return closure_result
 
diff --git a/tests/tests_fabric/plugins/precision/test_native_amp.py b/tests/tests_fabric/plugins/precision/test_native_amp.py
index 62f983ab78762..d8ecc02fe06e5 100644
--- a/tests/tests_fabric/plugins/precision/test_native_amp.py
+++ b/tests/tests_fabric/plugins/precision/test_native_amp.py
@@ -69,6 +69,7 @@ def test_native_amp_precision_backward():
 def test_native_amp_precision_optimizer_step_with_scaler():
     precision = MixedPrecision(precision="mixed", device="cuda")
     precision.scaler = Mock()
+    precision.scaler.get_scale = Mock(return_value=1.0)
     optimizer = Mock()
     precision.optimizer_step(optimizer, keyword="arg")
 
diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py
index 3702c8a922546..4293ef8d72dce 100644
--- a/tests/tests_pytorch/models/test_hooks.py
+++ b/tests/tests_pytorch/models/test_hooks.py
@@ -304,6 +304,7 @@ def _auto_train_batch(
         trainer, model, batches, device=torch.device("cpu"), current_epoch=0, current_batch=0, **kwargs
     ):
         using_deepspeed = kwargs.get("strategy") == "deepspeed"
+        using_native_amp = trainer.precision == 16 and trainer.amp_backend == 'native'
         out = []
         for i in range(current_batch, batches):
             out.extend(
@@ -347,7 +348,7 @@ def _auto_train_batch(
                         kwargs=dict(on_tpu=False, using_lbfgs=False),
                     ),
                     *(
-                        [dict(name="lr_scheduler_step", args=(ANY, 0, None))]
+                        [dict(name="lr_scheduler_step", args=(ANY, 0, None)) if not using_native_amp else ANY]
                         if i == (trainer.num_training_batches - 1)
                         else []
                     ),
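One side effect of the new comparison shows up in the fabric unit test touched above: the mocked scaler's `get_scale()` must now return a real number, because the plugin compares two consecutive `get_scale()` results with `<`, and plain `Mock` objects do not support ordering. Roughly (a sketch using `unittest.mock`, not the test file itself):

    from unittest.mock import Mock

    scaler = Mock()
    # scaler.get_scale() < scaler.get_scale() would raise:
    # TypeError: '<' not supported between instances of 'Mock' and 'Mock'
    scaler.get_scale = Mock(return_value=1.0)
    assert not (scaler.get_scale() < scaler.get_scale())  # 1.0 < 1.0 -> False, i.e. "not skipped"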
From c44a892f4226933c73ff94fce72d8e3d5aa207e0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 10 Jan 2023 20:05:04 +0000
Subject: [PATCH 07/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/tests_pytorch/models/test_hooks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py
index 4293ef8d72dce..957a87e1a4e13 100644
--- a/tests/tests_pytorch/models/test_hooks.py
+++ b/tests/tests_pytorch/models/test_hooks.py
@@ -304,7 +304,7 @@ def _auto_train_batch(
         trainer, model, batches, device=torch.device("cpu"), current_epoch=0, current_batch=0, **kwargs
     ):
         using_deepspeed = kwargs.get("strategy") == "deepspeed"
-        using_native_amp = trainer.precision == 16 and trainer.amp_backend == 'native'
+        using_native_amp = trainer.precision == 16 and trainer.amp_backend == "native"
         out = []
         for i in range(current_batch, batches):
             out.extend(

From 11e4f721426357c88841b86e270c88530cdab051 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 3 Feb 2023 01:56:51 +0000
Subject: [PATCH 08/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/tests_pytorch/models/test_hooks.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py
index 12538872ca2b7..66a32218e6f51 100644
--- a/tests/tests_pytorch/models/test_hooks.py
+++ b/tests/tests_pytorch/models/test_hooks.py
@@ -345,11 +345,7 @@ def _auto_train_batch(
                         name="optimizer_step",
                         args=(current_epoch, i, ANY, ANY),
                     ),
-                    *(
-                        [dict(name="lr_scheduler_step", args=ANY)]
-                        if i == (trainer.num_training_batches - 1)
-                        else []
-                    ),
+                    *([dict(name="lr_scheduler_step", args=ANY)] if i == (trainer.num_training_batches - 1) else []),
                     dict(name="Callback.on_train_batch_end", args=(trainer, model, dict(loss=ANY), ANY, i)),
                     dict(name="on_train_batch_end", args=(dict(loss=ANY), ANY, i)),
                 ]
From b99707525796dd1472f5779577482e9cc507d002 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 26 Apr 2023 19:46:59 +0000
Subject: [PATCH 09/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/tests_pytorch/models/test_hooks.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py
index f5aac0030e79c..e8257b63003c6 100644
--- a/tests/tests_pytorch/models/test_hooks.py
+++ b/tests/tests_pytorch/models/test_hooks.py
@@ -296,13 +296,13 @@ def _auto_train_batch(
                     },
                     # this is after because it refers to the `LightningModule.optimizer_step` hook which encapsulates
                    # the actual call to `PrecisionPlugin.optimizer_step`
-                    dict(
-                        name="optimizer_step",
-                        args=(current_epoch, i, ANY, ANY),
-                    ),
-                    *([dict(name="lr_scheduler_step", args=ANY)] if i == (trainer.num_training_batches - 1) else []),
-                    dict(name="Callback.on_train_batch_end", args=(trainer, model, dict(loss=ANY), ANY, i)),
-                    dict(name="on_train_batch_end", args=(dict(loss=ANY), ANY, i)),
+                    {
+                        "name": "optimizer_step",
+                        "args": (current_epoch, i, ANY, ANY),
+                    },
+                    *([{"name": "lr_scheduler_step", "args": ANY}] if i == (trainer.num_training_batches - 1) else []),
+                    {"name": "Callback.on_train_batch_end", "args": (trainer, model, {"loss": ANY}, ANY, i)},
+                    {"name": "on_train_batch_end", "args": ({"loss": ANY}, ANY, i)},
                 ]
             )
         return out
From 965fc03b4cff39028e90634b2ddcb2e483f6c3c1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 16 Feb 2024 17:26:55 +0000
Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/tests_pytorch/models/test_hooks.py | 82 ++++++++++++------------
 1 file changed, 40 insertions(+), 42 deletions(-)

diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py
index a022154800b20..596d08641da1b 100644
--- a/tests/tests_pytorch/models/test_hooks.py
+++ b/tests/tests_pytorch/models/test_hooks.py
@@ -266,48 +266,46 @@ def _auto_train_batch(trainer, model, batches, device, current_epoch=0, current_
         using_deepspeed = kwargs.get("strategy") == "deepspeed"
         out = []
         for i in range(current_batch, batches):
-            out.extend(
-                [
-                    {"name": "on_before_batch_transfer", "args": (ANY, 0)},
-                    {"name": "transfer_batch_to_device", "args": (ANY, device, 0)},
-                    {"name": "on_after_batch_transfer", "args": (ANY, 0)},
-                    {"name": "Callback.on_train_batch_start", "args": (trainer, model, ANY, i)},
-                    {"name": "on_train_batch_start", "args": (ANY, i)},
-                    {"name": "forward", "args": (ANY,)},
-                    {"name": "training_step", "args": (ANY, i)},
-                    {"name": "Callback.on_before_zero_grad", "args": (trainer, model, ANY)},
-                    {"name": "on_before_zero_grad", "args": (ANY,)},
-                    {"name": "optimizer_zero_grad", "args": (current_epoch, i, ANY)},
-                    {"name": "Callback.on_before_backward", "args": (trainer, model, ANY)},
-                    {"name": "on_before_backward", "args": (ANY,)},
-                    # DeepSpeed handles backward internally
-                    *([{"name": "backward", "args": (ANY,)}] if not using_deepspeed else []),
-                    {"name": "Callback.on_after_backward", "args": (trainer, model)},
-                    {"name": "on_after_backward"},
-                    # note: unscaling happens here in the case of AMP
-                    {"name": "Callback.on_before_optimizer_step", "args": (trainer, model, ANY)},
-                    {"name": "on_before_optimizer_step", "args": (ANY,)},
-                    {
-                        "name": "clip_gradients",
-                        "args": (ANY,),
-                        "kwargs": {"gradient_clip_val": None, "gradient_clip_algorithm": None},
-                    },
-                    {
-                        "name": "configure_gradient_clipping",
-                        "args": (ANY,),
-                        "kwargs": {"gradient_clip_val": None, "gradient_clip_algorithm": None},
-                    },
-                    # this is after because it refers to the `LightningModule.optimizer_step` hook which encapsulates
-                    # the actual call to `Precision.optimizer_step`
-                    {
-                        "name": "optimizer_step",
-                        "args": (current_epoch, i, ANY, ANY),
-                    },
-                    *([{"name": "lr_scheduler_step", "args": ANY}] if i == (trainer.num_training_batches - 1) else []),
-                    {"name": "Callback.on_train_batch_end", "args": (trainer, model, {"loss": ANY}, ANY, i)},
-                    {"name": "on_train_batch_end", "args": ({"loss": ANY}, ANY, i)},
-                ]
-            )
+            out.extend([
+                {"name": "on_before_batch_transfer", "args": (ANY, 0)},
+                {"name": "transfer_batch_to_device", "args": (ANY, device, 0)},
+                {"name": "on_after_batch_transfer", "args": (ANY, 0)},
+                {"name": "Callback.on_train_batch_start", "args": (trainer, model, ANY, i)},
+                {"name": "on_train_batch_start", "args": (ANY, i)},
+                {"name": "forward", "args": (ANY,)},
+                {"name": "training_step", "args": (ANY, i)},
+                {"name": "Callback.on_before_zero_grad", "args": (trainer, model, ANY)},
+                {"name": "on_before_zero_grad", "args": (ANY,)},
+                {"name": "optimizer_zero_grad", "args": (current_epoch, i, ANY)},
+                {"name": "Callback.on_before_backward", "args": (trainer, model, ANY)},
+                {"name": "on_before_backward", "args": (ANY,)},
+                # DeepSpeed handles backward internally
+                *([{"name": "backward", "args": (ANY,)}] if not using_deepspeed else []),
+                {"name": "Callback.on_after_backward", "args": (trainer, model)},
+                {"name": "on_after_backward"},
+                # note: unscaling happens here in the case of AMP
+                {"name": "Callback.on_before_optimizer_step", "args": (trainer, model, ANY)},
+                {"name": "on_before_optimizer_step", "args": (ANY,)},
+                {
+                    "name": "clip_gradients",
+                    "args": (ANY,),
+                    "kwargs": {"gradient_clip_val": None, "gradient_clip_algorithm": None},
+                },
+                {
+                    "name": "configure_gradient_clipping",
+                    "args": (ANY,),
+                    "kwargs": {"gradient_clip_val": None, "gradient_clip_algorithm": None},
+                },
+                # this is after because it refers to the `LightningModule.optimizer_step` hook which encapsulates
+                # the actual call to `Precision.optimizer_step`
+                {
+                    "name": "optimizer_step",
+                    "args": (current_epoch, i, ANY, ANY),
+                },
+                *([{"name": "lr_scheduler_step", "args": ANY}] if i == (trainer.num_training_batches - 1) else []),
+                {"name": "Callback.on_train_batch_end", "args": (trainer, model, {"loss": ANY}, ANY, i)},
+                {"name": "on_train_batch_end", "args": ({"loss": ANY}, ANY, i)},
+            ])
         return out
 
     @staticmethod
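Taken together, the series makes Lightning's automatic optimization follow the same pattern a hand-written AMP loop would use: step the LR scheduler only for iterations in which the scaler actually ran `optimizer.step()`. For reference, a minimal non-Lightning loop with the equivalent behaviour (illustrative only; assumes a CUDA device, model and schedule are arbitrary):

    import torch

    model = torch.nn.Linear(8, 1).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
    scaler = torch.cuda.amp.GradScaler()

    for batch in range(100):
        optimizer.zero_grad()
        with torch.autocast("cuda"):
            loss = model(torch.randn(4, 8, device="cuda")).sum()
        scaler.scale(loss).backward()
        previous_scale = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        if scaler.get_scale() >= previous_scale:  # the optimizer step was not skipped
            scheduler.step()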