From 9d92a4a28e2c6cd208f832efd0a015d7c8cd58ff Mon Sep 17 00:00:00 2001
From: milesial
Date: Tue, 3 Jan 2023 15:48:31 +0100
Subject: [PATCH 01/10] Fix LR scheduler behaviour with AMP

---
 src/pytorch_lightning/core/module.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py
index 878313b675bdc..9c5875d931939 100644
--- a/src/pytorch_lightning/core/module.py
+++ b/src/pytorch_lightning/core/module.py
@@ -1649,6 +1649,10 @@ def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
             scheduler.step(epoch=self.current_epoch)
 
         """
+        optimizer = self.trainer.optimizers[optimizer_idx]
+        if hasattr(optimizer, '_step_count') and optimizer._step_count <= 0:
+            return
+
         if metric is None:
             scheduler.step()  # type: ignore[call-arg]
         else:

From 2c2b138342360e313d6c0ae2f856a29ca5f1e22a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 3 Jan 2023 14:55:56 +0000
Subject: [PATCH 02/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/pytorch_lightning/core/module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py
index 9c5875d931939..417332f317b53 100644
--- a/src/pytorch_lightning/core/module.py
+++ b/src/pytorch_lightning/core/module.py
@@ -1650,7 +1650,7 @@ def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
 
         """
         optimizer = self.trainer.optimizers[optimizer_idx]
-        if hasattr(optimizer, '_step_count') and optimizer._step_count <= 0:
+        if hasattr(optimizer, "_step_count") and optimizer._step_count <= 0:
             return
 
         if metric is None:

From 1b3365ead62379dce92a1d7b0280c610d26afdc1 Mon Sep 17 00:00:00 2001
From: milesial
Date: Mon, 9 Jan 2023 00:38:58 -0800
Subject: [PATCH 03/10] Fix LR schedulers when optimizers with frequencies

---
 src/pytorch_lightning/loops/epoch/training_epoch_loop.py    | 2 +-
 tests/tests_pytorch/trainer/optimization/test_optimizers.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
index 5bdec1b55226b..b44ed64d659f8 100644
--- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
+++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
@@ -390,7 +390,7 @@ def update_lr_schedulers(self, interval: str, update_plateau_schedulers: bool) -
         if interval == "step" and self._should_accumulate():
             return
         active_optimizers = _get_active_optimizers(
-            self.trainer.optimizers, self.trainer.optimizer_frequencies, self.total_batch_idx
+            self.trainer.optimizers, self.trainer.optimizer_frequencies, self.batch_idx
         )
         self._update_learning_rates(
             interval=interval,
diff --git a/tests/tests_pytorch/trainer/optimization/test_optimizers.py b/tests/tests_pytorch/trainer/optimization/test_optimizers.py
index ed821b0d6ff4c..155cc175bcf80 100644
--- a/tests/tests_pytorch/trainer/optimization/test_optimizers.py
+++ b/tests/tests_pytorch/trainer/optimization/test_optimizers.py
@@ -286,7 +286,7 @@ def configure_optimizers(self):
             (dict(step_size=5), dict(T_max=2)),
             ("epoch", "epoch"),
             (5, 10),
-            (2, 3),
+            (4, 1),
             3,
         ),
     ],
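Context for patches 01-03: with native AMP, `torch.cuda.amp.GradScaler.step(optimizer)` silently skips the wrapped `optimizer.step()` whenever it finds inf/NaN gradients (which is expected while the loss scale calibrates), so an unconditional `scheduler.step()` afterwards both triggers PyTorch's "Detected call of `lr_scheduler.step()` before `optimizer.step()`" UserWarning and lets the schedule run ahead of the optimizer. The guard above relies on the optimizer's private `_step_count` attribute, which PyTorch's LR schedulers attach to the optimizer and which only increments when `optimizer.step()` actually runs. The following is a minimal, illustrative sketch of the problem and of an equivalent guard in plain PyTorch (not Lightning code; assumes a CUDA device, model and sizes are arbitrary):

    import torch

    model = torch.nn.Linear(8, 1).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)  # also sets optimizer._step_count = 0
    scaler = torch.cuda.amp.GradScaler()

    for batch in range(20):
        optimizer.zero_grad()
        with torch.autocast("cuda"):
            loss = model(torch.randn(4, 8, device="cuda")).sum()
        scaler.scale(loss).backward()
        scaler.step(optimizer)  # skips optimizer.step() when non-finite gradients are found
        scaler.update()
        # Guard equivalent to patches 01-03: only step the scheduler once the optimizer
        # has really stepped at least once, otherwise PyTorch warns and the schedule drifts.
        if getattr(optimizer, "_step_count", 1) > 0:
            scheduler.step()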
From a237283d446ccb1ba81f036416eaca6dbe361695 Mon Sep 17 00:00:00 2001
From: milesial
Date: Wed, 4 Jan 2023 11:49:55 +0100
Subject: [PATCH 04/10] Move implementation to scales comparison

---
 src/lightning_fabric/plugins/precision/native_amp.py     | 2 ++
 src/pytorch_lightning/core/module.py                     | 4 ----
 src/pytorch_lightning/loops/epoch/training_epoch_loop.py | 5 +++--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/lightning_fabric/plugins/precision/native_amp.py b/src/lightning_fabric/plugins/precision/native_amp.py
index e86a61c490a24..372afdcc395be 100644
--- a/src/lightning_fabric/plugins/precision/native_amp.py
+++ b/src/lightning_fabric/plugins/precision/native_amp.py
@@ -73,8 +73,10 @@ def optimizer_step(
             return super().optimizer_step(optimizer, **kwargs)
         if isinstance(optimizer, LBFGS):
             raise TypeError("Native AMP and the LBFGS optimizer are not compatible.")
+        previous_scale = self.scaler.get_scale()
         # note: the scaler will skip the `optimizer.step` if nonfinite gradients are found
         step_output = self.scaler.step(optimizer, **kwargs)
+        model._skip_next_scheduler_step = self.scaler.get_scale() < previous_scale
         self.scaler.update()
         return step_output
 
diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py
index 417332f317b53..878313b675bdc 100644
--- a/src/pytorch_lightning/core/module.py
+++ b/src/pytorch_lightning/core/module.py
@@ -1649,10 +1649,6 @@ def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
             scheduler.step(epoch=self.current_epoch)
 
         """
-        optimizer = self.trainer.optimizers[optimizer_idx]
-        if hasattr(optimizer, "_step_count") and optimizer._step_count <= 0:
-            return
-
         if metric is None:
             scheduler.step()  # type: ignore[call-arg]
         else:
diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
index b44ed64d659f8..b8509e082dafb 100644
--- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
+++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
@@ -216,8 +216,9 @@ def advance(self, data_fetcher: AbstractDataFetcher) -> None:
 
         # update non-plateau LR schedulers
         # update epoch-interval ones only when we are at the end of training epoch
-        self.update_lr_schedulers("step", update_plateau_schedulers=False)
-        if self._num_ready_batches_reached():
+        if not getattr(self.trainer.lightning_module, "_skip_next_scheduler_step", False):
+            self.update_lr_schedulers("step", update_plateau_schedulers=False)
+        elif self._num_ready_batches_reached():
             self.update_lr_schedulers("epoch", update_plateau_schedulers=False)
 
         batch_end_outputs = self._prepare_outputs_training_batch_end(

From 183a6a6318d159a508d07c81bb7b1b42a5f0d1bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Tue, 10 Jan 2023 17:13:11 +0100
Subject: [PATCH 05/10] Catch warnings

---
 setup.cfg | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.cfg b/setup.cfg
index f2609c80d0803..90220afd0741c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,6 +34,7 @@ markers =
     cloud:Run the cloud tests for example
 filterwarnings =
     error::FutureWarning
+    error:Detected call of `lr_scheduler.step\(\)` before `optimizer.step\(\)`:UserWarning
 xfail_strict = true
 junit_duration_report = call
 
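Patch 04 above switches the detection from `_step_count` to a scale comparison: `GradScaler` lowers its scale by `backoff_factor` whenever it had to skip `optimizer.step()` because of non-finite gradients, so the scale decreasing across a step is a reliable "this step was skipped" signal, and patch 05 turns the corresponding PyTorch UserWarning into a test failure so regressions are caught. Note that the scale is only adjusted inside `scaler.update()`, which is why patch 06 below moves the comparison to after the `update()` call. A standalone sketch of the heuristic in plain PyTorch (illustrative names, not the Lightning implementation):

    import torch

    def scaler_step_was_skipped(scaler: torch.cuda.amp.GradScaler, optimizer: torch.optim.Optimizer) -> bool:
        """Run scaler.step()/update() and report whether the optimizer step was skipped."""
        previous_scale = scaler.get_scale()
        scaler.step(optimizer)  # internally skips optimizer.step() on inf/NaN gradients
        scaler.update()         # on a skipped step, the scale is reduced by backoff_factor here
        return scaler.get_scale() < previous_scale

Lightning stores the result of this comparison as a `_skip_next_scheduler_step` flag (first on the LightningModule, and on the optimizer object from patch 06 onwards) and the training loop consults it before stepping step-interval schedulers.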
From 0406a5561979f4cff723321592aeccce197084ec Mon Sep 17 00:00:00 2001
From: milesial
Date: Tue, 10 Jan 2023 20:05:28 +0100
Subject: [PATCH 06/10] Fix implementation

---
 src/lightning_fabric/plugins/precision/native_amp.py     | 2 +-
 src/pytorch_lightning/loops/epoch/training_epoch_loop.py | 8 +++++---
 src/pytorch_lightning/plugins/precision/native_amp.py    | 2 ++
 tests/tests_fabric/plugins/precision/test_native_amp.py  | 1 +
 tests/tests_pytorch/models/test_hooks.py                 | 3 ++-
 5 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/lightning_fabric/plugins/precision/native_amp.py b/src/lightning_fabric/plugins/precision/native_amp.py
index 372afdcc395be..c165e89ae4ee9 100644
--- a/src/lightning_fabric/plugins/precision/native_amp.py
+++ b/src/lightning_fabric/plugins/precision/native_amp.py
@@ -76,8 +76,8 @@ def optimizer_step(
         previous_scale = self.scaler.get_scale()
         # note: the scaler will skip the `optimizer.step` if nonfinite gradients are found
         step_output = self.scaler.step(optimizer, **kwargs)
-        model._skip_next_scheduler_step = self.scaler.get_scale() < previous_scale
         self.scaler.update()
+        optimizer._skip_next_scheduler_step = self.scaler.get_scale() < previous_scale
         return step_output
 
     def state_dict(self) -> Dict[str, Any]:
diff --git a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
index b8509e082dafb..0bd0cd138d802 100644
--- a/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
+++ b/src/pytorch_lightning/loops/epoch/training_epoch_loop.py
@@ -216,9 +216,8 @@ def advance(self, data_fetcher: AbstractDataFetcher) -> None:
 
         # update non-plateau LR schedulers
        # update epoch-interval ones only when we are at the end of training epoch
-        if not getattr(self.trainer.lightning_module, "_skip_next_scheduler_step", False):
-            self.update_lr_schedulers("step", update_plateau_schedulers=False)
-        elif self._num_ready_batches_reached():
+        self.update_lr_schedulers("step", update_plateau_schedulers=False)
+        if self._num_ready_batches_reached():
             self.update_lr_schedulers("epoch", update_plateau_schedulers=False)
 
         batch_end_outputs = self._prepare_outputs_training_batch_end(
@@ -451,6 +450,9 @@ def _update_learning_rates(
                     )
                     continue
 
+                if getattr(self.trainer.optimizers[config.opt_idx], "_skip_next_scheduler_step", False):
+                    continue
+
                 self.scheduler_progress.increment_ready()
 
                 # update LR
diff --git a/src/pytorch_lightning/plugins/precision/native_amp.py b/src/pytorch_lightning/plugins/precision/native_amp.py
index 9d1393871fc1d..a02412bd2d108 100644
--- a/src/pytorch_lightning/plugins/precision/native_amp.py
+++ b/src/pytorch_lightning/plugins/precision/native_amp.py
@@ -85,8 +85,10 @@ def optimizer_step(  # type: ignore[override]
         # in manual optimization, the closure does not return a value
         if not model.automatic_optimization or not skipped_backward:
             # note: the scaler will skip the `optimizer.step` if nonfinite gradients are found
+            previous_scale = self.scaler.get_scale()
             step_output = self.scaler.step(optimizer, **kwargs)
             self.scaler.update()
+            optimizer._skip_next_scheduler_step = self.scaler.get_scale() < previous_scale
             return step_output
         return closure_result
 
diff --git a/tests/tests_fabric/plugins/precision/test_native_amp.py b/tests/tests_fabric/plugins/precision/test_native_amp.py
index 62f983ab78762..d8ecc02fe06e5 100644
--- a/tests/tests_fabric/plugins/precision/test_native_amp.py
+++ b/tests/tests_fabric/plugins/precision/test_native_amp.py
@@ -69,6 +69,7 @@ def test_native_amp_precision_backward():
 def test_native_amp_precision_optimizer_step_with_scaler():
     precision = MixedPrecision(precision="mixed", device="cuda")
     precision.scaler = Mock()
+    precision.scaler.get_scale = Mock(return_value=1.0)
     optimizer = Mock()
     precision.optimizer_step(optimizer, keyword="arg")
 
diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py
index 3702c8a922546..4293ef8d72dce 100644
--- a/tests/tests_pytorch/models/test_hooks.py
+++ b/tests/tests_pytorch/models/test_hooks.py
@@ -304,6 +304,7 @@ def _auto_train_batch(
         trainer, model, batches, device=torch.device("cpu"), current_epoch=0, current_batch=0, **kwargs
     ):
         using_deepspeed = kwargs.get("strategy") == "deepspeed"
+        using_native_amp = trainer.precision == 16 and trainer.amp_backend == 'native'
         out = []
         for i in range(current_batch, batches):
             out.extend(
@@ -347,7 +348,7 @@ def _auto_train_batch(
                         kwargs=dict(on_tpu=False, using_lbfgs=False),
                     ),
                     *(
-                        [dict(name="lr_scheduler_step", args=(ANY, 0, None))]
+                        [dict(name="lr_scheduler_step", args=(ANY, 0, None)) if not using_native_amp else ANY]
                         if i == (trainer.num_training_batches - 1)
                         else []
                     ),
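One side effect of the new comparison shows up in the fabric unit test touched above: the mocked scaler's `get_scale()` must now return a real number, because the plugin compares two consecutive `get_scale()` results with `<`, and plain `Mock` objects do not support ordering. Roughly (a sketch using `unittest.mock`, not the test file itself):

    from unittest.mock import Mock

    scaler = Mock()
    # scaler.get_scale() < scaler.get_scale() would raise:
    # TypeError: '<' not supported between instances of 'Mock' and 'Mock'
    scaler.get_scale = Mock(return_value=1.0)
    assert not (scaler.get_scale() < scaler.get_scale())  # 1.0 < 1.0 -> False, i.e. "not skipped"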
From c44a892f4226933c73ff94fce72d8e3d5aa207e0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 10 Jan 2023 20:05:04 +0000
Subject: [PATCH 07/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/tests_pytorch/models/test_hooks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py
index 4293ef8d72dce..957a87e1a4e13 100644
--- a/tests/tests_pytorch/models/test_hooks.py
+++ b/tests/tests_pytorch/models/test_hooks.py
@@ -304,7 +304,7 @@ def _auto_train_batch(
         trainer, model, batches, device=torch.device("cpu"), current_epoch=0, current_batch=0, **kwargs
     ):
         using_deepspeed = kwargs.get("strategy") == "deepspeed"
-        using_native_amp = trainer.precision == 16 and trainer.amp_backend == 'native'
+        using_native_amp = trainer.precision == 16 and trainer.amp_backend == "native"
         out = []
         for i in range(current_batch, batches):
             out.extend(

From 11e4f721426357c88841b86e270c88530cdab051 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 3 Feb 2023 01:56:51 +0000
Subject: [PATCH 08/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/tests_pytorch/models/test_hooks.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py
index 12538872ca2b7..66a32218e6f51 100644
--- a/tests/tests_pytorch/models/test_hooks.py
+++ b/tests/tests_pytorch/models/test_hooks.py
@@ -345,11 +345,7 @@ def _auto_train_batch(
                         name="optimizer_step",
                         args=(current_epoch, i, ANY, ANY),
                     ),
-                    *(
-                        [dict(name="lr_scheduler_step", args=ANY)]
-                        if i == (trainer.num_training_batches - 1)
-                        else []
-                    ),
+                    *([dict(name="lr_scheduler_step", args=ANY)] if i == (trainer.num_training_batches - 1) else []),
                     dict(name="Callback.on_train_batch_end", args=(trainer, model, dict(loss=ANY), ANY, i)),
                     dict(name="on_train_batch_end", args=(dict(loss=ANY), ANY, i)),
                 ]
From b99707525796dd1472f5779577482e9cc507d002 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 26 Apr 2023 19:46:59 +0000
Subject: [PATCH 09/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/tests_pytorch/models/test_hooks.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py
index f5aac0030e79c..e8257b63003c6 100644
--- a/tests/tests_pytorch/models/test_hooks.py
+++ b/tests/tests_pytorch/models/test_hooks.py
@@ -296,13 +296,13 @@ def _auto_train_batch(
                     },
                     # this is after because it refers to the `LightningModule.optimizer_step` hook which encapsulates
                    # the actual call to `PrecisionPlugin.optimizer_step`
-                    dict(
-                        name="optimizer_step",
-                        args=(current_epoch, i, ANY, ANY),
-                    ),
-                    *([dict(name="lr_scheduler_step", args=ANY)] if i == (trainer.num_training_batches - 1) else []),
-                    dict(name="Callback.on_train_batch_end", args=(trainer, model, dict(loss=ANY), ANY, i)),
-                    dict(name="on_train_batch_end", args=(dict(loss=ANY), ANY, i)),
+                    {
+                        "name": "optimizer_step",
+                        "args": (current_epoch, i, ANY, ANY),
+                    },
+                    *([{"name": "lr_scheduler_step", "args": ANY}] if i == (trainer.num_training_batches - 1) else []),
+                    {"name": "Callback.on_train_batch_end", "args": (trainer, model, {"loss": ANY}, ANY, i)},
+                    {"name": "on_train_batch_end", "args": ({"loss": ANY}, ANY, i)},
                 ]
             )
         return out
From 965fc03b4cff39028e90634b2ddcb2e483f6c3c1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 16 Feb 2024 17:26:55 +0000
Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/tests_pytorch/models/test_hooks.py | 82 ++++++++++++------------
 1 file changed, 40 insertions(+), 42 deletions(-)

diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py
index a022154800b20..596d08641da1b 100644
--- a/tests/tests_pytorch/models/test_hooks.py
+++ b/tests/tests_pytorch/models/test_hooks.py
@@ -266,48 +266,46 @@ def _auto_train_batch(trainer, model, batches, device, current_epoch=0, current_
         using_deepspeed = kwargs.get("strategy") == "deepspeed"
         out = []
         for i in range(current_batch, batches):
-            out.extend(
-                [
-                    {"name": "on_before_batch_transfer", "args": (ANY, 0)},
-                    {"name": "transfer_batch_to_device", "args": (ANY, device, 0)},
-                    {"name": "on_after_batch_transfer", "args": (ANY, 0)},
-                    {"name": "Callback.on_train_batch_start", "args": (trainer, model, ANY, i)},
-                    {"name": "on_train_batch_start", "args": (ANY, i)},
-                    {"name": "forward", "args": (ANY,)},
-                    {"name": "training_step", "args": (ANY, i)},
-                    {"name": "Callback.on_before_zero_grad", "args": (trainer, model, ANY)},
-                    {"name": "on_before_zero_grad", "args": (ANY,)},
-                    {"name": "optimizer_zero_grad", "args": (current_epoch, i, ANY)},
-                    {"name": "Callback.on_before_backward", "args": (trainer, model, ANY)},
-                    {"name": "on_before_backward", "args": (ANY,)},
-                    # DeepSpeed handles backward internally
-                    *([{"name": "backward", "args": (ANY,)}] if not using_deepspeed else []),
-                    {"name": "Callback.on_after_backward", "args": (trainer, model)},
-                    {"name": "on_after_backward"},
-                    # note: unscaling happens here in the case of AMP
-                    {"name": "Callback.on_before_optimizer_step", "args": (trainer, model, ANY)},
-                    {"name": "on_before_optimizer_step", "args": (ANY,)},
-                    {
-                        "name": "clip_gradients",
-                        "args": (ANY,),
-                        "kwargs": {"gradient_clip_val": None, "gradient_clip_algorithm": None},
-                    },
-                    {
-                        "name": "configure_gradient_clipping",
-                        "args": (ANY,),
-                        "kwargs": {"gradient_clip_val": None, "gradient_clip_algorithm": None},
-                    },
-                    # this is after because it refers to the `LightningModule.optimizer_step` hook which encapsulates
-                    # the actual call to `Precision.optimizer_step`
-                    {
-                        "name": "optimizer_step",
-                        "args": (current_epoch, i, ANY, ANY),
-                    },
-                    *([{"name": "lr_scheduler_step", "args": ANY}] if i == (trainer.num_training_batches - 1) else []),
-                    {"name": "Callback.on_train_batch_end", "args": (trainer, model, {"loss": ANY}, ANY, i)},
-                    {"name": "on_train_batch_end", "args": ({"loss": ANY}, ANY, i)},
-                ]
-            )
+            out.extend([
+                {"name": "on_before_batch_transfer", "args": (ANY, 0)},
+                {"name": "transfer_batch_to_device", "args": (ANY, device, 0)},
+                {"name": "on_after_batch_transfer", "args": (ANY, 0)},
+                {"name": "Callback.on_train_batch_start", "args": (trainer, model, ANY, i)},
+                {"name": "on_train_batch_start", "args": (ANY, i)},
+                {"name": "forward", "args": (ANY,)},
+                {"name": "training_step", "args": (ANY, i)},
+                {"name": "Callback.on_before_zero_grad", "args": (trainer, model, ANY)},
+                {"name": "on_before_zero_grad", "args": (ANY,)},
+                {"name": "optimizer_zero_grad", "args": (current_epoch, i, ANY)},
+                {"name": "Callback.on_before_backward", "args": (trainer, model, ANY)},
+                {"name": "on_before_backward", "args": (ANY,)},
+                # DeepSpeed handles backward internally
+                *([{"name": "backward", "args": (ANY,)}] if not using_deepspeed else []),
+                {"name": "Callback.on_after_backward", "args": (trainer, model)},
+                {"name": "on_after_backward"},
+                # note: unscaling happens here in the case of AMP
+                {"name": "Callback.on_before_optimizer_step", "args": (trainer, model, ANY)},
+                {"name": "on_before_optimizer_step", "args": (ANY,)},
+                {
+                    "name": "clip_gradients",
+                    "args": (ANY,),
+                    "kwargs": {"gradient_clip_val": None, "gradient_clip_algorithm": None},
+                },
+                {
+                    "name": "configure_gradient_clipping",
+                    "args": (ANY,),
+                    "kwargs": {"gradient_clip_val": None, "gradient_clip_algorithm": None},
+                },
+                # this is after because it refers to the `LightningModule.optimizer_step` hook which encapsulates
+                # the actual call to `Precision.optimizer_step`
+                {
+                    "name": "optimizer_step",
+                    "args": (current_epoch, i, ANY, ANY),
+                },
+                *([{"name": "lr_scheduler_step", "args": ANY}] if i == (trainer.num_training_batches - 1) else []),
+                {"name": "Callback.on_train_batch_end", "args": (trainer, model, {"loss": ANY}, ANY, i)},
+                {"name": "on_train_batch_end", "args": ({"loss": ANY}, ANY, i)},
+            ])
         return out
 
     @staticmethod
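Taken together, the series makes Lightning's automatic optimization follow the same pattern a hand-written AMP loop would use: step the LR scheduler only for iterations in which the scaler actually ran `optimizer.step()`. For reference, a minimal non-Lightning loop with the equivalent behaviour (illustrative only; assumes a CUDA device, model and schedule are arbitrary):

    import torch

    model = torch.nn.Linear(8, 1).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
    scaler = torch.cuda.amp.GradScaler()

    for batch in range(100):
        optimizer.zero_grad()
        with torch.autocast("cuda"):
            loss = model(torch.randn(4, 8, device="cuda")).sum()
        scaler.scale(loss).backward()
        previous_scale = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        if scaler.get_scale() >= previous_scale:  # the optimizer step was not skipped
            scheduler.step()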