src/lightning/pytorch/loops/fit_loop.py (13 changes: 9 additions & 4 deletions)
@@ -481,10 +481,15 @@ def on_advance_end(self) -> None:

         trainer._logger_connector.on_epoch_end()

-        if not self.restarting and self.epoch_loop._num_ready_batches_reached():
-            # since metric-based schedulers require access to metrics and those are not currently saved in the
-            # checkpoint, the plateau schedulers shouldn't be updated
-            self.epoch_loop.update_lr_schedulers("epoch", update_plateau_schedulers=not self.restarting)
+        # since metric-based schedulers require access to metrics and those are not currently saved in the
+        # checkpoint, the plateau schedulers shouldn't be updated when restarting
+        # only update plateau schedulers if validation ran this epoch to ensure monitored metrics are available
+        if (
+            not self.restarting
+            and self.epoch_loop._num_ready_batches_reached()
+            and self.epoch_loop._should_check_val_epoch()
+        ):
+            self.epoch_loop.update_lr_schedulers("epoch", update_plateau_schedulers=True)

         # we manually decrease here because loggers expect that the same step is used when logging epoch-end metrics
         # even when the batch loop has finished
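For context, here is a minimal sketch of the gate this hunk introduces. `should_step_plateau_scheduler` is a hypothetical helper written for illustration, and it assumes `_should_check_val_epoch` reduces to the usual `check_val_every_n_epoch` modular arithmetic; the real loop internals may differ:

```python
# Sketch only: illustrates the gating condition from the hunk above,
# not Lightning's actual internals.
def should_step_plateau_scheduler(
    restarting: bool,
    num_ready_batches_reached: bool,
    current_epoch: int,
    check_val_every_n_epoch: int,
) -> bool:
    # Assumption: _should_check_val_epoch() boils down to this check.
    val_ran_this_epoch = (current_epoch + 1) % check_val_every_n_epoch == 0
    return not restarting and num_ready_batches_reached and val_ran_this_epoch


# With check_val_every_n_epoch=2 and max_epochs=3 (as in the test below),
# only the second epoch (index 1) steps the plateau scheduler:
assert [should_step_plateau_scheduler(False, True, e, 2) for e in range(3)] == [False, True, False]
```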
tests/tests_pytorch/trainer/optimization/test_optimizers.py (39 changes: 39 additions & 0 deletions)
@@ -165,6 +165,45 @@ def configure_optimizers(self):
     )


+def test_reducelronplateau_with_check_val_every_n_epoch(tmp_path):
+    """Test that ReduceLROnPlateau works correctly when validation runs every N epochs."""
+
+    class TestModel(BoringModel):
+        def training_step(self, batch, batch_idx):
+            loss = super().training_step(batch, batch_idx)["loss"]
+            self.log("train/loss", loss)
+            return loss
+
+        def validation_step(self, batch, batch_idx):
+            loss = super().validation_step(batch, batch_idx)["x"]
+            self.log("val/loss", loss)
+            return loss
+
+        def configure_optimizers(self):
+            optimizer = optim.Adam(self.parameters())
+            return {
+                "optimizer": optimizer,
+                "lr_scheduler": {
+                    "scheduler": optim.lr_scheduler.ReduceLROnPlateau(optimizer),
+                    "monitor": "val/loss",
+                },
+            }
+
+    model = TestModel()
+    # Validation runs every 2 epochs, so the scheduler should only step on validation epochs
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        max_epochs=3,
+        limit_train_batches=2,
+        limit_val_batches=2,
+        check_val_every_n_epoch=2,
+        enable_progress_bar=False,
+        enable_model_summary=False,
+    )
+    # This should not raise an error about the missing val/loss metric
+    trainer.fit(model)
+
+
 def test_optimizer_return_options(tmp_path):
     trainer = Trainer(default_root_dir=tmp_path)
     model = BoringModel()
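Why the gate matters: when a plateau scheduler steps, Lightning looks up the monitored metric and fails if it is absent. A rough, self-contained sketch of that failure mode follows; `step_plateau_scheduler` and the `MisconfigurationException` stand-in are illustrative simplifications, not the library's actual code:

```python
from typing import Mapping, Optional


class MisconfigurationException(Exception):
    """Stand-in for Lightning's exception type (sketch only)."""


def step_plateau_scheduler(scheduler, monitor: str, callback_metrics: Mapping[str, float]) -> None:
    # If validation did not run this epoch, the monitored value was never
    # logged, so the scheduler step has nothing to condition on.
    value: Optional[float] = callback_metrics.get(monitor)
    if value is None:
        raise MisconfigurationException(
            f"ReduceLROnPlateau conditioned on metric {monitor!r} which is not available"
        )
    scheduler.step(value)
```

Under this reading, before the fix the non-validation epochs of the test above (epochs 1 and 3 with `check_val_every_n_epoch=2`) would hit a branch like this and raise, since `val/loss` had not been logged that epoch.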