Skip to content

Commit cbb9fb5

Browse files
committed
Fix validation loop handling on restart
1 parent 0750b2e commit cbb9fb5

File tree

4 files changed

+102
-3
lines changed

4 files changed

+102
-3
lines changed

src/lightning/pytorch/loops/evaluation_loop.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ def setup_data(self) -> None:
201201
def restarting_on_evaluation_end(self) -> bool:
    """Return whether this restart resumes from a checkpoint saved at the end of evaluation.

    The signature of such a checkpoint is that ``ready``, ``started`` and ``processed``
    all agree while ``completed`` lags by exactly one — the checkpoint was written after
    the last batch was processed but before it was marked completed.
    """
    # NOTE: every comparison reads ``self.batch_progress`` — the earlier
    # ``self.batch.progress`` spelling was a typo (AttributeError on restart).
    total = self.batch_progress.total
    return (
        self.restarting
        and total.started == total.ready
        and total.processed == total.started
        and total.completed == total.processed - 1
    )
@@ -245,6 +245,14 @@ def reset(self) -> None:
245245
data_fetcher._stop_profiler = self._on_after_fetch
246246
self._data_fetcher = data_fetcher
247247

248+
def increment_progress_to_evaluation_end(self) -> None:
    """Fast-forward the batch-progress counters to the state they would hold at the end
    of a full evaluation run, without actually running any batches.

    Called on restart when training resumes from a checkpoint taken right before a
    validation that already ran, so the counters match an uninterrupted run.
    """
    self.setup_data()
    # Nothing to advance if evaluation would be skipped entirely.
    if self.skip:
        return
    self.reset()
    # NOTE(review): ``max`` over the per-dataloader batch counts — presumably the
    # progress tracker follows the longest dataloader; confirm for multi-dataloader runs.
    max_batch = max(self.max_batches)
    self.batch_progress.increment_by(max_batch, is_last_batch=True)
255+
248256
def on_run_start(self) -> None:
249257
"""Runs the ``_on_evaluation_model_eval``, ``_on_evaluation_start`` and ``_on_evaluation_epoch_start``
250258
hooks."""

src/lightning/pytorch/loops/progress.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def reset(self) -> None:
5959
self.ready = 0
6060
self.completed = 0
6161

62+
@override
6263
def reset_on_restart(self) -> None:
6364
"""Reset the progress on restart.
6465
@@ -68,6 +69,11 @@ def reset_on_restart(self) -> None:
6869
"""
6970
self.ready = self.completed
7071

72+
@override
73+
def increment_by(self, n) -> None:
74+
self.ready += n
75+
self.completed += n
76+
7177

7278
@dataclass
7379
class _StartedTracker(_ReadyCompletedTracker):
@@ -94,6 +100,11 @@ def reset_on_restart(self) -> None:
94100
super().reset_on_restart()
95101
self.started = self.completed
96102

103+
@override
def increment_by(self, n: int) -> None:
    """Advance all inherited counters plus ``started`` by ``n``."""
    super().increment_by(n)
    self.started += n
107+
97108

98109
@dataclass
99110
class _ProcessedTracker(_StartedTracker):
@@ -121,6 +132,11 @@ def reset_on_restart(self) -> None:
121132
super().reset_on_restart()
122133
self.processed = self.completed
123134

135+
@override
def increment_by(self, n: int) -> None:
    """Advance all inherited counters plus ``processed`` by ``n``."""
    super().increment_by(n)
    self.processed += n
139+
124140

125141
@dataclass
126142
class _Progress(_BaseProgress):
@@ -175,6 +191,11 @@ def reset_on_run(self) -> None:
175191
def reset_on_restart(self) -> None:
176192
self.current.reset_on_restart()
177193

194+
@override
195+
def increment_by(self, n) -> None:
196+
self.total.increment_by(n)
197+
self.current.increment_by(n)
198+
178199
@override
179200
def load_state_dict(self, state_dict: dict) -> None:
180201
self.total.load_state_dict(state_dict["total"])
@@ -206,6 +227,10 @@ def reset_on_run(self) -> None:
206227
super().reset_on_run()
207228
self.is_last_batch = False
208229

230+
def increment_by(self, n: int, is_last_batch: bool = False) -> None:
    """Advance all counters by ``n`` and record whether the last batch was reached.

    No ``@override`` here: the signature widens the base method with ``is_last_batch``.
    """
    super().increment_by(n)
    self.is_last_batch = is_last_batch
233+
209234
@override
210235
def load_state_dict(self, state_dict: dict) -> None:
211236
super().load_state_dict(state_dict)

src/lightning/pytorch/loops/training_epoch_loop.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,9 @@ def advance(self, data_fetcher: _DataFetcher) -> None:
217217
218218
"""
219219
if self.restarting and self._should_check_val_fx(data_fetcher):
220-
# skip training and run validation in `on_advance_end`
221-
return
220+
# fast forward progress counters to end of validation
221+
self.val_loop.increment_progress_to_evaluation_end()
222+
222223
# we are going to train first so the val loop does not need to restart
223224
self.val_loop.restarting = False
224225

tests/tests_pytorch/loops/test_loops.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -730,6 +730,71 @@ def test_restart_parity(tmp_path):
730730
assert compare_state_dicts(end_of_epoch_ckpt["state_dict"], end_of_epoch_ckpt_v1["state_dict"]) == {}
731731

732732

733+
def test_restart_parity_with_val(tmp_path):
    """Resuming mid-epoch with interleaved validation must reproduce the uninterrupted run:
    same final loss, loop state, LR-scheduler state, epoch/step counters, and weights."""

    def make_trainer(checkpoint_callback):
        # Identical configuration for the initial and the resumed run: 4 train batches
        # per epoch, validation every 2 steps, checkpoint every 2 steps.
        return Trainer(
            default_root_dir=tmp_path,
            limit_train_batches=4,
            max_epochs=4,
            callbacks=[checkpoint_callback],
            logger=False,
            enable_model_summary=False,
            enable_progress_bar=False,
            limit_val_batches=4,
            val_check_interval=2,
        )

    def assert_checkpoints_match(name):
        # The resumed run saves next to the original checkpoint with a ``-v1`` suffix.
        ckpt = torch.load(str(tmp_path / f"{name}.ckpt"), weights_only=True)
        ckpt_v1 = torch.load(str(tmp_path / f"{name}-v1.ckpt"), weights_only=True)
        assert compare_state_dicts(ckpt["loops"], ckpt_v1["loops"]) == {}
        assert compare_state_dicts(ckpt["lr_schedulers"][0], ckpt_v1["lr_schedulers"][0]) == {}
        assert ckpt["epoch"] == ckpt_v1["epoch"]
        assert ckpt["global_step"] == ckpt_v1["global_step"]
        assert compare_state_dicts(ckpt["state_dict"], ckpt_v1["state_dict"]) == {}

    model = PredictableBoringModel()
    # One callback instance shared by both runs, as in the original test.
    checkpoint_callback = ModelCheckpoint(
        dirpath=tmp_path,
        every_n_train_steps=2,
        save_top_k=-1,
    )

    make_trainer(checkpoint_callback).fit(model)
    loss = make_trainer  # placeholder removed below
    loss = model.last_loss

    make_trainer(checkpoint_callback).fit(model, ckpt_path=str(tmp_path / "epoch=0-step=2.ckpt"))
    loss_v1 = model.last_loss

    assert abs(loss - loss_v1) < 1e-8

    # End of epoch 0, middle of epoch 1, end of epoch 1.
    assert_checkpoints_match("epoch=0-step=4")
    assert_checkpoints_match("epoch=1-step=6")
    assert_checkpoints_match("epoch=1-step=8")
796+
797+
733798
@pytest.mark.parametrize(
734799
("train_datasets", "val_datasets"),
735800
[([RandomDataset], [RandomDataset]), ([RandomDataset], [RandomDataset, RandomDataset])],

0 commit comments

Comments
 (0)