Avoid skipping to val end if saved mid validation

lantiga · lantiga · commit e8bd2d79a02b · 2024-11-06T13:36:25.000Z
diff --git a/src/lightning/pytorch/loops/evaluation_loop.py b/src/lightning/pytorch/loops/evaluation_loop.py
@@ -197,6 +197,15 @@ def setup_data(self) -> None:
         # this depends on the data used, so reset it too
         self._seen_batches_per_dataloader = defaultdict(int)
 
+    @property
+    def restarting_mid_evaluation(self) -> bool:
+        return (
+            self.restarting
+            and self.batch_progress.total.started == self.batch_progress.total.ready
+            and self.batch_progress.total.processed == self.batch_progress.total.started - 1
+            and self.batch_progress.total.completed == self.batch_progress.total.processed
+        )
+
     @property
     def restarting_on_evaluation_end(self) -> bool:
         return (
diff --git a/src/lightning/pytorch/loops/training_epoch_loop.py b/src/lightning/pytorch/loops/training_epoch_loop.py
@@ -217,6 +217,8 @@ def advance(self, data_fetcher: _DataFetcher) -> None:
 
         """
         if self.restarting and self._should_check_val_fx(data_fetcher):
+            if self.val_loop.restarting_mid_evaluation:
+                return
             # fast forward progress counters to end of validation
             self.val_loop.increment_progress_to_evaluation_end()
 
diff --git a/tests/tests_pytorch/loops/test_loops.py b/tests/tests_pytorch/loops/test_loops.py
@@ -397,12 +397,13 @@ def training_step(self, batch, batch_idx):
     assert state_dict == checkpoint["loops"]["fit_loop"]
 
     trainer.fit_loop.load_state_dict(checkpoint["loops"]["fit_loop"])
-    # test resetting manually, we expect all `ready` counters to be reset to `completed`
+    # test resetting manually: we expect the batch `ready` counter to be reset to `completed`,
+    # but the epoch `ready` counter not to be reset, since we are still mid-epoch
     trainer.fit_loop.reset()
     trainer.fit_loop.epoch_loop.reset()
 
     epoch_progress = trainer.fit_loop.epoch_progress
-    assert epoch_progress.current.ready == stop_epoch
+    assert epoch_progress.current.ready == stop_epoch + 1
     assert epoch_progress.current.completed == stop_epoch
 
     batch_progress = trainer.fit_loop.epoch_loop.batch_progress
@@ -418,7 +419,7 @@ def training_step(self, batch, batch_idx):
     state_dict = trainer.fit_loop.state_dict()
     assert state_dict != checkpoint["loops"]["fit_loop"]
     assert state_dict["epoch_progress"]["total"]["started"] == stop_epoch + 1
-    assert state_dict["epoch_progress"]["current"]["started"] == stop_epoch
+    assert state_dict["epoch_progress"]["current"]["started"] == stop_epoch + 1
 
 
 def test_loop_state_on_complete_run(tmp_path):