Commit e64c200

Fix edge cases and start from last with and without val
1 parent 0012dcb commit e64c200

File tree

4 files changed: +181 −21 lines changed


src/lightning/pytorch/loops/evaluation_loop.py

Lines changed: 0 additions & 9 deletions

@@ -206,15 +206,6 @@ def restarting_mid_evaluation(self) -> bool:
             and self.batch_progress.total.completed == self.batch_progress.total.processed
         )

-    @property
-    def restarting_on_evaluation_end(self) -> bool:
-        return (
-            self.restarting
-            and self.batch_progress.total.started == self.batch_progress.total.ready
-            and self.batch_progress.total.processed == self.batch_progress.total.started
-            and self.batch_progress.total.completed == self.batch_progress.total.processed - 1
-        )
-
     def reset(self) -> None:
         """Resets the internal state of the loop."""
         trainer = self.trainer
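
Note: the removed restarting_on_evaluation_end helper, like the restarting_mid_evaluation check kept above, works by comparing the four batch-progress counters to infer where the checkpoint was written. A minimal, self-contained sketch of that idea, using a toy dataclass rather than Lightning's internal progress classes:

# Toy stand-in for Lightning's progress counters (illustrative only).
from dataclasses import dataclass


@dataclass
class ToyProgress:
    ready: int = 0      # batch fetched from the dataloader
    started: int = 0    # batch handed to the step
    processed: int = 0  # step finished
    completed: int = 0  # batch-end hooks finished


def saved_on_batch_end(p: ToyProgress) -> bool:
    # Same shape of check as the removed `restarting_on_evaluation_end`:
    # everything ran for the last batch except the final `completed` bump,
    # so the checkpoint must have been written in the batch-end hook.
    return (
        p.started == p.ready
        and p.processed == p.started
        and p.completed == p.processed - 1
    )


# Counters as they would look right after loading such a checkpoint.
assert saved_on_batch_end(ToyProgress(ready=2, started=2, processed=2, completed=1))
assert not saved_on_batch_end(ToyProgress(ready=2, started=2, processed=2, completed=2))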

src/lightning/pytorch/loops/fit_loop.py

Lines changed: 29 additions & 0 deletions

@@ -329,14 +329,43 @@ def restarting_on_epoch_end(self) -> bool:
             and self.epoch_progress.total.completed == self.epoch_progress.total.processed - 1
         )

+    @property
+    def progress_at_epoch_end(self) -> bool:
+        # TODO LUCA comment for restart last without val
+        return (
+            self.epoch_progress.total.started == self.epoch_progress.total.ready
+            and self.epoch_progress.total.processed == self.epoch_progress.total.started
+            and self.epoch_progress.total.completed == self.epoch_progress.total.processed - 1
+        )
+
     def reset(self) -> None:
         """Resets the internal state of this loop."""
         assert self.trainer.model is not None
         torch.set_grad_enabled(True)

+        self.epoch_loop.reset_restarting_states()
+
         if self.restarting_on_epoch_start:
             self.epoch_progress.reset_on_restart()

+        if self.progress_at_epoch_end:
+            self.epoch_progress.increment_completed()
+
+        # TODO LUCA: refactor restarting for fit_loop
+        restarting_mid_epoch = self.restarting_mid_epoch
+
+        if (self.epoch_loop.restarting_on_train_batch_end
+                and self.restarting_mid_epoch
+                and self.epoch_loop.batch_progress.is_last_batch):
+            self.epoch_progress.increment_processed()
+            self.epoch_progress.increment_completed()
+
+        if (self.epoch_loop.restarting_on_train_batch_end
+                and self.epoch_loop.batch_progress.is_last_batch
+                and not restarting_mid_epoch
+                and not self.epoch_loop.val_loop.batch_progress.is_last_batch):
+            self.epoch_progress.increment_completed()
+
     def on_run_start(self) -> None:
         """Calls the ``on_train_start`` hook."""
         # update the current_epoch in-case of checkpoint reload
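
The new progress_at_epoch_end property and the extra bookkeeping in reset() cover checkpoints written at the very end of an epoch: the epoch counters come back with completed lagging processed by one, and the reset path finishes the increment so the restored run does not redo the epoch. A rough sketch of that condition with plain integers (illustrative only, not Lightning code):

# Plain-integer sketch of the epoch-end condition used above (not Lightning code).
epoch = {"ready": 1, "started": 1, "processed": 1, "completed": 0}


def progress_at_epoch_end(p: dict) -> bool:
    # The epoch was fully processed but `completed` was never bumped,
    # i.e. the checkpoint was written right at the end of the epoch.
    return (
        p["started"] == p["ready"]
        and p["processed"] == p["started"]
        and p["completed"] == p["processed"] - 1
    )


if progress_at_epoch_end(epoch):
    epoch["completed"] += 1  # mirrors `self.epoch_progress.increment_completed()`

assert epoch == {"ready": 1, "started": 1, "processed": 1, "completed": 1}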

src/lightning/pytorch/loops/training_epoch_loop.py

Lines changed: 37 additions & 9 deletions

@@ -81,6 +81,8 @@ def __init__(self, trainer: "pl.Trainer", min_steps: Optional[int] = None, max_s
         self._results = _ResultCollection(training=True)
         self._warning_cache = WarningCache()
         self._batches_that_stepped: int = 0
+        self._restarting_on_train_batch_end: bool = None
+        self._restarting_on_last: bool = None

     @property
     def total_batch_idx(self) -> int:
@@ -146,15 +148,43 @@ def run(self, data_fetcher: _DataFetcher) -> None:

     @property
     def restarting_on_train_batch_end(self) -> bool:
-        return (
-            self.restarting
-            and self.batch_progress.total.started == self.batch_progress.total.ready
-            and self.batch_progress.total.processed == self.batch_progress.total.started
-            and self.batch_progress.total.completed == self.batch_progress.total.processed - 1
-        )
+        if self._restarting_on_train_batch_end is None:
+            self._restarting_on_train_batch_end = (
+                self.restarting
+                and self.batch_progress.total.started == self.batch_progress.total.ready
+                and self.batch_progress.total.processed == self.batch_progress.total.started
+                and self.batch_progress.total.completed == self.batch_progress.total.processed - 1
+            )
+        return self._restarting_on_train_batch_end
+
+    @property
+    def restarting_on_last(self) -> bool:
+        if self._restarting_on_last is None:
+            self._restarting_on_last = (
+                self.restarting
+                and self.batch_progress.total.started == self.batch_progress.total.ready
+                and self.batch_progress.total.processed == self.batch_progress.total.started
+                and self.batch_progress.total.completed == self.batch_progress.total.processed
+            )
+        return self._restarting_on_last
+
+    def reset_restarting_states(self) -> None:
+        self._restarting_on_train_batch_end = None
+        self._restarting_on_last = None
+        self.restarting_on_train_batch_end
+        self.restarting_on_last

     def reset(self) -> None:
+        self.reset_restarting_states()
         """Resets the internal state of the loop for a new run."""
+        if self.restarting and not self._should_accumulate():
+            # batches_that_stepped is never set prior to saving a checkpoint, even when saving
+            # happens on_validation_end
+            # we could set it in the checkpoint but we prefer to keep checkpoints backward compatible
+            if self.restarting_on_train_batch_end or not self.restarting_on_last:
+                # if not self.restarting_on_train_batch_end and not self.restarting_on_last:
+                self._batches_that_stepped += 1
+
         if self.restarting_on_train_batch_end:
             self.batch_progress.increment_completed()
             # handle situation in which save happened on_train_batch_end and epoch is at end
@@ -163,8 +193,6 @@ def reset(self) -> None:
             self.scheduler_progress.reset_on_run()
             self.automatic_optimization.optim_progress.reset_on_run()
             self.val_loop.batch_progress.total.reset()
-            if not self._should_accumulate():
-                self._batches_that_stepped += 1

         if self.restarting:
             self.batch_progress.reset_on_restart()
@@ -217,7 +245,7 @@ def advance(self, data_fetcher: _DataFetcher) -> None:

         """
         if self.restarting and self._should_check_val_fx(data_fetcher):
-            if self.val_loop.restarting_mid_evaluation:
+            if self.val_loop.restarting_mid_evaluation or self.restarting_on_last:
                 return
             # fast forward progress counters to end of validation
             self.val_loop.increment_progress_to_evaluation_end()
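
restarting_on_train_batch_end and restarting_on_last are derived from counters that reset() itself mutates, which is why they are now computed once per restart and cached until reset_restarting_states() clears them. A minimal sketch of that compute-once pattern (illustrative only; the class and attribute names below are made up):

# Illustrative compute-once pattern; not Lightning code.
from typing import Optional


class RestartFlags:
    def __init__(self) -> None:
        self.processed = 1
        self.completed = 0
        self._on_batch_end: Optional[bool] = None

    @property
    def on_batch_end(self) -> bool:
        # Evaluate the counter comparison only once, then pin the result.
        if self._on_batch_end is None:
            self._on_batch_end = self.completed == self.processed - 1
        return self._on_batch_end

    def reset_flags(self) -> None:
        # Counterpart of `reset_restarting_states`: drop the cache so the
        # flag is recomputed from the (possibly updated) counters.
        self._on_batch_end = None


flags = RestartFlags()
assert flags.on_batch_end       # computed from the counters: 0 == 1 - 1
flags.completed += 1            # later counter mutation, e.g. during reset...
assert flags.on_batch_end       # ...does not flip the pinned value
flags.reset_flags()
assert not flags.on_batch_end   # recomputed only after an explicit reset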

tests/tests_pytorch/loops/test_loops.py

Lines changed: 115 additions & 3 deletions

@@ -604,16 +604,24 @@ def test_fit_loop_reset(tmp_path):

     # we load exactly what was saved - no reset yet
     fit_loop.load_state_dict(end_of_epoch_ckpt["loops"]["fit_loop"])
+
+    assert fit_loop.restarting
+    assert fit_loop.epoch_progress.total.ready == 1
+    assert fit_loop.epoch_progress.total.completed == 0
+    assert fit_loop.epoch_progress.current.ready == 1
+    assert fit_loop.epoch_progress.current.completed == 0
+
     # resetting from a end-of-epoch checkpoint SHOULD reset the current counters to 0
     fit_loop.reset()
     epoch_loop.reset()

     # resetting from a mid-of-epoch checkpoint SHOULD NOT reset the current counters to 0
+    # since we are restarting at the end of epoch, we need to see `completed` being updated after reset
     assert fit_loop.restarting
     assert fit_loop.epoch_progress.total.ready == 1
-    assert fit_loop.epoch_progress.total.completed == 0
+    assert fit_loop.epoch_progress.total.completed == 1
     assert fit_loop.epoch_progress.current.ready == 1
-    assert fit_loop.epoch_progress.current.completed == 0
+    assert fit_loop.epoch_progress.current.completed == 1

     # however it should increment completed batch progress, since it was saved immediately prior
     assert epoch_loop.restarting
@@ -704,6 +712,7 @@ def test_restart_parity(tmp_path):
         callbacks=[checkpoint_callback],
         logger=False,
         enable_model_summary=False,
+        enable_progress_bar=False,
     )
     trainer.fit(model)
     loss = model.last_loss
@@ -715,6 +724,7 @@ def test_restart_parity(tmp_path):
         callbacks=[checkpoint_callback],
         logger=False,
         enable_model_summary=False,
+        enable_progress_bar=False,
     )
     trainer.fit(model, ckpt_path=str(tmp_path / "epoch=0-step=2.ckpt"))
     loss_v1 = model.last_loss
@@ -749,7 +759,7 @@ def test_restart_parity(tmp_path):
     assert compare_state_dicts(end_of_epoch_ckpt["state_dict"], end_of_epoch_ckpt_v1["state_dict"]) == {}


-def test_restart_parity_with_val(tmp_path):
+def test_restart_with_val_parity(tmp_path):
     model = PredictableBoringModel()
     checkpoint_callback = ModelCheckpoint(
         dirpath=tmp_path,
@@ -814,6 +824,108 @@ def test_restart_parity_with_val(tmp_path):
     assert compare_state_dicts(end_of_epoch_ckpt["state_dict"], end_of_epoch_ckpt_v1["state_dict"]) == {}


+def test_restart_from_last_parity(tmp_path):
+    model = PredictableBoringModel()
+    checkpoint_callback = ModelCheckpoint(
+        dirpath=tmp_path,
+        save_last=True,
+        save_top_k=-1,
+    )
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        limit_train_batches=2,
+        max_epochs=4,
+        callbacks=[checkpoint_callback],
+        logger=False,
+        enable_model_summary=False,
+        enable_progress_bar=False,
+    )
+    trainer.fit(model)
+
+    last_ckpt_1 = torch.load(str(tmp_path / "last.ckpt"), weights_only=True)
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        limit_train_batches=2,
+        max_epochs=2,
+        callbacks=[checkpoint_callback],
+        logger=False,
+        enable_model_summary=False,
+        enable_progress_bar=False,
+    )
+    trainer.fit(model)
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        limit_train_batches=2,
+        max_epochs=4,
+        callbacks=[checkpoint_callback],
+        logger=False,
+        enable_model_summary=False,
+        enable_progress_bar=False,
+    )
+    trainer.fit(model, ckpt_path=str(tmp_path / "last.ckpt"))
+
+    last_ckpt_2 = torch.load(str(tmp_path / "last.ckpt"), weights_only=True)
+
+    assert compare_state_dicts(last_ckpt_1["loops"], last_ckpt_2["loops"]) == {}
+
+
+def test_restart_from_last_with_val_parity(tmp_path):
+    model = PredictableBoringModel()
+    checkpoint_callback = ModelCheckpoint(
+        dirpath=tmp_path,
+        save_last=True,
+        save_top_k=-1,
+    )
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        limit_train_batches=2,
+        max_epochs=4,
+        callbacks=[checkpoint_callback],
+        logger=False,
+        enable_model_summary=False,
+        enable_progress_bar=False,
+        limit_val_batches=2,
+        val_check_interval=2,
+    )
+    trainer.fit(model)
+
+    last_ckpt_1 = torch.load(str(tmp_path / "last.ckpt"), weights_only=True)
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        limit_train_batches=2,
+        max_epochs=2,
+        callbacks=[checkpoint_callback],
+        logger=False,
+        enable_model_summary=False,
+        enable_progress_bar=False,
+        limit_val_batches=2,
+        val_check_interval=2,
+    )
+    trainer.fit(model)
+
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        limit_train_batches=2,
+        max_epochs=4,
+        callbacks=[checkpoint_callback],
+        logger=False,
+        enable_model_summary=False,
+        enable_progress_bar=False,
+        limit_val_batches=2,
+        val_check_interval=2,
+    )
+    trainer.fit(model, ckpt_path=str(tmp_path / "last.ckpt"))
+
+    last_ckpt_2 = torch.load(str(tmp_path / "last.ckpt"), weights_only=True)
+
+    assert compare_state_dicts(last_ckpt_1["loops"], last_ckpt_2["loops"]) == {}
+
+
 @pytest.mark.parametrize(
     ("train_datasets", "val_datasets"),
     [([RandomDataset], [RandomDataset]), ([RandomDataset], [RandomDataset, RandomDataset])],
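
The new parity tests above rely on the compare_state_dicts helper defined elsewhere in this test module; an empty dict means the two checkpoints' "loops" sections match. Its actual implementation may differ, but a minimal sketch of such a recursive comparison could look like this (the name compare_state_dicts_sketch and its return convention are assumptions for illustration):

# Hypothetical sketch of a `compare_state_dicts`-style helper (the real one in
# the test suite may differ): returns an empty dict when two nested state dicts
# are identical, otherwise maps each differing path to the mismatching values.
import torch


def compare_state_dicts_sketch(a: dict, b: dict, path: str = "") -> dict:
    diffs = {}
    for key in sorted(set(a) | set(b), key=str):
        here = f"{path}.{key}" if path else str(key)
        if key not in a or key not in b:
            diffs[here] = "missing on one side"
        elif isinstance(a[key], dict) and isinstance(b[key], dict):
            diffs.update(compare_state_dicts_sketch(a[key], b[key], here))
        elif isinstance(a[key], torch.Tensor) and isinstance(b[key], torch.Tensor):
            if not torch.equal(a[key], b[key]):
                diffs[here] = (a[key], b[key])
        elif a[key] != b[key]:
            diffs[here] = (a[key], b[key])
    return diffs


assert compare_state_dicts_sketch({"epoch": 3, "step": 8}, {"epoch": 3, "step": 8}) == {}
assert compare_state_dicts_sketch({"epoch": 3}, {"epoch": 4}) == {"epoch": (3, 4)}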
