Fix double iteration bug when resumed from a checkpoint. (#20775)

sudiptob2 · Borda · bhimrazy · web-flow · commit 25b1343f1c11 · 2025-08-05T05:54:24.000+05:30
* Fix double iteration bug when resumed from a checkpoint.

* Apply suggestions from code review

* update wording in the comments.

Signed-off-by: sudipto baral <sudiptobaral.me@gmail.com>

* update test

Signed-off-by: sudipto baral <sudiptobaral.me@gmail.com>

* Add independent flag to track checkpoint resumption.

Signed-off-by: sudipto baral <sudiptobaral.me@gmail.com>

* lint

Signed-off-by: sudipto baral <sudiptobaral.me@gmail.com>

* update

* Update src/lightning/pytorch/loops/training_epoch_loop.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update .github/workflows/ci-tests-pytorch.yml

* update

* skip

---------

Signed-off-by: sudipto baral <sudiptobaral.me@gmail.com>
Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Co-authored-by: Bhimraj Yadav <bhimrajyadav977@gmail.com>
Co-authored-by: Deependu <deependujha21@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/src/lightning/pytorch/loops/loop.py b/src/lightning/pytorch/loops/loop.py
@@ -23,6 +23,7 @@ class _Loop:
     def __init__(self, trainer: "pl.Trainer") -> None:
         self._restarting = False
         self._loaded_from_state_dict = False
+        self._resuming_from_checkpoint = False  # surfaced read-only via the `is_resuming` property
         self.trainer = trainer
 
     @property
@@ -38,6 +39,11 @@ def restarting(self, restarting: bool) -> None:
             if isinstance(loop, _Loop):
                 loop.restarting = restarting
 
+    @property
+    def is_resuming(self) -> bool:
+        """Indicates whether training is being resumed from a checkpoint."""
+        return self._resuming_from_checkpoint  # set by load_state_dict(), cleared by on_iteration_done()
+
     def reset_restart_stage(self) -> None:
         pass
 
@@ -87,6 +93,7 @@ def load_state_dict(
                 v.load_state_dict(state_dict.copy(), prefix + k + ".")
         self.restarting = True
         self._loaded_from_state_dict = True
+        self._resuming_from_checkpoint = True
 
     def _load_from_state_dict(self, state_dict: dict, prefix: str) -> None:
         for k, v in self.__dict__.items():
@@ -102,4 +109,5 @@ def _load_from_state_dict(self, state_dict: dict, prefix: str) -> None:
     def on_iteration_done(self) -> None:
         self._restarting = False
         self._loaded_from_state_dict = False
+        self._resuming_from_checkpoint = False  # `is_resuming` reads False again from here on
         self.reset_restart_stage()
diff --git a/src/lightning/pytorch/loops/training_epoch_loop.py b/src/lightning/pytorch/loops/training_epoch_loop.py
@@ -237,7 +237,11 @@ def reset(self) -> None:
 
     def on_run_start(self, data_fetcher: _DataFetcher) -> None:
         # `iter()` was called once in `FitLoop.setup_data()` already
-        if self.trainer.current_epoch > 0 and not self.restarting:
+        # Call `iter()` again only when:
+        #       1. Not restarting
+        #       2. Not resuming from checkpoint (not is_resuming)
+        #       3. Past first epoch (current_epoch > 0)
+        if self.trainer.current_epoch > 0 and not self.trainer.fit_loop.is_resuming and not self.restarting:
             iter(data_fetcher)  # creates the iterator inside the fetcher
 
         # add the previous `fetched` value to properly track `is_last_batch` with no prefetching
diff --git a/tests/tests_pytorch/loops/test_double_iter_in_iterable_dataset.py b/tests/tests_pytorch/loops/test_double_iter_in_iterable_dataset.py
@@ -0,0 +1,76 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This test checks that training can be resumed from a checkpoint file when using an IterableDataset.
+# It contains the code given in issue #19427.
+# Ref: https://github.com/Lightning-AI/pytorch-lightning/issues/19427
+import multiprocessing as mp
+import os
+import sys
+from collections.abc import Iterator
+from pathlib import Path
+from queue import Queue
+
+import numpy as np
+import pytest
+from torch.utils.data import DataLoader, IterableDataset
+
+from lightning.pytorch import Trainer
+from lightning.pytorch.demos.boring_classes import BoringModel
+
+
+class QueueDataset(IterableDataset):  # iterable dataset whose items are pulled from a shared queue
+    def __init__(self, queue: Queue) -> None:
+        super().__init__()
+        self.queue = queue
+
+    def __iter__(self) -> Iterator:
+        for _ in range(5):  # every pass over the dataset consumes exactly five queue items
+            tensor, _ = self.queue.get(timeout=5)  # items are pairs; only the first element is yielded
+            yield tensor
+
+
+def train_model(queue: Queue, max_epochs: int, ckpt_path: Path) -> None:
+    dataloader = DataLoader(QueueDataset(queue), num_workers=1, batch_size=None)  # None: no auto-batching
+    trainer = Trainer(
+        max_epochs=max_epochs,
+        enable_progress_bar=False,
+        enable_checkpointing=False,
+        devices=1,
+        logger=False,
+    )
+    if ckpt_path.exists():  # a checkpoint on disk means this call resumes from it
+        trainer.fit(BoringModel(), dataloader, ckpt_path=str(ckpt_path))
+    else:
+        trainer.fit(BoringModel(), dataloader)
+        trainer.save_checkpoint(str(ckpt_path))  # persisted so that a later call can resume
+
+
+@pytest.mark.skipif(sys.platform == "darwin", reason="Skip on macOS due to multiprocessing issues")
+def test_resume_training_with(tmp_path):
+    """Test resuming training from checkpoint file using an IterableDataset."""
+    q = mp.Queue()
+    arr = np.random.random([1, 32]).astype(np.float32)
+    for idx in range(20):  # 20 items = 4 passes over QueueDataset at 5 items per pass
+        q.put((arr, idx))
+
+    max_epoch = 2
+    ckpt_path = tmp_path / "model.ckpt"
+    train_model(q, max_epoch, ckpt_path)  # no checkpoint yet: trains, then saves one
+
+    assert os.path.exists(ckpt_path), f"Checkpoint file '{ckpt_path}' wasn't created"
+    ckpt_size = os.path.getsize(ckpt_path)
+    assert ckpt_size > 0, f"Checkpoint file is empty (size: {ckpt_size} bytes)"
+
+    train_model(q, max_epoch + 2, ckpt_path)  # checkpoint exists now: resumes from it