Include eval/predict dataloaders in save path (#909)

diego-urgell · facebook-github-bot · commit 47fbf01e7a71 · 2024-10-01T21:14:30.000-07:00
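Summary: Intra-epoch checkpoints now save the stateful dataloader of whichever phase is active (train, eval, or predict) instead of only the train dataloader; adds ActivePhase.into_phase() to map an active phase to its checkpoint Phase. Pull Request resolved: #909 Reviewed By: JKSenthil Differential Revision: D63013007 fbshipit-source-id: b2972310ddd39bb91b64dbdcebc06c5cbd1f3035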
diff --git a/tests/framework/callbacks/test_checkpoint_utils.py b/tests/framework/callbacks/test_checkpoint_utils.py
@@ -8,11 +8,16 @@
 
 import unittest
 
+from pyre_extensions import none_throws
+
 from torch import nn
+from torchtnt.framework import ActivePhase
 
 from torchtnt.framework._test_utils import (
     DummyAutoUnit,
+    DummyMeanMetric,
     DummyTrainUnit,
+    generate_dummy_stateful_dataloader,
     get_dummy_eval_state,
     get_dummy_fit_state,
     get_dummy_train_state,
@@ -28,15 +33,37 @@
 class CheckpointUtilsTest(unittest.TestCase):
 
     def test_get_app_state(self) -> None:
+
+        # End-of-epoch checkpoint (intra_epoch=False): expects no dataloader key
         my_unit = DummyTrainUnit(input_dim=2)
         state = get_dummy_train_state()
-
         app_state = _prepare_app_state_for_checkpoint(state, my_unit, intra_epoch=False)
         self.assertCountEqual(
             app_state.keys(),
             ["module", "optimizer", "loss_fn", "train_progress"],
         )
 
+        # Train intra-epoch checkpoint: also expects "train_dataloader" and "mean_metric"
+        my_unit = DummyTrainUnit(input_dim=2)
+        my_unit.mean_metric = DummyMeanMetric()  # pyre-ignore[16]
+        state = get_dummy_train_state()
+        stateful_dl = generate_dummy_stateful_dataloader(1, 1, 1)
+        state._active_phase = ActivePhase.TRAIN
+        none_throws(state.train_state)._dataloader = stateful_dl
+
+        app_state = _prepare_app_state_for_checkpoint(state, my_unit, intra_epoch=True)
+        self.assertCountEqual(
+            app_state.keys(),
+            [
+                "module",
+                "optimizer",
+                "loss_fn",
+                "train_progress",
+                "train_dataloader",
+                "mean_metric",
+            ],
+        )
+
     def test_get_step_phase_mapping(self) -> None:
         unit = DummyAutoUnit(module=nn.Linear(2, 2))
         unit.train_progress._num_steps_completed = 5
diff --git a/tests/framework/test_state.py b/tests/framework/test_state.py
@@ -9,7 +9,10 @@
 
 import unittest
 
+from torchtnt.framework import ActivePhase
+
 from torchtnt.framework.state import _check_loop_condition, PhaseState
+from torchtnt.utils.checkpoint import Phase
 
 
 class StateTest(unittest.TestCase):
@@ -39,3 +42,13 @@ def test_phase_state_validation(self) -> None:
             ValueError, "Invalid value provided for evaluate_every_n_epochs"
         ):
             PhaseState(dataloader=[], evaluate_every_n_epochs=-2)
+
+    def test_active_phase_into_phase(self) -> None:
+        """Check that each ActivePhase member converts to the same-named Phase."""
+        expected_phases = {
+            ActivePhase.TRAIN: Phase.TRAIN,
+            ActivePhase.EVALUATE: Phase.EVALUATE,
+            ActivePhase.PREDICT: Phase.PREDICT,
+        }
+        for active_phase, phase in expected_phases.items():
+            self.assertEqual(active_phase.into_phase(), phase)
diff --git a/torchtnt/framework/_test_utils.py b/torchtnt/framework/_test_utils.py
@@ -7,7 +7,7 @@
 
 # pyre-strict
 
-from typing import Iterable, Iterator, List, Optional, Tuple
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple
 
 import torch
 from torch import nn, Tensor
@@ -236,3 +236,51 @@ def configure_optimizers_and_lr_scheduler(
             my_optimizer, gamma=0.9
         )
         return my_optimizer, my_lr_scheduler
+
+
+class DummyStatefulDataLoader:
+    """Dummy DataLoader wrapper whose state_dict/load_state_dict are fixed stubs."""
+
+    def __init__(self, dataloader: DataLoader) -> None:
+        self.dataloader = dataloader  # wrapped loader that __iter__ delegates to
+
+    def state_dict(self) -> Dict[str, Any]:
+        return {"current_batch": 1}  # constant placeholder, independent of iteration
+
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        return None  # no-op: the provided state_dict is ignored
+
+    def __iter__(self) -> Iterator[object]:
+        return iter(self.dataloader)  # delegate iteration to the wrapped loader
+
+
+def generate_dummy_stateful_dataloader(
+    num_samples: int, input_dim: int, batch_size: int
+) -> DummyStatefulDataLoader:
+    """Wrap a DataLoader over a RandomIterableDataset in a DummyStatefulDataLoader."""
+    # RandomIterableDataset is called positionally as (input_dim, num_samples).
+    dataset = RandomIterableDataset(input_dim, num_samples)
+    inner_loader = DataLoader(dataset=dataset, batch_size=batch_size)
+
+    return DummyStatefulDataLoader(inner_loader)
+
+
+class DummyMeanMetric:  # minimal running-mean metric with state_dict support
+    def __init__(self) -> None:
+        super().__init__()
+        self.sum: float = 0.0  # total of the values passed to update()
+        self.count: int = 0  # number of values accumulated
+
+    def update(self, value: float) -> None:
+        self.sum += value
+        self.count += 1
+
+    def compute(self) -> float:
+        return self.sum / self.count if self.count > 0 else 0.0  # guards count == 0
+
+    def state_dict(self) -> Dict[str, Any]:
+        return {"sum": self.sum, "count": self.count}
+
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        self.sum = state_dict["sum"]  # KeyError if either key is missing
+        self.count = state_dict["count"]
diff --git a/torchtnt/framework/callbacks/_checkpoint_utils.py b/torchtnt/framework/callbacks/_checkpoint_utils.py
@@ -9,7 +9,6 @@
 
 from typing import Any, cast, Dict, Union
 
-from pyre_extensions import none_throws
 from torchtnt.framework.callbacks.checkpointer_types import RestoreOptions
 from torchtnt.framework.state import EntryPoint, State
 from torchtnt.framework.unit import AppStateMixin, TEvalUnit, TTrainUnit
@@ -19,8 +18,13 @@
 
 
 # keys for use when checkpointing
-_TRAIN_PROGRESS_STATE_KEY = "train_progress"
+_PHASE_DL_STATE_KEY_MAPPING: Dict[Phase, str] = {  # app-state key per phase dataloader
+    Phase.TRAIN: "train_dataloader",  # same value as _TRAIN_DL_STATE_KEY
+    Phase.EVALUATE: "eval_dataloader",
+    Phase.PREDICT: "predict_dataloader",
+}
 _TRAIN_DL_STATE_KEY = "train_dataloader"
+_TRAIN_PROGRESS_STATE_KEY = "train_progress"
 _EVAL_PROGRESS_STATE_KEY = "eval_progress"
 
 
@@ -60,11 +64,13 @@ def _prepare_app_state_for_checkpoint(
     """
     app_state = _prepare_app_state(unit)
 
-    # for intra-epoch checkpointing, include dataloader states
-    train_state = none_throws(state.train_state)
-    train_dl = train_state.dataloader
-    if intra_epoch and isinstance(train_dl, Stateful):
-        app_state[_TRAIN_DL_STATE_KEY] = train_dl
+    # for intra-epoch checkpointing, include dataloader state of the current phase
+    phase_dl = state.active_phase_state().dataloader
+    if intra_epoch and isinstance(phase_dl, Stateful):
+        dataloader_state_key = _PHASE_DL_STATE_KEY_MAPPING[
+            state.active_phase.into_phase()
+        ]
+        app_state[dataloader_state_key] = phase_dl
 
     return app_state
 
diff --git a/torchtnt/framework/state.py b/torchtnt/framework/state.py
@@ -13,6 +13,7 @@
 from typing import Generic, Iterable, Optional, TypeVar
 
 from pyre_extensions import none_throws
+from torchtnt.utils.checkpoint import Phase
 
 from torchtnt.utils.timer import BoundedTimer, TimerProtocol
 
@@ -62,6 +63,17 @@ class ActivePhase(Enum):
     EVALUATE = auto()
     PREDICT = auto()
 
+    def into_phase(self) -> Phase:
+        """Return the checkpoint ``Phase`` member that mirrors this active phase."""
+        phase_by_active = {
+            ActivePhase.TRAIN: Phase.TRAIN,
+            ActivePhase.EVALUATE: Phase.EVALUATE,
+            ActivePhase.PREDICT: Phase.PREDICT,
+        }
+        if self not in phase_by_active:
+            raise AssertionError("Should match an ActivePhase")
+        return phase_by_active[self]
+
 
 class PhaseState(Generic[TData, TStepOutput]):
     """State for each phase (train, eval, predict).