Complete utils to get step and epoch according to active_phase (#913)

diego-urgell · facebook-github-bot · commit 6d99aae70c43 · 2024-10-01T21:14:30.000-07:00
Summary: Pull Request resolved: #913
Reviewed By: anshulverma, shalgi
Differential Revision: D63491481
fbshipit-source-id: 137bf317d7d3da48e3fb681b2115b94ecb0212db
diff --git a/tests/framework/callbacks/test_checkpoint_utils.py b/tests/framework/callbacks/test_checkpoint_utils.py
@@ -21,10 +21,12 @@
     generate_dummy_stateful_dataloader,
     get_dummy_eval_state,
     get_dummy_fit_state,
+    get_dummy_predict_state,
     get_dummy_train_state,
 )
 
 from torchtnt.framework.callbacks._checkpoint_utils import (
+    _get_epoch,
     _get_step_phase_mapping,
     _prepare_app_state_for_checkpoint,
 )
@@ -87,6 +89,7 @@ def test_get_step_phase_mapping(self) -> None:
         unit = DummyAutoUnit(module=nn.Linear(2, 2))
         unit.train_progress._num_steps_completed = 5
         unit.eval_progress._num_steps_completed = 7
+        unit.predict_progress._num_steps_completed = 9
 
         fit_state = get_dummy_fit_state()
         self.assertEqual(
@@ -99,3 +102,26 @@ def test_get_step_phase_mapping(self) -> None:
 
         eval_state = get_dummy_eval_state()
         self.assertEqual({Phase.EVALUATE: 7}, _get_step_phase_mapping(eval_state, unit))
+
+        predict_state = get_dummy_predict_state()
+        self.assertEqual(
+            {Phase.PREDICT: 9}, _get_step_phase_mapping(predict_state, unit)
+        )
+
+    def test_get_epoch(self) -> None:
+        unit = DummyAutoUnit(module=nn.Linear(2, 2))
+        unit.train_progress._num_epochs_completed = 1
+        unit.eval_progress._num_epochs_completed = 2
+        unit.predict_progress._num_epochs_completed = 3
+
+        fit_state = get_dummy_fit_state()
+        self.assertEqual(1, _get_epoch(fit_state, unit))
+
+        train_state = get_dummy_train_state()
+        self.assertEqual(1, _get_epoch(train_state, unit))
+
+        eval_state = get_dummy_eval_state()
+        self.assertEqual(2, _get_epoch(eval_state, unit))
+
+        predict_state = get_dummy_predict_state()
+        self.assertEqual(3, _get_epoch(predict_state, unit))
diff --git a/torchtnt/framework/_test_utils.py b/torchtnt/framework/_test_utils.py
@@ -46,6 +46,19 @@ def get_dummy_eval_state(dataloader: Optional[Iterable[object]] = None) -> State
     )
 
 
+def get_dummy_predict_state(dataloader: Optional[Iterable[object]] = None) -> State:
+    return State(
+        entry_point=EntryPoint.PREDICT,
+        predict_state=PhaseState(
+            dataloader=dataloader or [1, 2, 3, 4],
+            max_epochs=1,
+            max_steps=1,
+            max_steps_per_epoch=1,
+        ),
+        timer=None,
+    )
+
+
 def get_dummy_fit_state() -> State:
     return State(
         entry_point=EntryPoint.FIT,
diff --git a/torchtnt/framework/callbacks/_checkpoint_utils.py b/torchtnt/framework/callbacks/_checkpoint_utils.py
@@ -11,7 +11,7 @@
 
 from torchtnt.framework.callbacks.checkpointer_types import RestoreOptions
 from torchtnt.framework.state import EntryPoint, State
-from torchtnt.framework.unit import AppStateMixin, TEvalUnit, TTrainUnit
+from torchtnt.framework.unit import AppStateMixin, TEvalUnit, TPredictUnit, TTrainUnit
 from torchtnt.utils.checkpoint import Phase
 
 from torchtnt.utils.stateful import Stateful
@@ -31,7 +31,7 @@
 
 
 def _get_step_phase_mapping(
-    state: State, unit: Union[TTrainUnit, TEvalUnit]
+    state: State, unit: Union[TTrainUnit, TEvalUnit, TPredictUnit]
 ) -> Dict[Phase, int]:
     """
     Returns a mapping of phase to step, depending on the entrypoint.
@@ -47,9 +47,32 @@ def _get_step_phase_mapping(
         eval_unit = cast(TEvalUnit, unit)
         step_mapping[Phase.EVALUATE] = eval_unit.eval_progress.num_steps_completed
 
+    if state.entry_point == EntryPoint.PREDICT:
+        predict_unit = cast(TPredictUnit, unit)
+        step_mapping[Phase.PREDICT] = predict_unit.predict_progress.num_steps_completed
+
     return step_mapping
 
 
+def _get_epoch(state: State, unit: Union[TTrainUnit, TEvalUnit, TPredictUnit]) -> int:
+    """
+    Returns the number of completed epochs for the phase matching the entrypoint. For FIT, it always returns the train epoch.
+    """
+    if state.entry_point in (EntryPoint.TRAIN, EntryPoint.FIT):
+        train_unit = cast(TTrainUnit, unit)
+        return train_unit.train_progress.num_epochs_completed
+
+    elif state.entry_point == EntryPoint.PREDICT:
+        predict_unit = cast(TPredictUnit, unit)
+        return predict_unit.predict_progress.num_epochs_completed
+
+    elif state.entry_point == EntryPoint.EVALUATE:
+        eval_unit = cast(TEvalUnit, unit)
+        return eval_unit.eval_progress.num_epochs_completed
+
+    raise ValueError(f"Unknown entrypoint: {state.entry_point}")
+
+
 def _prepare_app_state(unit: AppStateMixin) -> Dict[str, Any]:
     """Join together all of the tracked stateful entities to simplify registration of snapshottable states, deals with FSDP case"""
     app_state = unit.app_state()