
Commit 764f36e

diego-urgell authored and facebook-github-bot committed
Prepare app_state for eval/predict checkpoints (#908)
Summary: Pull Request resolved: #908

Reviewed By: JKSenthil
Differential Revision: D63013009
fbshipit-source-id: f56673d7ff5114d1d7456f7c38511939f5e9bd56
1 parent 47fbf01 commit 764f36e

3 files changed: +66 −12 lines


tests/framework/callbacks/test_checkpoint_utils.py

Lines changed: 19 additions & 0 deletions
@@ -15,6 +15,7 @@
 
 from torchtnt.framework._test_utils import (
     DummyAutoUnit,
+    DummyEvalUnit,
     DummyMeanMetric,
     DummyTrainUnit,
     generate_dummy_stateful_dataloader,
@@ -64,6 +65,24 @@ def test_get_app_state(self) -> None:
             ],
         )
 
+        # Test evaluate intra-epoch checkpoint
+        my_unit = DummyEvalUnit(input_dim=2)
+        my_unit.mean_metric = DummyMeanMetric()  # pyre-ignore[16]
+        state = get_dummy_eval_state()
+        stateful_dl = generate_dummy_stateful_dataloader(1, 1, 1)
+        state._active_phase = ActivePhase.EVALUATE
+        none_throws(state.eval_state)._dataloader = stateful_dl
+
+        app_state = _prepare_app_state_for_checkpoint(state, my_unit, intra_epoch=True)
+        self.assertCountEqual(
+            app_state.keys(),
+            [
+                "eval_progress",
+                "eval_dataloader",
+                "mean_metric",
+            ],
+        )
+
     def test_get_step_phase_mapping(self) -> None:
         unit = DummyAutoUnit(module=nn.Linear(2, 2))
         unit.train_progress._num_steps_completed = 5
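
The new test covers the evaluate phase only; the predict path exercises the same code via the Phase.PREDICT → "predict_dataloader" mapping and the new "predict_progress" key (both visible in the next file). A minimal sketch of the analogous predict-phase assertion, assuming DummyPredictUnit and get_dummy_predict_state exist as eval-style test helpers (they are not part of this diff):

# Hypothetical predict-phase analogue of the eval test above. DummyPredictUnit
# and get_dummy_predict_state are assumed stand-ins mirroring the eval helpers.
my_unit = DummyPredictUnit(input_dim=2)
state = get_dummy_predict_state()
stateful_dl = generate_dummy_stateful_dataloader(1, 1, 1)
state._active_phase = ActivePhase.PREDICT
none_throws(state.predict_state)._dataloader = stateful_dl

app_state = _prepare_app_state_for_checkpoint(state, my_unit, intra_epoch=True)
# Expected keys mirror the eval case: the phase progress plus its dataloader.
self.assertCountEqual(
    app_state.keys(),
    ["predict_progress", "predict_dataloader"],
)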

torchtnt/framework/callbacks/_checkpoint_utils.py

Lines changed: 45 additions & 12 deletions
@@ -24,8 +24,10 @@
     Phase.PREDICT: "predict_dataloader",
 }
 _TRAIN_DL_STATE_KEY = "train_dataloader"
+
 _TRAIN_PROGRESS_STATE_KEY = "train_progress"
 _EVAL_PROGRESS_STATE_KEY = "eval_progress"
+_PREDICT_PROGRESS_STATE_KEY = "predict_progress"
 
 
 def _get_step_phase_mapping(
@@ -56,6 +58,30 @@ def _prepare_app_state(unit: AppStateMixin) -> Dict[str, Any]:
     return app_state
 
 
+def _remove_app_state_keys(
+    unit: AppStateMixin,
+    app_state: Dict[str, Any],
+    *,
+    remove_modules: bool = False,
+    remove_optimizers: bool = False,
+    remove_lr_schedulers: bool = False,
+) -> None:
+    if remove_modules:
+        # remove all module keys from app_state
+        for module_keys in unit.tracked_modules().keys():
+            app_state.pop(module_keys, None)
+
+    if remove_optimizers:
+        # remove all optimizer keys from app_state
+        for optim_keys in unit.tracked_optimizers().keys():
+            app_state.pop(optim_keys, None)
+
+    if remove_lr_schedulers:
+        # remove all lr scheduler keys from app_state
+        for lr_scheduler_keys in unit.tracked_lr_schedulers().keys():
+            app_state.pop(lr_scheduler_keys, None)
+
+
 def _prepare_app_state_for_checkpoint(
     state: State, unit: AppStateMixin, intra_epoch: bool
 ) -> Dict[str, Stateful]:
@@ -64,6 +90,16 @@ def _prepare_app_state_for_checkpoint(
     """
     app_state = _prepare_app_state(unit)
 
+    if state.entry_point in [EntryPoint.EVALUATE, EntryPoint.PREDICT]:
+        # Since model parameters are fixed, remove them from checkpoint.
+        _remove_app_state_keys(
+            unit,
+            app_state,
+            remove_modules=True,
+            remove_optimizers=True,
+            remove_lr_schedulers=True,
+        )
+
     # for intra-epoch checkpointing, include dataloader state of the current phase
     phase_dl = state.active_phase_state().dataloader
     if intra_epoch and isinstance(phase_dl, Stateful):
@@ -85,24 +121,21 @@ def _prepare_app_state_for_restore(
 
     restore_options = restore_options or RestoreOptions()
 
-    if not restore_options.restore_modules:
-        for module_keys in unit.tracked_modules().keys():
-            app_state.pop(module_keys, None)
-
     if not restore_options.restore_train_progress:
         app_state.pop(_TRAIN_PROGRESS_STATE_KEY, None)
 
     if not restore_options.restore_eval_progress:
         app_state.pop(_EVAL_PROGRESS_STATE_KEY, None)
 
-    if not restore_options.restore_optimizers:
-        # remove all optimizer keys from app_state
-        for optim_keys in unit.tracked_optimizers().keys():
-            app_state.pop(optim_keys, None)
+    if not restore_options.restore_predict_progress:
+        app_state.pop(_PREDICT_PROGRESS_STATE_KEY, None)
 
-    if not restore_options.restore_lr_schedulers:
-        # remove all lr scheduler keys from app_state
-        for lr_scheduler_keys in unit.tracked_lr_schedulers().keys():
-            app_state.pop(lr_scheduler_keys, None)
+    _remove_app_state_keys(
+        unit,
+        app_state,
+        remove_modules=not restore_options.restore_modules,
+        remove_optimizers=not restore_options.restore_optimizers,
+        remove_lr_schedulers=not restore_options.restore_lr_schedulers,
+    )
 
     return app_state
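
To see the new helper's effect in isolation, here is a self-contained sketch reproducing the key-filtering pattern on a plain dict; FakeUnit and its contents are illustrative stand-ins, not torchtnt APIs:

# Self-contained sketch of the filtering pattern _remove_app_state_keys
# implements. FakeUnit is an illustrative stand-in for an AppStateMixin unit.
from typing import Any, Dict


class FakeUnit:
    # Exposes the three tracked_* accessors the helper relies on.
    def tracked_modules(self) -> Dict[str, Any]:
        return {"module": object()}

    def tracked_optimizers(self) -> Dict[str, Any]:
        return {"optimizer": object()}

    def tracked_lr_schedulers(self) -> Dict[str, Any]:
        return {"lr_scheduler": object()}


def remove_app_state_keys(
    unit: FakeUnit,
    app_state: Dict[str, Any],
    *,
    remove_modules: bool = False,
    remove_optimizers: bool = False,
    remove_lr_schedulers: bool = False,
) -> None:
    # Pop each tracked key only when its flag is set; missing keys are a no-op.
    if remove_modules:
        for key in unit.tracked_modules():
            app_state.pop(key, None)
    if remove_optimizers:
        for key in unit.tracked_optimizers():
            app_state.pop(key, None)
    if remove_lr_schedulers:
        for key in unit.tracked_lr_schedulers():
            app_state.pop(key, None)


app_state = {"module": 1, "optimizer": 2, "lr_scheduler": 3, "eval_progress": 4}
remove_app_state_keys(
    FakeUnit(),
    app_state,
    remove_modules=True,
    remove_optimizers=True,
    remove_lr_schedulers=True,
)
print(sorted(app_state))  # ['eval_progress']

This is the deduplication the diff performs: the eval/predict checkpoint path and the restore path now share one helper instead of carrying separate copies of the pop loops.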

torchtnt/framework/callbacks/checkpointer_types.py

Lines changed: 2 additions & 0 deletions
@@ -39,6 +39,7 @@ class RestoreOptions:
        restore_modules: Whether to restore the module state dict.
        restore_train_progress: Whether to restore the training progress state.
        restore_eval_progress: Whether to restore the evaluation progress state.
+        restore_predict_progress: Whether to restore the prediction progress state.
        restore_optimizers: Whether to restore the optimizer states.
        restore_lr_schedulers: Whether to restore the lr scheduler states.
        strict: Whether to strictly restore app state and the module state dict.
@@ -47,6 +48,7 @@
    restore_modules: bool = True
    restore_train_progress: bool = True
    restore_eval_progress: bool = True
+    restore_predict_progress: bool = True
    restore_optimizers: bool = True
    restore_lr_schedulers: bool = True
    strict: bool = True
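
A brief usage sketch of the new flag, assuming only what the diffs above show (RestoreOptions lives in checkpointer_types, and _prepare_app_state_for_restore consumes it); the chosen flag values are illustrative:

from torchtnt.framework.callbacks.checkpointer_types import RestoreOptions

# Keep module weights and train/eval progress, but skip prediction progress
# and optimizer state on restore. Per this diff, restore_predict_progress=False
# pops the "predict_progress" key, and restore_optimizers=False is routed
# through _remove_app_state_keys(remove_optimizers=True).
options = RestoreOptions(
    restore_predict_progress=False,
    restore_optimizers=False,
)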
