Add eval/predict dataloader parameters to restore methods (#917)

diego-urgell · facebook-github-bot · commit 8bf95f3208b2 · 2024-10-09T23:03:46.000-07:00
Summary: Pull Request resolved: #917 Reviewed By: JKSenthil Differential Revision: D63013005 fbshipit-source-id: a1d89e95786b74f0ce8ab09b57678dfafe125c00
diff --git a/tests/framework/callbacks/test_base_checkpointer.py b/tests/framework/callbacks/test_base_checkpointer.py
@@ -14,7 +14,7 @@
 import tempfile
 import time
 import unittest
-from typing import cast, Iterable, List, Optional
+from typing import Any, cast, Iterable, List, Optional
 from unittest.mock import MagicMock, patch
 
 import torch
@@ -42,7 +42,14 @@
 from torchtnt.framework.state import ActivePhase, State
 
 from torchtnt.framework.train import train
-from torchtnt.framework.unit import AppStateMixin, TrainUnit, TTrainData, TTrainUnit
+from torchtnt.framework.unit import (
+    AppStateMixin,
+    TEvalData,
+    TPredictData,
+    TrainUnit,
+    TTrainData,
+    TTrainUnit,
+)
 from torchtnt.utils.checkpoint import BestCheckpointConfig, get_latest_checkpoint_path
 from torchtnt.utils.distributed import get_global_rank, spawn_multi_process
 from torchtnt.utils.env import init_from_env
@@ -94,10 +101,13 @@ def restore(
         unit: AppStateMixin,
         *,
         train_dataloader: Optional[Iterable[TTrainData]] = None,
+        eval_dataloader: Optional[Iterable[TEvalData]] = None,
+        predict_dataloader: Optional[Iterable[TPredictData]] = None,
         process_group: Optional[dist.ProcessGroup] = None,
         restore_options: Optional[RestoreOptions] = None,
         msg: str = "",
         restored_checkpoint_path: Optional[List[str]] = None,
+        **kwargs: Any,
     ) -> None:
         if restored_checkpoint_path is not None:
             if len(restored_checkpoint_path):
diff --git a/torchtnt/framework/callbacks/base_checkpointer.py b/torchtnt/framework/callbacks/base_checkpointer.py
@@ -410,6 +410,7 @@ def restore(
         train_dataloader: Optional[Iterable[TTrainData]] = None,
         process_group: Optional[dist.ProcessGroup] = None,
         restore_options: Optional[RestoreOptions] = None,
+        **kwargs: Any,
     ) -> None:
         """Method to restore checkpoint state from a path.
 
@@ -419,7 +420,7 @@ def restore(
         Args:
             path: Path of the checkpoint to restore.
             unit: An instance of :class:`~torchtnt.framework.unit.TrainUnit`, :class:`~torchtnt.framework.unit.EvalUnit`, or :class:`~torchtnt.framework.unit.PredictUnit` containing states to restore.
-            train_dataloader: An optional train dataloader to restore.
+            train_dataloader: An optional train dataloader to restore. Can only be used when restoring from a train or fit checkpoint.
             process_group: The process group on which the ranks will communicate on. default: ``None`` (the entire world)
             restore_options: Controls what to filter when restoring the state.
         """
@@ -538,6 +539,7 @@ def restore_with_id(
         train_dataloader: Optional[Iterable[TTrainData]] = None,
         process_group: Optional[dist.ProcessGroup] = None,
         restore_options: Optional[RestoreOptions] = None,
+        **kwargs: Any,
     ) -> None:
         """Method to restore checkpoint state from a checkpoint id.
 
@@ -561,4 +563,5 @@ def restore_with_id(
             train_dataloader=train_dataloader,
             process_group=process_group,
             restore_options=restore_options,
+            **kwargs,
         )
diff --git a/torchtnt/framework/callbacks/dcp_saver.py b/torchtnt/framework/callbacks/dcp_saver.py
@@ -37,7 +37,9 @@
 from torchtnt.framework.state import State
 from torchtnt.framework.unit import (
     AppStateMixin,
+    TEvalData,
     TEvalUnit,
+    TPredictData,
     TPredictUnit,
     TTrainData,
     TTrainUnit,
@@ -228,11 +230,14 @@ def restore(
         unit: AppStateMixin,
         *,
         train_dataloader: Optional[Iterable[TTrainData]] = None,
+        eval_dataloader: Optional[Iterable[TEvalData]] = None,
+        predict_dataloader: Optional[Iterable[TPredictData]] = None,
         process_group: Optional[dist.ProcessGroup] = None,
         restore_options: Optional[RestoreOptions] = None,
         knob_options: Optional[KnobOptions] = None,
         planner: Optional[LoadPlanner] = None,
         storage_reader: Optional[StorageReader] = None,
+        **kwargs: Any,
     ) -> None:
         """Utility method to restore dcp checkpoint from a path."""
 
@@ -242,6 +247,8 @@ def restore(
             checkpoint_id,
             unit,
             train_dataloader=train_dataloader,
+            eval_dataloader=eval_dataloader,
+            predict_dataloader=predict_dataloader,
             process_group=process_group,
             restore_options=restore_options,
             knob_options=knob_options,
@@ -255,11 +262,14 @@ def restore_with_id(
         unit: AppStateMixin,
         *,
         train_dataloader: Optional[Iterable[TTrainData]] = None,
+        eval_dataloader: Optional[Iterable[TEvalData]] = None,
+        predict_dataloader: Optional[Iterable[TPredictData]] = None,
         process_group: Optional[dist.ProcessGroup] = None,
         restore_options: Optional[RestoreOptions] = None,
         knob_options: Optional[KnobOptions] = None,
         planner: Optional[LoadPlanner] = None,
         storage_reader: Optional[StorageReader] = None,
+        **kwargs: Any,
     ) -> None:
         """Utility method to restore dcp checkpoint from a checkpoint_id.
 
@@ -269,7 +279,9 @@ def restore_with_id(
         Args:
             checkpoint_id: Checkpoint id. It can be the path of the snapshot to restore.
             unit: An instance of :class:`~torchtnt.framework.unit.TrainUnit`, :class:`~torchtnt.framework.unit.EvalUnit`, or :class:`~torchtnt.framework.unit.PredictUnit` containing states to restore.
-            train_dataloader: An optional train dataloader to restore.
+            train_dataloader: An optional train dataloader to restore. Can only be used when restoring from a train or fit checkpoint.
+            eval_dataloader: An optional eval dataloader to restore. Can only be used when restoring from an eval or fit checkpoint.
+            predict_dataloader: An optional predict dataloader to restore. Can only be used when restoring from a predict checkpoint.
             process_group: The process group on which the ranks will communicate on. default: ``None`` (the entire world)
                             If not Gloo, a Gloo process group is created.
                             Note: If torch.distributed is available and a process group is initialized, dcp assumes the intention is to save/load checkpoints in distributed fashion.
diff --git a/torchtnt/framework/callbacks/torchsnapshot_saver.py b/torchtnt/framework/callbacks/torchsnapshot_saver.py
@@ -270,6 +270,7 @@ def restore(
         storage_options: Optional[Dict[str, Any]] = None,
         knob_options: Optional[KnobOptions] = None,
         strict: bool = True,
+        **kwargs: Any,
     ) -> None:
         """Utility method to restore snapshot state from a path.
 
@@ -279,7 +280,7 @@ def restore(
         Args:
             path: Path of the snapshot to restore.
             unit: An instance of :class:`~torchtnt.framework.unit.TrainUnit`, :class:`~torchtnt.framework.unit.EvalUnit`, or :class:`~torchtnt.framework.unit.PredictUnit` containing states to restore.
-            train_dataloader: An optional train dataloader to restore.
+            train_dataloader: An optional train dataloader to restore. Note that restoring eval or predict dataloaders is not supported by TorchSnapshotSaver.
             process_group: The process group on which the ranks will communicate on. default: ``None`` (the entire world)
             restore_options: Controls what to  filter when restoring the state.
             storage_options: Additional keyword options for the storage plugin to use, to be passed to `torchsnapshot.Snapshot <https://pytorch.org/torchsnapshot/stable/api_reference.html#torchsnapshot.Snapshot>`_. See each storage plugin's documentation for customizations.