
Commit ec6d9ee

saumishr authored and facebook-github-bot committed
Configurable planner and storage writer in the dcp saver save API (#821)
Summary:
Pull Request resolved: #821

Configurable planner and storage writer in the DCP saver save API.

# This Stack
The DCP saver is the TorchTNT callback that enables checkpointing via the Distributed Checkpointing (DCP) APIs. The current implementation does not expose the SavePlanner and StorageWriter in the API for clients to plug in their own implementations; it enforces the default planner and FsspecWriter.

# This diff
- The DCP save and async save APIs now accept a planner and a storage writer, allowing clients to plug in their own implementations.
- Introduces a knob option to plug in a storage writer component with storage efficiency optimizations.

Reviewed By: JKSenthil

Differential Revision: D56921724

fbshipit-source-id: b60c34c6df38e02c0af9a1db4ffd243d382fd621
1 parent e14d0cf commit ec6d9ee
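
In effect, the change threads optional `planner` and `storage_writer` arguments from the callback's checkpoint hooks down to `dcp.save` / `dcp.async_save`, falling back to `DefaultSavePlanner` and the Fsspec-based `Writer` when they are not supplied. A minimal sketch of the resulting call shape, assuming a single process and using `FileSystemWriter` as a stand-in for TorchTNT's Fsspec-based default (illustrative, not the callback code itself):

# Sketch of the call shape this diff enables (assumes a recent torch with DCP;
# dcp.save runs in non-distributed mode when no process group is initialized).
import tempfile

import torch.nn as nn
from torch.distributed import checkpoint as dcp
from torch.distributed.checkpoint import FileSystemWriter
from torch.distributed.checkpoint.default_planner import DefaultSavePlanner

module = nn.Linear(2, 2)

with tempfile.TemporaryDirectory() as checkpoint_id:
    dcp.save(
        state_dict={"app_state": module.state_dict()},
        checkpoint_id=checkpoint_id,
        # Both components are now caller-configurable; these mirror the
        # defaults the saver falls back to when none are provided.
        storage_writer=FileSystemWriter(checkpoint_id),
        planner=DefaultSavePlanner(),
    )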

File tree

3 files changed: +133, -15 lines

tests/framework/callbacks/test_dcp_saver.py

Lines changed: 75 additions & 0 deletions
@@ -18,6 +18,9 @@

 import torch
 from torch import nn
+from torch.distributed.checkpoint import FileSystemWriter
+from torch.distributed.checkpoint.default_planner import DefaultSavePlanner
+from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE
 from torch.utils.data import DataLoader
 from torchsnapshot.test_utils import assert_state_dict_eq, check_state_dict_eq
 from torchtnt.framework._test_utils import (
@@ -289,6 +292,62 @@ def _save_restore_ddp() -> None:
         if get_global_rank() == 0:
             shutil.rmtree(temp_dir)  # delete temp directory

+    @patch("torchtnt.framework.callbacks.dcp_saver.dcp")
+    def test_save_default_planner_storage_components(
+        self, mock_dist_cp: MagicMock
+    ) -> None:
+        from torch.distributed.checkpoint._fsspec_filesystem import FsspecWriter
+
+        input_dim = 2
+        save_every_n_train_steps = 1
+
+        my_unit = DummyTrainUnit(input_dim=input_dim)
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            dcp_cb = DistributedCheckpointSaver(
+                temp_dir,
+                save_every_n_train_steps=save_every_n_train_steps,
+                knob_options=KnobOptions(1),
+            )
+
+            dcp_cb._save(
+                checkpoint_id=temp_dir,
+                app_state=my_unit.module.state_dict(),
+            )
+
+            planner = mock_dist_cp.save.call_args_list[0][1]["planner"]
+            storage_writer = mock_dist_cp.save.call_args_list[0][1]["storage_writer"]
+
+            self.assertIsInstance(planner, DefaultSavePlanner)
+            self.assertIsInstance(storage_writer, FsspecWriter)
+
+    @patch("torchtnt.framework.callbacks.dcp_saver.dcp")
+    def test_save_planner_storage_components(self, mock_dist_cp: MagicMock) -> None:
+        input_dim = 2
+        save_every_n_train_steps = 1
+
+        my_unit = DummyTrainUnit(input_dim=input_dim)
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            dcp_cb = DistributedCheckpointSaver(
+                temp_dir,
+                save_every_n_train_steps=save_every_n_train_steps,
+                knob_options=KnobOptions(1),
+            )
+
+            dcp_cb._save(
+                checkpoint_id=temp_dir,
+                app_state=my_unit.module.state_dict(),
+                planner=DummySavePlanner(),
+                storage_writer=DummyStorageWriter(path=temp_dir),
+            )
+
+            planner = mock_dist_cp.save.call_args_list[0][1]["planner"]
+            storage_writer = mock_dist_cp.save.call_args_list[0][1]["storage_writer"]
+
+            self.assertIsInstance(planner, DummySavePlanner)
+            self.assertIsInstance(storage_writer, DummyStorageWriter)
+

 class DummyStatefulDataLoader:
     def __init__(self, dataloader: DataLoader) -> None:
@@ -306,3 +365,19 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None:

     def __iter__(self) -> Iterator[object]:
         return iter(self.dataloader)
+
+
+class DummySavePlanner(DefaultSavePlanner):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def set_up_planner(self, state_dict: STATE_DICT_TYPE, is_coordinator: bool) -> None:
+        super().set_up_planner(state_dict, is_coordinator)
+
+
+class DummyStorageWriter(FileSystemWriter):
+    def __init__(self, path: str) -> None:
+        super().__init__(path)
+
+    def set_up_storage_writer(self, is_coordinator: bool) -> None:
+        pass
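
A note on the test mechanics: patching `torchtnt.framework.callbacks.dcp_saver.dcp` replaces the whole module with a `MagicMock`, so `dcp.save` records its invocations, and each entry of `call_args_list` is an `(args, kwargs)` pair, which makes `[0][1]` the kwargs dict of the first call. A standalone illustration of that pattern (hypothetical mock, not from this diff):

# Standalone illustration of the call_args_list pattern used above
# (hypothetical mock; not part of the TorchTNT code).
from unittest.mock import MagicMock

dcp = MagicMock()
dcp.save(checkpoint_id="/tmp/ckpt", planner="custom_planner")

args, kwargs = dcp.save.call_args_list[0]  # first recorded call
assert kwargs["planner"] == "custom_planner"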

torchtnt/framework/callbacks/checkpointer_types.py

Lines changed: 9 additions & 2 deletions
@@ -14,13 +14,20 @@
 @dataclass
 class KnobOptions:
     """
-    Controls the knobs in TorchSnapshot.
+    Controls the knobs for Checkpoints.

     Args:
-        max_per_rank_io_concurrency: Maximum number of concurrent IO operations per rank. Defaults to 16.
+        max_per_rank_io_concurrency: Maximum number of concurrent IO operations per rank in checkpointing.
+            Defaults to 16.
+        enable_storage_optimization: Enable storage efficiency optimizations for Distributed Checkpointing.
     """

+    # use a more conservative number of concurrent IO operations per rank in Checkpointing
+    # the default value of 16 is too bandwidth hungry for most users
     max_per_rank_io_concurrency: Optional[int] = None
+    # This is a no-op and for future use. This would enable storage efficiency optimizations:
+    # e.g. Compression, Batching, Quantization etc.
+    enable_storage_optimization: bool = False


 @dataclass
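
The new `enable_storage_optimization` field is declared but intentionally inert for now, reserved for future optimizations such as compression, batching, or quantization. Constructing the options follows the usual dataclass pattern (a sketch; the import path is assumed from this file's location):

# Sketch: constructing KnobOptions (import path assumed from the file location).
from torchtnt.framework.callbacks.checkpointer_types import KnobOptions

knobs = KnobOptions(
    max_per_rank_io_concurrency=4,     # more conservative than the implicit default of 16
    enable_storage_optimization=True,  # currently a no-op, reserved for future use
)

The tests above construct `KnobOptions(1)` positionally, which sets `max_per_rank_io_concurrency` to 1 since it is the first declared field.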

torchtnt/framework/callbacks/dcp_saver.py

Lines changed: 49 additions & 13 deletions
@@ -14,6 +14,9 @@
 import torch
 import torch.distributed as dist
 from torch.distributed import checkpoint as dcp
+from torch.distributed.checkpoint.default_planner import DefaultSavePlanner
+from torch.distributed.checkpoint.planner import SavePlanner
+from torch.distributed.checkpoint.storage import StorageWriter

 from torchtnt.framework.callbacks._checkpoint_utils import (
     _prepare_app_state_for_checkpoint,
@@ -127,6 +130,8 @@ def _checkpoint_impl(
         *,
         checkpoint_path: str,
         hook: str,
+        planner: Optional[SavePlanner] = None,
+        storage_writer: Optional[StorageWriter] = None,
     ) -> bool:
         if hook not in ["on_train_step_end", "on_train_epoch_end", "on_train_end"]:
             raise RuntimeError(f"Unexpected hook encountered '{hook}'")
@@ -142,20 +147,36 @@ def _checkpoint_impl(
                 # since this is async checkpointed, so in
                 # future, add logic to set successful flag
                 # only when checkpoint is fully written
-                checkpoint_success = self._async_save(checkpoint_path, app_state)
+                checkpoint_success = self._async_save(
+                    checkpoint_path, app_state, planner, storage_writer
+                )
                 if curr_snapshot_wait:
                     self._wait()
         else:
             with get_timing_context(state, f"{self.__class__.__name__}.save"):
-                checkpoint_success = self._save(checkpoint_path, app_state)
+                checkpoint_success = self._save(
+                    checkpoint_path, app_state, planner, storage_writer
+                )

         return checkpoint_success

     def _wait(self) -> None:
         if self._prev_snapshot is not None:
             self._prev_snapshot.result()

-    def _async_save(self, checkpoint_id: str, app_state: Dict[str, Stateful]) -> bool:
+    def _async_save(
+        self,
+        checkpoint_id: str,
+        app_state: Dict[str, Stateful],
+        planner: Optional[SavePlanner] = None,
+        storage_writer: Optional[StorageWriter] = None,
+    ) -> bool:
+
+        if planner is None:
+            planner = DefaultSavePlanner()
+
+        if storage_writer is None:
+            storage_writer = Writer(checkpoint_id, **self.default_writer_options)

         if self._prev_snapshot is not None:
             if not self._prev_snapshot.done():
@@ -177,24 +198,42 @@ def _async_save(self, checkpoint_id: str, app_state: Dict[str, Stateful]) -> boo

         self._prev_snapshot = dcp.async_save(
             state_dict={"app_state": MultiStateful(app_state)},
+            checkpoint_id=checkpoint_id,
             process_group=self._process_group,
-            storage_writer=Writer(checkpoint_id, **self.default_writer_options),
+            storage_writer=storage_writer,
+            planner=planner,
         )

         return True

-    def _save(self, checkpoint_id: str, app_state: Dict[str, Stateful]) -> bool:
+    def _save(
+        self,
+        checkpoint_id: str,
+        app_state: Dict[str, Stateful],
+        planner: Optional[SavePlanner] = None,
+        storage_writer: Optional[StorageWriter] = None,
+    ) -> bool:
+        # Initialize DefaultSavePlanner and FsspecWriter if not provided
+        if planner is None:
+            planner = DefaultSavePlanner()
+
+        if storage_writer is None:
+            storage_writer = Writer(checkpoint_id, **self.default_writer_options)
+
         try:
             dcp.save(
                 state_dict={"app_state": MultiStateful(app_state)},
+                checkpoint_id=checkpoint_id,
                 process_group=self._process_group,
-                storage_writer=Writer(checkpoint_id, **self.default_writer_options),
+                storage_writer=storage_writer,
+                planner=planner,
             )
         except AttributeError:
             dcp.save_state_dict(
                 state_dict={"app_state": MultiStateful(app_state)},
                 process_group=self._process_group,
-                storage_writer=Writer(checkpoint_id, **self.default_writer_options),
+                storage_writer=storage_writer,
+                planner=planner,
             )

         return True
@@ -229,13 +268,8 @@ def restore(
             process_group: The process group on which the ranks will communicate on. default: ``None`` (the entire world) Note:
                 If torch.distributed is available and a process group is initialized, dcp assumes the intention is to save/load checkpoints in distributed fashion.
             restore_options: Controls what to filter when restoring the state.
-            knob_options: Option is kept for legacy reasons but ignored in DCP
+            knob_options: Additional keyword options for StorageWriter and StorageReader
         """
-        if knob_options is not None:
-            rank_zero_warn(
-                "Ignoring `knob_options` which was passed to DistributedCheckpointSaver.restore, but is not supported."
-            )
-
         storage_reader = Reader(path)

         restore_options = restore_options or RestoreOptions()
@@ -250,6 +284,7 @@
         # request to restore the dataloader state only if
         # the persisted snapshot state includes the dataloader entry
         metadata = storage_reader.read_metadata()
+
         for key in metadata.state_dict_metadata.keys():
             if _TRAIN_DL_STATE_KEY in key:
                 app_state[_TRAIN_DL_STATE_KEY] = train_dataloader
@@ -272,6 +307,7 @@
         try:
             dcp.load(
                 {"app_state": MultiStateful(app_state)},
+                checkpoint_id=path,
                 storage_reader=storage_reader,
                 process_group=process_group,
             )
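
With these hooks in place, a client can subclass the defaults and hand its own components to the save path, exactly as the new tests do with their dummy planner and writer. A minimal sketch of such components (behavior-preserving subclasses; the class names here are hypothetical):

# Sketch: custom components a client might plug in (patterned on the test
# doubles above; names are hypothetical).
from torch.distributed.checkpoint import FileSystemWriter
from torch.distributed.checkpoint.default_planner import DefaultSavePlanner
from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE


class AuditingSavePlanner(DefaultSavePlanner):
    """Records that planning ran, then defers to the default behavior."""

    def set_up_planner(self, state_dict: STATE_DICT_TYPE, is_coordinator: bool) -> None:
        print(f"set_up_planner called (coordinator={is_coordinator})")
        super().set_up_planner(state_dict, is_coordinator)


class AuditingStorageWriter(FileSystemWriter):
    """Same on-disk behavior as FileSystemWriter, with a setup-time log line."""

    def set_up_storage_writer(self, is_coordinator: bool) -> None:
        print(f"set_up_storage_writer called (coordinator={is_coordinator})")
        super().set_up_storage_writer(is_coordinator)

These would then be passed through the save path, e.g. `planner=AuditingSavePlanner(), storage_writer=AuditingStorageWriter(path=...)`, just as the tests pass their dummies to `_save`.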
