Asynchronous Saving with Distributed Checkpoint (DCP)
=====================================================

Checkpointing is often a bottleneck in critical distributed training workloads, incurring larger and larger costs as both model and world sizes grow.
One excellent strategy for offsetting this cost is to checkpoint in parallel, asynchronously. Below, we expand the save example
from the `Getting Started with Distributed Checkpoint Tutorial <https://github.com/pytorch/tutorials/blob/main/recipes_source/distributed_checkpoint_recipe.rst>`__
to show how this can be integrated quite easily with `torch.distributed.checkpoint.async_save`.


Notes on Asynchronous Checkpointing
-----------------------------------
Before getting started with asynchronous checkpointing, it's important that we discuss some differences and limitations as compared to synchronous checkpointing.
Specifically:

* Memory requirements - Asynchronous checkpointing works by first copying models into internal CPU buffers.
  This is helpful since it ensures model and optimizer weights are not changing while the model is still checkpointing,
  but does raise CPU memory by a factor of checkpoint size times the number of processes on the host (a rough way to
  estimate this overhead is shown in the sketch after this list).

* Checkpoint Management - Since checkpointing is asynchronous, it is up to the user to manage concurrently running checkpoints. In general, users can
  employ their own management strategies by handling the future object returned from `async_save`. For most users, we recommend limiting
  checkpoints to one asynchronous request at a time, avoiding additional memory pressure per request.
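
To build intuition for the memory overhead described above, the following is a minimal sketch (not a DCP API) of how one
might estimate the extra CPU memory consumed by staging buffers on a single host. The model and the per-host process count
here are illustrative assumptions; note that with sharded state dicts (such as FSDP's), each rank stages only its own shard.

.. code-block:: python

    import torch.nn as nn

    def staging_bytes(module: nn.Module) -> int:
        # Rough size of the CPU staging copy made by one process: every
        # parameter is copied into host memory. (Optimizer state, which is
        # also staged, would add to this.)
        return sum(p.numel() * p.element_size() for p in module.parameters())

    model = nn.Linear(16, 16)  # stand-in for one rank's model shard
    procs_per_host = 8         # assumption: 8 ranks on this host
    total = staging_bytes(model) * procs_per_host
    print(f"~{total / 1e6:.3f} MB of additional CPU memory for staging")

The complete example below integrates `async_save` into an FSDP training loop, waiting on the previous
checkpoint's future before issuing a new request: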

.. code-block:: python

    import os

    import torch
    import torch.distributed as dist
    import torch.distributed.checkpoint as dcp
    import torch.multiprocessing as mp
    import torch.nn as nn

    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict
    from torch.distributed.checkpoint.stateful import Stateful

    CHECKPOINT_DIR = "checkpoint"


    class AppState(Stateful):
        """This is a useful wrapper for checkpointing the Application State. Since this object is compliant
        with the Stateful protocol, DCP will automatically call state_dict/load_state_dict as needed in the
        dcp.save/load APIs.

        Note: We take advantage of this wrapper to handle calling distributed state dict methods on the model
        and optimizer.
        """

        def __init__(self, model, optimizer=None):
            self.model = model
            self.optimizer = optimizer

        def state_dict(self):
            # this line automatically manages FSDP FQNs, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT
            model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer)
            return {
                "model": model_state_dict,
                "optim": optimizer_state_dict
            }

        def load_state_dict(self, state_dict):
            # sets our state dicts on the model and optimizer, now that we've loaded
            set_state_dict(
                self.model,
                self.optimizer,
                model_state_dict=state_dict["model"],
                optim_state_dict=state_dict["optim"]
            )


    class ToyModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.net1 = nn.Linear(16, 16)
            self.relu = nn.ReLU()
            self.net2 = nn.Linear(16, 8)

        def forward(self, x):
            return self.net2(self.relu(self.net1(x)))


    def setup(rank, world_size):
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"

        # initialize the process group
        dist.init_process_group("nccl", rank=rank, world_size=world_size)
        torch.cuda.set_device(rank)


    def cleanup():
        dist.destroy_process_group()


    def run_fsdp_checkpoint_save_example(rank, world_size):
        print(f"Running basic FSDP checkpoint saving example on rank {rank}.")
        setup(rank, world_size)

        # create a model and move it to GPU with id rank
        model = ToyModel().to(rank)
        model = FSDP(model)

        optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

        checkpoint_future = None
        for step in range(10):
            optimizer.zero_grad()
            model(torch.rand(8, 16, device="cuda")).sum().backward()
            optimizer.step()

            state_dict = { "app": AppState(model, optimizer) }
            if checkpoint_future is not None:
                # wait for the previous checkpoint to finish, avoiding queuing
                # more than one checkpoint request at a time
                checkpoint_future.result()
            checkpoint_future = dcp.async_save(state_dict, checkpoint_id=f"{CHECKPOINT_DIR}_step{step}")

        # wait for the final checkpoint request to finish before shutting down
        checkpoint_future.result()

        cleanup()


    if __name__ == "__main__":
        world_size = torch.cuda.device_count()
        print(f"Running fsdp checkpoint example on {world_size} devices.")
        mp.spawn(
            run_fsdp_checkpoint_save_example,
            args=(world_size,),
            nprocs=world_size,
            join=True,
        )
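
Although this recipe focuses on saving, the `AppState` wrapper above also handles restoration through its
`load_state_dict` method. As a brief, hypothetical sketch (assuming `model` and `optimizer` have been constructed
and FSDP-wrapped exactly as in the example, and that the chosen step's save has already completed), a saved
checkpoint can be read back with the synchronous `dcp.load`:

.. code-block:: python

    # minimal sketch: restore the application state saved at some prior step
    step = 5  # assumption: this step's async save has finished
    state_dict = { "app": AppState(model, optimizer) }
    dcp.load(state_dict, checkpoint_id=f"{CHECKPOINT_DIR}_step{step}")

Because `AppState` is `Stateful`, `dcp.load` calls its `load_state_dict` for us, which in turn restores the model
and optimizer in place via `set_state_dict`.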


Even more performance with Pinned Memory
----------------------------------------
If the above optimization is still not performant enough for your use case, PyTorch offers an additional optimization for GPU models by utilizing a pinned memory buffer.
This optimization attacks the main overhead of asynchronous checkpointing, which is the in-memory copying to checkpointing buffers.

Note: The main drawback of this optimization is the persistence of the buffer in between checkpointing steps. Without the pinned memory optimization (as demonstrated above),
any checkpointing buffers are released as soon as checkpointing is finished. With the pinned memory implementation, this buffer is maintained between steps, leading to the same
peak memory pressure being sustained through the application's life.
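
To see why pinned memory helps, the short sketch below (illustrative only, using plain PyTorch tensor APIs rather
than DCP internals) contrasts staging through a pageable CPU buffer with staging through a persistent pinned buffer.
Copies out of GPU memory into a pinned buffer can be issued with `non_blocking=True` and overlap with other work,
which is the effect the cached storage writer exploits:

.. code-block:: python

    import torch

    gpu_param = torch.rand(1024, 1024, device="cuda")  # stand-in for a model weight

    # Pageable staging buffer: allocated per checkpoint, the copy blocks the host.
    pageable_buf = torch.empty(gpu_param.shape, device="cpu")
    pageable_buf.copy_(gpu_param)

    # Pinned staging buffer: allocated once and reused across checkpoints,
    # so the device-to-host copy can run asynchronously.
    pinned_buf = torch.empty(gpu_param.shape, device="cpu", pin_memory=True)
    pinned_buf.copy_(gpu_param, non_blocking=True)
    torch.cuda.synchronize()  # ensure the async copy has landed before use

The full example below enables this behavior through the storage writer: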

.. code-block:: python

    import os

    import torch
    import torch.distributed as dist
    import torch.distributed.checkpoint as dcp
    import torch.multiprocessing as mp
    import torch.nn as nn

    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict
    from torch.distributed.checkpoint.stateful import Stateful
    from torch.distributed.checkpoint import FileSystemWriter as StorageWriter

    CHECKPOINT_DIR = "checkpoint"


    class AppState(Stateful):
        """This is a useful wrapper for checkpointing the Application State. Since this object is compliant
        with the Stateful protocol, DCP will automatically call state_dict/load_state_dict as needed in the
        dcp.save/load APIs.

        Note: We take advantage of this wrapper to handle calling distributed state dict methods on the model
        and optimizer.
        """

        def __init__(self, model, optimizer=None):
            self.model = model
            self.optimizer = optimizer

        def state_dict(self):
            # this line automatically manages FSDP FQNs, as well as sets the default state dict type to FSDP.SHARDED_STATE_DICT
            model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer)
            return {
                "model": model_state_dict,
                "optim": optimizer_state_dict
            }

        def load_state_dict(self, state_dict):
            # sets our state dicts on the model and optimizer, now that we've loaded
            set_state_dict(
                self.model,
                self.optimizer,
                model_state_dict=state_dict["model"],
                optim_state_dict=state_dict["optim"]
            )


    class ToyModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.net1 = nn.Linear(16, 16)
            self.relu = nn.ReLU()
            self.net2 = nn.Linear(16, 8)

        def forward(self, x):
            return self.net2(self.relu(self.net1(x)))


    def setup(rank, world_size):
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"

        # initialize the process group
        dist.init_process_group("nccl", rank=rank, world_size=world_size)
        torch.cuda.set_device(rank)


    def cleanup():
        dist.destroy_process_group()


    def run_fsdp_checkpoint_save_example(rank, world_size):
        print(f"Running basic FSDP checkpoint saving example on rank {rank}.")
        setup(rank, world_size)

        # create a model and move it to GPU with id rank
        model = ToyModel().to(rank)
        model = FSDP(model)

        optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

        # The storage writer defines our 'staging' strategy, where staging is considered the process of copying
        # checkpoints to in-memory buffers. By setting `cache_staged_state_dict=True`, we enable efficient memory copying
        # into a persistent buffer with pinned memory enabled.
        # Note: It's important that the writer persists in between checkpointing requests, since it maintains the
        # pinned memory buffer.
        writer = StorageWriter(cache_staged_state_dict=True, path=CHECKPOINT_DIR)
        checkpoint_future = None
        for step in range(10):
            optimizer.zero_grad()
            model(torch.rand(8, 16, device="cuda")).sum().backward()
            optimizer.step()

            state_dict = { "app": AppState(model, optimizer) }
            if checkpoint_future is not None:
                # wait for the previous checkpoint to finish, avoiding queuing
                # more than one checkpoint request at a time
                checkpoint_future.result()
            checkpoint_future = dcp.async_save(state_dict, storage_writer=writer, checkpoint_id=f"{CHECKPOINT_DIR}_step{step}")

        # wait for the final checkpoint request to finish before shutting down
        checkpoint_future.result()

        cleanup()


    if __name__ == "__main__":
        world_size = torch.cuda.device_count()
        print(f"Running fsdp checkpoint example on {world_size} devices.")
        mp.spawn(
            run_fsdp_checkpoint_save_example,
            args=(world_size,),
            nprocs=world_size,
            join=True,
        )