
Commit 055aa15

diego-urgell authored and facebook-github-bot committed
Add util function to get module state dict (#984)
Summary: Pull Request resolved: #984

Reviewed By: galrotem

Differential Revision: D71218699

fbshipit-source-id: f97209f4b90f5c3978cf5f56cbddc966d3ebb807
1 parent 4da9704 commit 055aa15


2 files changed (+102, -0 lines)

tests/utils/test_prepare_module_gpu.py

Lines changed: 60 additions & 0 deletions
@@ -14,6 +14,10 @@
     FullyShardedDataParallel as FSDP,
     MixedPrecisionPolicy,
 )
+from torchtnt.framework._test_utils import DummyAutoUnit, generate_random_dataloader
+from torchtnt.framework.train import train
+from torchtnt.utils.distributed import get_global_rank
+from torchtnt.utils.prepare_module import get_module_state_dict
 
 try:
     from torch.distributed.fsdp import fully_shard
@@ -404,6 +408,62 @@ def _test_prepare_fsdp2_meta_device() -> None:
             # linear and SimpleModule are fsdp modules
             tc.assertTrue(_is_fsdp_module(submodule))
 
+    def test_get_module_state_dict(self) -> None:
+        spawn_multi_process(
+            2,
+            "nccl",
+            self._test_get_module_state_dict,
+        )
+
+    @staticmethod
+    def _test_get_module_state_dict() -> None:
+        rank = get_global_rank()
+
+        fsdp_strategy = FSDPStrategy(
+            sharding_strategy="FULL_SHARD",
+            auto_wrap_policy=lambda module, recurse, nonwrapped_numel: True,
+        )
+        ddp_strategy = DDPStrategy()
+
+        for strategy, rank0_only in (
+            (fsdp_strategy, True),
+            (fsdp_strategy, False),
+            (ddp_strategy, True),
+            (ddp_strategy, False),
+            (None, True),
+            (None, False),
+        ):
+            module = torch.nn.Sequential(
+                torch.nn.Linear(2, 100),
+                torch.nn.Linear(100, 2),
+            )
+
+            unit = DummyAutoUnit(
+                module=module,
+                strategy=strategy,
+            )
+
+            dataloader = generate_random_dataloader(10, 2, 10)
+            train(unit, dataloader, max_epochs=1)
+
+            module_sd = get_module_state_dict(unit.module, rank0_only=rank0_only)
+
+            tc = unittest.TestCase()
+
+            # For FSDP, if the user passed rank0_only=True, we should get an empty state dict
+            # on all ranks except rank 0
+            if rank0_only and isinstance(strategy, FSDPStrategy) and rank != 0:
+                tc.assertEqual(module_sd, {})
+
+            else:
+                # Make sure that the generated state dict has the actual model keys,
+                # and the values are actual tensors as opposed to ShardedTensor.
+                tc.assertCountEqual(
+                    ["0.weight", "0.bias", "1.weight", "1.bias"],
+                    list(module_sd.keys()),
+                )
+                tc.assertIsInstance(module_sd["0.weight"], torch.Tensor)
+
 
 class SimpleModule(torch.nn.Module):
     def __init__(self, meta_device: bool = False) -> None:

torchtnt/utils/prepare_module.py

Lines changed: 42 additions & 0 deletions
@@ -6,6 +6,7 @@
 
 # pyre-strict
 
+import logging
 from dataclasses import asdict, dataclass
 from functools import partial
 from typing import (
@@ -38,6 +39,7 @@
     set_optimizer_state_dict,
 )
 from torch.distributed.device_mesh import init_device_mesh
+from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
 from torchtnt.utils.precision import convert_precision_str_to_dtype
 
 try:
@@ -85,6 +87,9 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
 from torchtnt.utils.version import is_torch_version_geq
 
 
+logger: logging.Logger = logging.getLogger(__name__)
+
+
 @dataclass
 class Strategy:
     """Dataclass representing a parallelization strategy"""
@@ -680,3 +685,40 @@ def _check_and_convert_mp_policy_dtypes(
     )
 
     return new_mp_policy
+
+
+def get_module_state_dict(
+    module: torch.nn.Module, rank0_only: bool = False
+) -> Dict[str, Any]:
+    """
+    Given a module, return a state dict that can be loaded into a CPU instance of the module. This requires a different implementation depending on the strategy:
+    - If FSDP, we need to gather all the sharded parameters and offload the state dict to CPU in order to avoid OOM.
+    - If DDP, we need to unwrap the module to avoid the extra state_dict prefix.
+    - Otherwise, we can just return the state dict as is.
+
+    Args:
+        module: module to be used.
+        rank0_only: This flag only works for FSDP. If True, only rank 0 will return the state dict. Other ranks will return an empty dict.
+            For DDP or the no-strategy case, we don't move the state dict to CPU -- it can be loaded directly into the module.
+
+    Note: Even if the state_dict parameters are on GPU, it can still be loaded into a CPU module.
+    """
+    logger.info("Generating module state dict")
+
+    # TODO: Add support for FSDP2
+    if isinstance(module, FSDP):
+        state_cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=rank0_only)
+        with FSDP.state_dict_type(module, _StateDictType.FULL_STATE_DICT, state_cfg):
+            return module.state_dict()
+
+    if rank0_only:
+        logger.warning(
+            "Provided rank0_only=True, but this is a no-op for DDP or no strategy. Returning the state dict on the module's device."
+        )
+
+    if isinstance(module, DDP):
+        module = module.module
+
+    state_dict = module.state_dict()
+
+    return state_dict
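
For context, here is a minimal usage sketch of the new helper (not part of this commit). It shows the unwrapped-module path, which needs no distributed setup; with an FSDP- or DDP-wrapped module the same call would gather or unwrap as described in the docstring above. The cpu_copy module and the shapes are illustrative assumptions.

import torch

from torchtnt.utils.prepare_module import get_module_state_dict

# Unwrapped module: get_module_state_dict simply returns module.state_dict().
module = torch.nn.Sequential(
    torch.nn.Linear(2, 100),
    torch.nn.Linear(100, 2),
)
state_dict = get_module_state_dict(module, rank0_only=False)

# The keys match the underlying module, so the dict loads into a fresh CPU copy.
cpu_copy = torch.nn.Sequential(
    torch.nn.Linear(2, 100),
    torch.nn.Linear(100, 2),
)
cpu_copy.load_state_dict(state_dict)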
