)
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig
+from torchtnt.utils.device_mesh import GlobalMeshCoordinator
from torchtnt.utils.precision import convert_precision_str_to_dtype

try:
@@ -367,7 +368,7 @@ def prepare_fsdp2(
    module: torch.nn.Module,
    device: torch.device,
    strategy: Optional[FSDP2Strategy] = None,
-    process_group: Optional[ProcessGroup] = None,
+    global_mesh: Optional[GlobalMeshCoordinator] = None,
) -> torch.nn.Module:
    """
    Utility to move a module to device and wrap in `FSDP2 <https://pytorch.org/docs/2.6/distributed.fsdp.fully_shard.html>`_
@@ -376,12 +377,18 @@ def prepare_fsdp2(
        module: module to be wrapped in FSDP
        device: device to which module will be moved
        strategy: an instance of :class:`~torchtnt.utils.prepare_module.FSDP2Strategy` which defines the settings of FSDP APIs
+        global_mesh: an instance of :class:`~torchtnt.utils.device_mesh.GlobalMeshCoordinator` which defines the global mesh topology.
+            If not provided, a default 1D mesh covering the entire world size will be created.
    """
    strategy = strategy or FSDP2Strategy()

    # prepare kwargs for fully_shard api
-    pg = process_group or dist.distributed_c10d._get_default_group()
-    mesh = init_device_mesh(device.type, mesh_shape=(pg.size(),))
+    if global_mesh is None:
+        pg = dist.distributed_c10d._get_default_group()
+        mesh = init_device_mesh(device.type, mesh_shape=(pg.size(),))
+    else:
+        mesh = global_mesh.dp_mesh
+
    fsdp_kwargs: Dict[str, Any] = {
        "mesh": mesh,  # TODO we only configure 1D mesh for now, look into supporting HSDP
        "reshard_after_forward": strategy.reshard_after_forward,
@@ -599,6 +606,7 @@ def prepare_module(
    torch_compile_params: Optional[TorchCompileParams] = None,
    activation_checkpoint_params: Optional[ActivationCheckpointParams] = None,
    enable_compiled_autograd: bool = False,
+    global_mesh: Optional[GlobalMeshCoordinator] = None,
) -> torch.nn.Module:
    """
    Utility to move a module to device, set up parallelism, activation checkpointing and compile.
@@ -610,6 +618,8 @@ def prepare_module(
        torch_compile_params: params for Torch compile https://pytorch.org/docs/stable/generated/torch.compile.html.
        activation_checkpoint_params: params for enabling activation checkpointing.
        enable_compiled_autograd: if True, `compiled_autograd` will be used to compile the backward, this is an experimental flag.
+        global_mesh: an instance of :class:`~torchtnt.utils.device_mesh.GlobalMeshCoordinator` which defines the global mesh topology.
+            Pass this only when configuring an HSDP setup with FSDP2.
    """

    if strategy:
@@ -652,7 +662,7 @@ def prepare_module(
            )
            module = prepare_fsdp(module, device, strategy)
        elif isinstance(strategy, FSDP2Strategy):
-            module = prepare_fsdp2(module, device, strategy)
+            module = prepare_fsdp2(module, device, strategy, global_mesh=global_mesh)
        else:
            module = module.to(device)
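
A hedged usage sketch of the updated entry point follows. prepare_module, FSDP2Strategy, and GlobalMeshCoordinator come from the diff above, but the coordinator's constructor is not part of this change, so the sketch takes an already-built coordinator as a parameter; wrap_for_hsdp is a hypothetical helper name.

# Hypothetical helper showing the new call path; assumes a GlobalMeshCoordinator
# has already been constructed elsewhere (its constructor is not shown in this diff).
import torch
from torchtnt.utils.device_mesh import GlobalMeshCoordinator
from torchtnt.utils.prepare_module import FSDP2Strategy, prepare_module


def wrap_for_hsdp(
    module: torch.nn.Module,
    device: torch.device,
    global_mesh: GlobalMeshCoordinator,
) -> torch.nn.Module:
    # With a coordinator, prepare_fsdp2 uses global_mesh.dp_mesh instead of a
    # flat 1D mesh over the default process group, enabling an HSDP layout.
    return prepare_module(
        module,
        device,
        strategy=FSDP2Strategy(),
        global_mesh=global_mesh,
    )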