41 | 41 | get_optimizer_state_dict, |
42 | 42 | set_optimizer_state_dict, |
43 | 43 | ) |
44 | | -from torch.distributed.device_mesh import init_device_mesh |
| 44 | +from torch.distributed.device_mesh import DeviceMesh, init_device_mesh |
45 | 45 | from torch.distributed.fsdp.fully_sharded_data_parallel import FullStateDictConfig |
| 46 | +from torch.distributed.tensor import Shard |
46 | 47 | from torch.distributed.tensor.parallel import parallelize_module |
47 | 48 | from torch.distributed.tensor.parallel.style import ParallelStyle |
48 | 49 | from torchtnt.utils.device_mesh import GlobalMeshCoordinator |
@@ -199,6 +200,8 @@ class FSDP2Strategy(Strategy): |
199 | 200 | reshard_after_forward: If True, reshards parameters post-forward pass to save memory. |
200 | 201 | mp_policy: Controls mixed precision policy. If only dtype is provided, it will be used to cast all relevant parts of model. If None, no mixed precision is used |
201 | 202 | cpu_offload: If True, enables CPU offloading of model parameters to reduce GPU memory usage. |
 | 203 | + shard_on_largest_dim: If True, shards each parameter on its largest dimension. By default FSDP shards on the first dimension; if that dimension is small, the parameter ends up effectively replicated on all ranks, which increases
 | 204 | + memory usage as the world size grows.
202 | 205 |
|
203 | 206 | Note: |
204 | 207 | It is recommended to specify specific modules to shard to avoid unnecessary sharding of all submodules, which has |
@@ -240,6 +243,9 @@ class FSDP2Strategy(Strategy): |
240 | 243 | mp_policy: Optional[Union[str, torch.dtype, MixedPrecisionPolicy]] = None |
241 | 244 | cpu_offload: bool = False |
242 | 245 |
|
| 246 | + # experimental flag |
| 247 | + shard_on_largest_dim: bool = False |
| 248 | + |
243 | 249 |
|
244 | 250 | @dataclass |
245 | 251 | class TPStrategy(Strategy): |
@@ -409,6 +415,7 @@ def prepare_fsdp2( |
409 | 415 | strategy = strategy or FSDP2Strategy() |
410 | 416 |
|
411 | 417 | # prepare kwargs for fully_shard api |
| 418 | + mesh: DeviceMesh |
412 | 419 | if global_mesh is None: |
413 | 420 | pg = dist.distributed_c10d._get_default_group() |
414 | 421 | mesh = init_device_mesh(device.type, mesh_shape=(pg.size(),)) |
@@ -438,6 +445,22 @@ def prepare_fsdp2( |
438 | 445 | reduce_dtype=mp_policy, |
439 | 446 | output_dtype=mp_policy, |
440 | 447 | ) |
| 448 | + if strategy.shard_on_largest_dim: |
| 449 | + |
| 450 | + # From the docs: https://docs.pytorch.org/docs/stable/distributed.fsdp.fully_shard.html |
| 451 | + # "If sharding on a nonzero dim, we currently require even sharding, i.e. the tensor dim size on that dim must be divisible by the FSDP shard mesh size." |
| 452 | + |
| 453 | + # So we shard on a candidate nonzero dim only when it's divisible by the fsdp world size |
| 454 | + |
| 455 | + def _shard_placement_fn(param: torch.nn.Parameter) -> Optional[Shard]: |
| 456 | + largest_dim_size = max(param.shape) |
| 457 | + idx = param.shape.index(largest_dim_size) |
| 458 | + if idx != 0 and largest_dim_size % mesh.size() != 0: |
| 459 | + # not divisible, so we return None to shard on default dim 0 |
| 460 | + return None |
| 461 | + return Shard(idx) |
| 462 | + |
| 463 | + fsdp_kwargs["shard_placement_fn"] = _shard_placement_fn |
441 | 464 |
|
442 | 465 | # parse out the modules_to_shard argument |
443 | 466 | modules_to_shard = strategy.modules_to_shard |
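
For illustration (a sketch, not part of the diff): the decision made by the new `_shard_placement_fn` can be reproduced locally by passing the FSDP mesh size in explicitly, so no process group is needed. The `pick_shard_dim` helper and the example shapes below are hypothetical.

```python
from typing import Optional

import torch
from torch.distributed.tensor import Shard


def pick_shard_dim(param: torch.nn.Parameter, fsdp_world_size: int) -> Optional[Shard]:
    # Mirrors the logic added in prepare_fsdp2: prefer the largest dimension,
    # but only shard a nonzero dim when it divides evenly across the FSDP ranks.
    largest_dim_size = max(param.shape)
    idx = param.shape.index(largest_dim_size)
    if idx != 0 and largest_dim_size % fsdp_world_size != 0:
        return None  # fall back to the default Shard(0)
    return Shard(idx)


# A (4, 4096) weight with 8 FSDP ranks: dim 1 is largest and 4096 % 8 == 0,
# so Shard(1) is chosen instead of the default dim-0 split, which cannot
# give all 8 ranks a non-empty shard.
print(pick_shard_dim(torch.nn.Parameter(torch.empty(4, 4096)), 8))

# A (4, 4097) weight: dim 1 is largest but 4097 % 8 != 0, so None is returned
# and FSDP keeps the default dim-0 sharding.
print(pick_shard_dim(torch.nn.Parameter(torch.empty(4, 4097)), 8))
```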
|
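A minimal usage sketch of the new flag, assuming `FSDP2Strategy` and `prepare_fsdp2` are importable from torchtnt's prepare_module utilities (the import path, model, and exact call signature here are assumptions for illustration):

```python
import torch
import torch.distributed as dist

from torchtnt.utils.prepare_module import FSDP2Strategy, prepare_fsdp2  # assumed import path

# Requires an initialized process group and one CUDA device per rank (e.g. via torchrun).
dist.init_process_group("nccl")
device = torch.device("cuda", dist.get_rank() % torch.cuda.device_count())
torch.cuda.set_device(device)

strategy = FSDP2Strategy(shard_on_largest_dim=True)  # new experimental flag from this diff

model = torch.nn.Sequential(torch.nn.Linear(4, 4096), torch.nn.Linear(4096, 4)).to(device)
module = prepare_fsdp2(model, device, strategy)  # argument order is an assumption
```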