
Commit 5633283

tianyu-l authored and pytorchmergebot committed
[reland][DTensor][FSDP2] necessary changes to FSDP and TP to unblock EP (pytorch#158204)
This PR is identical to pytorch#157216, which was reverted because it removed an outdated import of `torch._dynamo` (https://www.internalfb.com/diff/D78021229?transaction_fbid=1713683499308113). The issue has been fixed by @weifengpy in D78199546, so this PR should be good to re-land.

Pull Request resolved: pytorch#158204
Approved by: https://github.com/weifengpy
1 parent 5b10b0a commit 5633283

4 files changed: +10 −93 lines

test/distributed/_composable/test_composability/test_2d_composability.py

Lines changed: 0 additions & 15 deletions
@@ -554,21 +554,6 @@ def _compare_params(self, m1, m2):
                     p2 = p2.redistribute(p2.device_mesh, [Replicate()]).to_local()
                 self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}")
 
-    @with_comms
-    @skip_if_lt_x_gpu(4)
-    def test_raise_invalid_tp_composition(self):
-        with self.assertRaisesRegex(
-            RuntimeError, r"Found TP device_mesh on the \d dimension of its parent mesh"
-        ):
-            mesh_2d = init_device_mesh(
-                self.device_type, (2, self.world_size // 2), mesh_dim_names=("tp", "dp")
-            )
-            parallelize_plan = {
-                "net1": ColwiseParallel(),
-                "net2": RowwiseParallel(),
-            }
-            parallelize_module(SimpleModel().cuda(), mesh_2d["tp"], parallelize_plan)
-
     @with_comms
     @skip_if_lt_x_gpu(4)
     def test_2d_fsdp_state_enable_extension(self):

torch/distributed/fsdp/_fully_shard/_fsdp_param.py

Lines changed: 10 additions & 9 deletions
@@ -292,21 +292,22 @@ def _init_sharded_param(
                 dp_global_mesh is None or tp_global_mesh is None
             ):
                 raise AssertionError(
-                    "FSDP requires the DP and TP mesh to have the same parent mesh but got: \n"
-                    f"DP's global mesh: {dp_global_mesh}\nTP's global mesh: {tp_global_mesh}"
+                    "FSDP requires the DP and model parallel TP/EP mesh to have the same parent mesh but got: \n"
+                    f"DP's global mesh: {dp_global_mesh}\nTP/EP's global mesh: {tp_global_mesh}"
                 )
             name_dims_error = "FSDP requires named DeviceMesh dims for ND parallelism"
             assert dp_mesh.mesh_dim_names is not None, name_dims_error
             assert tp_mesh.mesh_dim_names is not None, name_dims_error
             submesh_names = dp_mesh.mesh_dim_names + tp_mesh.mesh_dim_names
             self._spmd_mesh = dp_global_mesh[submesh_names]
-            if len(self._tp_spec.placements) != 1:
+            if len(self._tp_spec.placements) > 2:
                 raise NotImplementedError(
-                    f"FSDP only supports 1D TP, not {self._tp_spec.placements}"
+                    f"FSDP only supports 1D TP/EP or 2D EP+TP, not {self._tp_spec.placements}"
                 )
             split_factor = self._tp_spec.num_shards_map[shard_dim]
-            assert 2 <= self._spmd_mesh.ndim <= 3, (
-                f"_spmd_mesh.ndim can only be 2 or 3 but got {self._spmd_mesh.ndim}."
+            assert 2 <= self._spmd_mesh.ndim <= 4, (
+                "_spmd_mesh.ndim can only be 2 (FSDP+TP/EP), 3 (FSDP+EP+TP, HSDP+TP/EP), "
+                f"or 4 (HSDP+EP+TP) but got {self._spmd_mesh.ndim}."
             )
             self._spmd_placements: tuple[Placement, ...]
             dp_shard_tp_placement = (
@@ -315,11 +316,11 @@ def _init_sharded_param(
                     if split_factor > 1
                     else fsdp_placement
                 ),
-                self._tp_spec.placements[0],
+                *self._tp_spec.placements,
             )
-            if self._spmd_mesh.ndim == 2:
+            if dp_mesh.ndim == 1:  # FSDP
                 self._spmd_placements = dp_shard_tp_placement
-            else:
+            else:  # HSDP
                 assert self.mesh_info.replicate_mesh_dim == 0
                 self._spmd_placements = (Replicate(),) + dp_shard_tp_placement
             self._sharding_spec = DTensorSpec(
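For context, the relaxed checks above are meant to admit parameters whose DTensor spec carries two model-parallel placements (EP + TP) under an FSDP "dp" mesh that shares the same parent mesh. The following is a minimal, hypothetical sketch of that composition; the 3D mesh shape, the `Experts` module, and the chosen placements are illustrative assumptions rather than code from this PR, and it assumes a `torchrun` launch with 8 CUDA ranks:

import torch
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.fsdp import fully_shard
from torch.distributed.tensor import Shard, distribute_tensor

# One parent mesh, so the DP and EP/TP submeshes resolve to the same root mesh
# (the AssertionError above still requires this).
mesh_3d = init_device_mesh("cuda", (2, 2, 2), mesh_dim_names=("dp", "ep", "tp"))


class Experts(nn.Module):
    # Hypothetical expert-weight holder; forward is omitted because the sketch
    # only exercises parameter initialization.
    def __init__(self) -> None:
        super().__init__()
        # Two placements (EP shard on dim 0, TP shard on dim 2): previously
        # rejected by `len(placements) != 1`, now admitted by the `> 2` check.
        weight = distribute_tensor(
            torch.randn(4, 16, 16), mesh_3d["ep", "tp"], [Shard(0), Shard(2)]
        )
        self.weight = nn.Parameter(weight)


experts = Experts()
# fully_shard over "dp": _spmd_mesh becomes mesh_3d["dp", "ep", "tp"] (ndim 3,
# within the new 2 <= ndim <= 4 bound), and the FSDP shard placement is
# prepended to the parameter's (EP, TP) placements via `*self._tp_spec.placements`.
fully_shard(experts, mesh=mesh_3d["dp"])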

torch/distributed/tensor/parallel/_utils.py

Lines changed: 0 additions & 67 deletions
This file was deleted.

torch/distributed/tensor/parallel/api.py

Lines changed: 0 additions & 2 deletions
@@ -6,7 +6,6 @@
 import torch
 import torch.nn as nn
 from torch.distributed.device_mesh import _mesh_resources, DeviceMesh
-from torch.distributed.tensor.parallel._utils import _validate_tp_mesh_dim
 from torch.distributed.tensor.parallel.style import ParallelStyle
 
 
@@ -71,7 +70,6 @@ def parallelize_module(  # type: ignore[return]
     torch._C._log_api_usage_once("torch.distributed.tensor.parallel.parallelize_module")
 
     device_mesh = device_mesh or _mesh_resources.get_current_mesh()
-    _validate_tp_mesh_dim(device_mesh)
 
     if parallelize_plan is None:
         warnings.warn(
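For reference, with `_validate_tp_mesh_dim` removed (and the corresponding test deleted above), a call where the "tp" dim is not the innermost dimension of its parent mesh is no longer rejected up front. This is a hedged sketch only; the mesh shape and the two-layer model are illustrative and assume a `torchrun` launch with 4 CUDA ranks:

import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

# ("tp", "dp") ordering puts TP on mesh dim 0, the layout that the deleted
# test_raise_invalid_tp_composition expected to raise a RuntimeError.
mesh_2d = init_device_mesh("cuda", (2, 2), mesh_dim_names=("tp", "dp"))

model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 8)).cuda()
parallelize_module(
    model,
    mesh_2d["tp"],
    {"0": ColwiseParallel(), "1": RowwiseParallel()},
)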
