
Commit 951f6ff

[TorchComms] Support training with EP (#1902)
support _build_mesh_with_ep

Test command:
TEST_BACKEND=nccl TRAIN_FILE=torchtitan.experiments.torchcomms.train CONFIG_FILE="./torchtitan/models/qwen3/train_configs/qwen3_moe_debug.toml" ./run_train.sh --parallelism.expert_parallel_degree 2

Output:
[rank0]:[titan] 2025-10-16 17:32:31,142 - root - INFO - Building qwen3 debugmodel_moe with Qwen3ModelArgs(_enforced='This field is used to enforce all fields have defaults.', dim=256, n_layers=8, n_heads=16, n_kv_heads=8, vocab_size=2048, head_dim=128, hidden_dim=3072, norm_eps=1e-06, rope_theta=1000000, qk_norm=True, max_seq_len=4096, depth_init=True, use_flex_attn=False, attn_mask_type='causal', eos_id=151645, enable_weight_tying=False, moe_enabled=True, moe_inter_dim=768, moe_args=MoEArgs(num_experts=64, num_shared_experts=0, score_func='softmax', route_norm=True, route_scale=1.0, score_before_experts=False, top_k=8, use_grouped_mm=True, load_balance_coeff=0.001, _debug_force_load_balance=False))
...
[rank0]:[titan] 2025-10-16 17:32:40,167 - root - INFO - step: 1 loss: 8.1372 grad_norm: 2.8767 memory: 4.90GiB(5.16%) tps: 1,821 tflops: 0.74 mfu: 0.07%
[rank0]:[titan] 2025-10-16 17:32:40,167 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
[rank0]:/data/users/yifanmao/pytorch/torch/distributed/distributed_c10d.py:1543: UserWarning: Set timeout is now only supported for either nccl or gloo.
[rank0]: warnings.warn("Set timeout is now only supported for either nccl or gloo.")
[rank0]:[titan] 2025-10-16 17:32:40,371 - root - INFO - step: 2 loss: 7.3916 grad_norm: 3.0698 memory: 4.91GiB(5.17%) tps: 80,530 tflops: 32.75 mfu: 3.31%
[rank0]:[titan] 2025-10-16 17:32:40,560 - root - INFO - step: 3 loss: 5.9824 grad_norm: 3.5676 memory: 5.82GiB(6.12%) tps: 86,885 tflops: 35.33 mfu: 3.57%
[rank0]:[titan] 2025-10-16 17:32:40,746 - root - INFO - step: 4 loss: 5.1610 grad_norm: 2.7867 memory: 5.89GiB(6.21%) tps: 88,525 tflops: 36.00 mfu: 3.64%
[rank0]:[titan] 2025-10-16 17:32:40,936 - root - INFO - step: 5 loss: 4.7838 grad_norm: 2.4660 memory: 6.23GiB(6.56%) tps: 86,351 tflops: 35.11 mfu: 3.55%
[rank0]:[titan] 2025-10-16 17:32:41,127 - root - INFO - step: 6 loss: 4.5567 grad_norm: 2.4021 memory: 6.23GiB(6.56%) tps: 86,018 tflops: 34.98 mfu: 3.54%
[rank0]:[titan] 2025-10-16 17:32:41,322 - root - INFO - step: 7 loss: 4.4087 grad_norm: 2.3600 memory: 6.23GiB(6.56%) tps: 84,345 tflops: 34.30 mfu: 3.47%
[rank0]:[titan] 2025-10-16 17:32:41,520 - root - INFO - step: 8 loss: 4.3251 grad_norm: 2.2613 memory: 6.89GiB(7.26%) tps: 82,943 tflops: 33.73 mfu: 3.41%
[rank0]:[titan] 2025-10-16 17:32:41,706 - root - INFO - step: 9 loss: 4.3709 grad_norm: 2.0616 memory: 6.89GiB(7.26%) tps: 88,325 tflops: 35.92 mfu: 3.63%
[rank0]:[titan] 2025-10-16 17:32:41,896 - root - INFO - step: 10 loss: 4.2593 grad_norm: 2.0684 memory: 6.89GiB(7.26%) tps: 86,348 tflops: 35.11 mfu: 3.55%
[rank0]:[titan] 2025-10-16 17:32:41,896 - root - INFO - Sleeping 2 seconds for other ranks to complete
[rank0]:[titan] 2025-10-16 17:32:43,896 - root - INFO - Training completed
[rank0]:[titan] 2025-10-16 17:32:47,371 - root - INFO - Process group destroyed
[rank0]:[rank0]:[W1016 17:32:47.493710282 ProcessGroup.hpp:940] Warning: No backend of type 0 found for Process Group with name undefined. Assuming no hooks are registered. (function hasHooks)
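
For reference, the dp_shard factoring used by the new _build_mesh_with_ep can be checked by hand. The snippet below is a standalone sketch in plain Python (no torchcomms or torchtitan imports); the helper name ep_mesh_shape and the example degrees are illustrative assumptions, not part of the commit.

# Standalone sketch of the dp_shard / ep factoring in _build_mesh_with_ep.
# The helper name and example degrees below are assumptions for illustration.
def ep_mesh_shape(pp, dp_replicate, dp_shard, cp, tp, ep, etp):
    if etp == tp:
        # ep = dp_shard_in_ep * cp
        dp_shard_mod_ep = dp_shard * cp // ep
        dp_shard_in_ep = ep // cp
    else:
        assert etp == 1
        # ep = dp_shard_in_ep * cp * tp
        dp_shard_mod_ep = dp_shard * cp * tp // ep
        dp_shard_in_ep = ep // (cp * tp)
    return (pp, dp_replicate, dp_shard_mod_ep, dp_shard_in_ep, cp, tp)

# Example: 8 ranks, FSDP degree 8, EP degree 2, all other degrees 1.
print(ep_mesh_shape(pp=1, dp_replicate=1, dp_shard=8, cp=1, tp=1, ep=2, etp=1))
# -> (1, 1, 4, 2, 1, 1)

With these example degrees the invariants from the diff hold: dp_shard = dp_shard_mod_ep * dp_shard_in_ep = 8 and ep = dp_shard_in_ep * cp = 2.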
1 parent e43621c commit 951f6ff

File tree: 1 file changed, +242 -75 lines

torchtitan/experiments/torchcomms/parallel_dims.py

Lines changed: 242 additions & 75 deletions
@@ -49,81 +49,143 @@ def _calculate_ranks_per_dimension(
     return ranks_per_dim
 
 
+def _create_device_mesh(
+    world_size: int,
+    mesh_shape: tuple,
+    mesh_dim_names: List[str],
+) -> Dict:
+    """Util function to create device mesh with communicators for each dimension.
+
+    Args:
+        world_size: Total number of ranks in the world
+        mesh_shape: Shape of the device mesh
+        mesh_dim_names: List of dimension names for the mesh
+
+    Returns:
+        Dictionary containing:
+        - comm: Root communicator
+        - device_mesh: Initialized DeviceMesh object
+        - mesh: Tensor representation of the mesh
+        - comm_per_dim: Communicators for each dimension
+        Returns empty dict if initialization fails
+    """
+    backend = os.environ["TEST_BACKEND"]
+    device = torch.device("cuda")
+    mesh = torch.arange(world_size, dtype=torch.int, device="cpu").view(mesh_shape)
+    comm = torchcomms.new_comm(
+        backend,
+        device,
+        name="comms_test_n_d_parallel",
+    )
+
+    cur_rank = comm.get_rank()
+
+    mesh_sizes = [mesh.size(idx) for idx in range(len(mesh_dim_names))]
+    meshes = [mesh] * len(mesh_dim_names)
+    ranks_per_dim = _calculate_ranks_per_dimension(
+        meshes, mesh_dim_names, mesh_sizes, cur_rank
+    )
+
+    # Create sub-communicators for each dimension
+    comm_per_dim = {}
+    for dim_name, ranks in ranks_per_dim.items():
+        comm_per_dim[dim_name] = comm.split(ranks, dim_name)
+
+    # Initialize device mesh with communicators
+    mesh_dim_comms = tuple(comm_per_dim[name] for name in mesh_dim_names)
+    try:
+        device_mesh = init_device_mesh(
+            mesh_dim_comms=mesh_dim_comms,
+            mesh_dim_names=tuple(mesh_dim_names),
+            _global_comm=comm,
+        )
+    except TypeError as e:
+        # TODO: remove this once PT 2.10 is released
+        if "_rank" in str(e):
+            for sub_comm in comm_per_dim.values():
+                sub_comm.finalize()
+            comm.finalize()
+            return {}
+        raise
+
+    return {
+        "comm": comm,
+        "device_mesh": device_mesh,
+        "mesh": mesh,
+        "comm_per_dim": comm_per_dim,
+    }
+
+
+def _flatten_comms(
+    flatten_ranks_per_dim: Dict[str, List[int]],
+    comm,
+    flatten_mesh_dim_names: Dict[str, List[str]],
+    device_mesh: DeviceMesh,
+    comm_per_dim: Dict[str, any],
+) -> None:
+    """Util function to flatten mesh dimensions and create corresponding communicators.
+
+    Args:
+        flatten_ranks_per_dim: Mapping of flattened dimension names to ranks
+        comm: Base communicator
+        flatten_mesh_dim_names: Mapping of flattened names to original dimension names
+        device_mesh: Device mesh to flatten
+        comm_per_dim: Dictionary to store the created communicators
+    """
+    for flatten_dim_name, ranks in flatten_ranks_per_dim.items():
+        comm_per_dim[flatten_dim_name] = comm.split(ranks, flatten_dim_name)
+        sizes = []
+        strides = []
+        for dim_name in flatten_mesh_dim_names[flatten_dim_name]:
+            layout = device_mesh[dim_name]._layout
+            sizes.append(layout.sizes)
+            strides.append(layout.strides)
+        flatten_layout = _MeshLayout(tuple(sizes), tuple(strides))
+        _flatten_with_comm(
+            device_mesh,
+            flatten_dim_name,
+            comm_per_dim[flatten_dim_name],
+            ranks,
+            flatten_layout,
+        )
+
+
 @dataclass
 class TorchCommsParallelDims(ParallelDims):
     def _build_mesh_without_ep(self) -> DeviceMesh:
-        # TODO: support EP
-        dims = []
-        names = []
-        for d, name in zip(
-            [self.pp, self.dp_replicate, self.dp_shard, self.cp, self.tp],
-            ["pp", "dp_replicate", "dp_shard", "cp", "tp"],
-        ):
-            if d > 1:
-                dims.append(d)
-                names.append(name)
+        mesh_shape = (self.pp, self.dp_replicate, self.dp_shard, self.cp, self.tp)
+        mesh_dim_names = ["pp", "dp_replicate", "dp_shard", "cp", "tp"]
+
+        dims = [d for d in mesh_shape if d > 1]
+        names = [name for d, name in zip(mesh_shape, mesh_dim_names) if d > 1]
 
         logger.info(f"Building {len(dims)}-D device mesh with {names}, {dims}")
-        backend = os.environ["TEST_BACKEND"]
-        device = torch.device("cuda")
-        mesh = torch.arange(self.world_size, dtype=torch.int, device="cpu").view(
-            self.pp, self.dp_replicate, self.dp_shard, self.cp, self.tp
-        )
-        comm = torchcomms.new_comm(
-            backend,
-            device,
-            name="comms_test_n_d_parallel",
-        )
 
-        # Get current rank to determine which groups this rank belongs to
-        cur_rank = comm.get_rank()
+        result = _create_device_mesh(self.world_size, mesh_shape, mesh_dim_names)
+        comm = result.get("comm", None)
+        device_mesh = result.get("device_mesh", None)
+        mesh = result.get("mesh", None)
+        comm_per_dim = result.get("comm_per_dim", None)
+        assert (
+            comm is not None
+            and device_mesh is not None
+            and mesh is not None
+            and comm_per_dim is not None
+        ), "fail to init device mesh"
 
-        mesh_dim_names = ["pp", "dp_replicate", "dp_shard", "cp", "tp"]
-        mesh_sizes = [mesh.size(idx) for idx in range(len(mesh_dim_names))]
-        meshes = [mesh] * len(mesh_dim_names)
-        ranks_per_dim = _calculate_ranks_per_dimension(
-            meshes, mesh_dim_names, mesh_sizes, cur_rank
-        )
-        comm_per_dim = {}
-
-        # Create communicators using the new single-list API
-        for dim_name, ranks in ranks_per_dim.items():
-            comm_per_dim[dim_name] = comm.split(ranks, dim_name)
-
-        try:
-            device_mesh = init_device_mesh(
-                mesh_dim_comms=(
-                    comm_per_dim["pp"],
-                    comm_per_dim["dp_replicate"],
-                    comm_per_dim["dp_shard"],
-                    comm_per_dim["cp"],
-                    comm_per_dim["tp"],
-                ),
-                mesh_dim_names=tuple(mesh_dim_names),
-                _global_comm=comm,
-            )
-        except TypeError as e:
-            # TODO: remove this once PT 2.10 is released
-            if "_rank" in str(e):
-                for sub_comm in comm_per_dim.values():
-                    sub_comm.finalize()
-                comm.finalize()
-                return
-            raise
+        cur_rank = comm.get_rank()
 
         flatten_mesh = [
             mesh.view(self.pp, self.dp_replicate * self.dp_shard, self.cp, self.tp),
             mesh.view(self.pp, self.dp_replicate, self.dp_shard * self.cp, self.tp),
             mesh.view(self.pp, self.dp_replicate * self.dp_shard * self.cp, self.tp),
         ]
-
         flattened_mesh_dim_names = ["dp", "dp_shard_cp", "dp_cp"]
         flatten_mesh_dim_names = {
            "dp": ["dp_replicate", "dp_shard"],
            "dp_shard_cp": ["dp_shard", "cp"],
            "dp_cp": ["dp_replicate", "dp_shard", "cp"],
         }
-
         reshape_size = [
             self.dp_replicate * self.dp_shard,
             self.dp_shard * self.cp,
@@ -134,25 +196,130 @@ def _build_mesh_without_ep(self) -> DeviceMesh:
             flatten_mesh, flattened_mesh_dim_names, reshape_size, cur_rank
         )
 
-        for flatten_dim_name, ranks in flatten_ranks_per_dim.items():
-            comm_per_dim[flatten_dim_name] = comm.split(ranks, flatten_dim_name)
-            sizes = []
-            strides = []
-            # This is important because we need to make sure the layout is correct
-            for dim_name in flatten_mesh_dim_names[flatten_dim_name]:
-                layout = device_mesh[dim_name]._layout
-                sizes.append(layout.sizes)
-                strides.append(layout.strides)
-            flatten_layout = _MeshLayout(tuple(sizes), tuple(strides))
-            _flatten_with_comm(
-                device_mesh,
-                flatten_dim_name,
-                comm_per_dim[flatten_dim_name],
-                ranks,
-                flatten_layout,
-            )
-
-        # call .finalize() to release the sub comm before the root comm
+        _flatten_comms(
+            flatten_ranks_per_dim,
+            comm,
+            flatten_mesh_dim_names,
+            device_mesh,
+            comm_per_dim,
+        )
+
+        # Call .finalize() in train.py after training but before destroying the process group
+        # to release sub-communicators before the root communicator.
         self.comms = [*comm_per_dim.values(), comm]
+        return device_mesh
+
+    def _build_mesh_with_ep(self) -> DeviceMesh:
+        # With ep, dp_shard and ep are derived submeshes:
+        # dp_shard = dp_shard_mod_ep * dp_shard_in_ep
+        if self.etp == self.tp:
+            # ep = dp_shard_in_ep * cp
+            dp_shard_mod_ep = self.dp_shard * self.cp // self.ep
+            dp_shard_in_ep = self.ep // self.cp
+        else:
+            assert self.etp == 1
+            # ep = dp_shard_in_ep * cp * tp
+            dp_shard_mod_ep = self.dp_shard * self.cp * self.tp // self.ep
+            dp_shard_in_ep = self.ep // (self.cp * self.tp)
+
+        mesh_shape = (
+            self.pp,
+            self.dp_replicate,
+            dp_shard_mod_ep,
+            dp_shard_in_ep,
+            self.cp,
+            self.tp,
+        )
+        mesh_dim_names = [
+            "pp",
+            "dp_replicate",
+            "dp_shard_mod_ep",
+            "dp_shard_in_ep",
+            "cp",
+            "tp",
+        ]
+
+        dims = [
+            d
+            for d, name in zip(mesh_shape, mesh_dim_names)
+            if d > 1 or name == "dp_shard_mod_ep"
+        ]
+        names = [
+            name
+            for d, name in zip(mesh_shape, mesh_dim_names)
+            if d > 1 or name == "dp_shard_mod_ep"
+        ]
+
+        logger.info(f"Building {len(dims)}-D device mesh with {names}, {dims}")
+
+        result = _create_device_mesh(self.world_size, mesh_shape, mesh_dim_names)
+        comm = result.get("comm", None)
+        device_mesh = result.get("device_mesh", None)
+        mesh = result.get("mesh", None)
+        comm_per_dim = result.get("comm_per_dim", None)
+        assert (
+            comm is not None
+            and device_mesh is not None
+            and mesh is not None
+            and comm_per_dim is not None
+        ), "fail to init device mesh"
+
+        cur_rank = comm.get_rank()
+
+        flatten_mesh = [
+            mesh.view(
+                self.pp,
+                self.dp_replicate * dp_shard_mod_ep * dp_shard_in_ep,
+                self.cp,
+                self.tp,
+            ),
+            mesh.view(
+                self.pp,
+                self.dp_replicate,
+                dp_shard_mod_ep * dp_shard_in_ep * self.cp,
+                self.tp,
+            ),
+            mesh.view(
+                self.pp,
+                self.dp_replicate * dp_shard_mod_ep * dp_shard_in_ep * self.cp,
+                self.tp,
+            ),
+            mesh.view(
+                self.pp,
+                self.dp_replicate,
+                dp_shard_mod_ep,
+                dp_shard_in_ep * self.cp * self.tp,
+            ),
+        ]
+
+        flattened_mesh_dim_names = ["dp", "dp_shard_cp", "dp_cp", "ep"]
+        flatten_mesh_dim_names = {
+            "dp": ["dp_replicate", "dp_shard_mod_ep", "dp_shard_in_ep"],
+            "dp_shard_cp": ["dp_shard_mod_ep", "dp_shard_in_ep", "cp"],
+            "dp_cp": ["dp_replicate", "dp_shard_mod_ep", "dp_shard_in_ep", "cp"],
+            "ep": ["dp_shard_in_ep", "cp", "tp"],
+        }
+
+        reshape_size = [
+            self.dp_replicate * dp_shard_mod_ep * dp_shard_in_ep,
+            dp_shard_mod_ep * dp_shard_in_ep * self.cp,
+            self.dp_replicate * dp_shard_mod_ep * dp_shard_in_ep * self.cp,
+            dp_shard_in_ep * self.cp * self.tp,
+        ]
 
+        flatten_ranks_per_dim = _calculate_ranks_per_dimension(
+            flatten_mesh, flattened_mesh_dim_names, reshape_size, cur_rank
+        )
+
+        _flatten_comms(
+            flatten_ranks_per_dim,
+            comm,
+            flatten_mesh_dim_names,
+            device_mesh,
+            comm_per_dim,
+        )
+
+        # Call .finalize() in train.py after training but before destroying the process group
+        # to release sub-communicators before the root communicator.
+        self.comms = [*comm_per_dim.values(), comm]
         return device_mesh
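
As a side note on how the flattened dimensions ("dp", "dp_shard_cp", "dp_cp", "ep") group ranks: viewing the rank tensor with the merged dimension sizes is what exposes the rank lists that the per-dimension sub-communicators are split over. The snippet below is a standalone CPU illustration with assumed example degrees; it does not use torchcomms, and the variable names only mirror the mesh dimensions above.

import torch

# Assumed example degrees: world_size = 8 with etp == 1, so
# ep = dp_shard_in_ep * cp * tp = 4 and dp_shard = dp_shard_mod_ep * dp_shard_in_ep = 4.
pp, dp_replicate, dp_shard_mod_ep, dp_shard_in_ep, cp, tp = 1, 1, 2, 2, 1, 2
world_size = pp * dp_replicate * dp_shard_mod_ep * dp_shard_in_ep * cp * tp

mesh = torch.arange(world_size, dtype=torch.int).view(
    pp, dp_replicate, dp_shard_mod_ep, dp_shard_in_ep, cp, tp
)

# The "ep" dimension flattens (dp_shard_in_ep, cp, tp): viewing the mesh with
# those trailing dims merged makes each row along the merged axis one EP group.
ep_view = mesh.view(pp, dp_replicate, dp_shard_mod_ep, dp_shard_in_ep * cp * tp)
print(ep_view.reshape(-1, dp_shard_in_ep * cp * tp).tolist())
# -> [[0, 1, 2, 3], [4, 5, 6, 7]]: rank 0 joins the EP group {0, 1, 2, 3}.

In the commit itself these rank lists come out of _calculate_ranks_per_dimension and are then passed to comm.split for each flattened dimension.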
