
Commit 89298ad

fduwjj authored and pytorchmergebot committed
[device_mesh] Implement _unflatten on top of CuTe layout bookkeeping (pytorch#161224)
Pull Request resolved: pytorch#161224
Approved by: https://github.com/lw, https://github.com/fegin
ghstack dependencies: pytorch#164510
1 parent c467e59 commit 89298ad
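
For orientation, here is a minimal sketch of the new private API, based on the `_unflatten` docstring and tests added in this commit. The 8-rank job and the "cuda" device type are assumptions, not part of the commit:

    from torch.distributed.device_mesh import init_device_mesh

    # Start from a 1D "world" mesh over 8 ranks and split dim 0 into dp/cp/tp,
    # as done for the Expert Parallelism (EP) case in the tests below.
    world = init_device_mesh("cuda", (8,), mesh_dim_names=("world",))
    mesh_3d = world._unflatten(0, (2, 2, 2), ("dp", "cp", "tp"))

    # mesh_3d now has dims ("dp", "cp", "tp"), each backed by its own process group.
    tp_group = mesh_3d["tp"].get_group()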

File tree

3 files changed: +267 -0 lines changed


test/distributed/test_device_mesh.py

Lines changed: 87 additions & 0 deletions
@@ -2,6 +2,7 @@
 # Owner(s): ["oncall: distributed"]
 import os
 import unittest
+from datetime import timedelta
 
 import torch
 import torch.distributed as dist
@@ -40,6 +41,13 @@
 device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
 device_count = torch.accelerator.device_count()
 
+try:
+    import torch._C._distributed_c10d.ProcessGroupNCCL
+
+    _NCCL_AVAILABLE = True
+except ImportError:
+    _NCCL_AVAILABLE = False
+
 
 def _set_env_var(addr="localhost", port="25364", world_size=1, rank=0, local_rank=-1):
     os.environ["MASTER_ADDR"] = addr
@@ -962,6 +970,85 @@ def test_flatten_mesh_4d(self):
         # check flattened mesh dependency
         self.assertEqual(dp_cp_mesh._get_root_mesh(), mesh_4d)
 
+    @with_comms
+    def test_unflatten_mesh_2d(self):
+        mesh_shape = (4, 2)
+        mesh_dim_names = ("dp", "tp")
+        mesh_2d = init_device_mesh(
+            self.device_type, mesh_shape, mesh_dim_names=mesh_dim_names
+        )
+        unflatten_mesh = mesh_2d._unflatten(0, (2, 2), ("dp_shard", "dp_replicate"))
+        self.assertEqual(
+            unflatten_mesh.mesh_dim_names, ["dp_shard", "dp_replicate", "tp"]
+        )
+        self.assertEqual(mesh_2d["tp"].mesh, unflatten_mesh["tp"].mesh)
+        self.assertEqual(mesh_2d["tp"].get_group(), unflatten_mesh["tp"].get_group())
+
+        # Not supporting slicing out unflatten dim name from root mesh.
+        with self.assertRaises(KeyError):
+            self.assertEqual(mesh_2d["dp_shard"].mesh, unflatten_mesh["dp_shard"].mesh)
+
+    @with_comms
+    def test_unflatten_mesh_3d(self):
+        # Test unflatten from a dummy world mesh, which is the case we need for Expert Parallelism (EP).
+        global_mesh = init_device_mesh(
+            self.device_type,
+            (8,),
+            mesh_dim_names=("world",),
+        )
+        non_ep_mesh = global_mesh._unflatten(0, (2, 2, 2), ("dp", "cp", "tp"))
+        ep_mesh = global_mesh._unflatten(0, (2, 2, 2), ("dp", "ep", "ep_tp"))
+        self.assertEqual(non_ep_mesh["cp"].mesh, ep_mesh["ep"].mesh)
+        self.assertEqual(non_ep_mesh["tp"].mesh, ep_mesh["ep_tp"].mesh)
+        mesh_3d = global_mesh._unflatten(0, (4, 2, 1), ("dp", "cp", "tp"))
+        unflatten_mesh = mesh_3d._unflatten(0, (2, 2), ("dp_shard", "dp_replicate"))
+        self.assertEqual(
+            unflatten_mesh.mesh_dim_names, ["dp_shard", "dp_replicate", "cp", "tp"]
+        )
+        self.assertEqual(mesh_3d["tp"].mesh, unflatten_mesh["tp"].mesh)
+        self.assertEqual(mesh_3d["tp"].get_group(), unflatten_mesh["tp"].get_group())
+        self.assertEqual(mesh_3d["cp"].mesh, unflatten_mesh["cp"].mesh)
+        self.assertEqual(mesh_3d["cp"].get_group(), unflatten_mesh["cp"].get_group())
+
+        # Test unflatten with backend override set.
+        if not _NCCL_AVAILABLE:
+            return
+        opts = dist.ProcessGroupNCCL.Options()
+        opts._timeout = timedelta(seconds=30)
+        mesh_2d = global_mesh._unflatten(
+            0,
+            (1, 8),
+            ("pp", "spmd"),
+            backend_override={"pp": "fake", "spmd": ("nccl", opts)},
+        )
+        opts = dist.ProcessGroupNCCL.Options()
+        opts._timeout = timedelta(seconds=60)
+        mesh_4d = mesh_2d._unflatten(
+            1,
+            (2, 2, 2),
+            ("dp", "cp", "tp"),
+            backend_override={"dp": "nccl", "cp": "nccl", "tp": ("nccl", opts)},
+        )
+        self.assertEqual(mesh_4d["pp"].get_group()._get_backend_name(), "custom")
+        spmd_pg = mesh_2d["spmd"].get_group()
+        self.assertEqual(spmd_pg._get_backend_name(), "nccl")
+        w = spmd_pg.allreduce(torch.rand(10).cuda(self.rank))
+        self.assertTrue(
+            spmd_pg._get_backend(
+                torch.device(f"cuda:{self.rank}")
+            )._verify_work_timeout(w, timedelta(seconds=30))
+        )
+        w.wait()
+        tp_pg = mesh_4d["tp"].get_group()
+        self.assertEqual(tp_pg._get_backend_name(), "nccl")
+        w = tp_pg.allreduce(torch.rand(10).cuda(self.rank))
+        self.assertTrue(
+            tp_pg._get_backend(torch.device(f"cuda:{self.rank}"))._verify_work_timeout(
+                w, timedelta(seconds=60)
+            )
+        )
+        w.wait()
+
     @with_comms
     def test_reconstruct_mesh_with_flatten_dim(self):
         mesh_3d = init_device_mesh(

torch/distributed/_mesh_layout.py

Lines changed: 47 additions & 0 deletions
@@ -17,6 +17,7 @@
     is_int,
     is_tuple,
     Layout,
+    suffix_product,
 )
 
 
@@ -148,6 +149,52 @@ def complement(self, world_size: int) -> "_MeshLayout":
         layout = complement(self, world_size)
         return _MeshLayout(layout.shape, layout.stride)
 
+    def unflatten(self, dim: int, unflatten_sizes: tuple[int, ...]) -> "_MeshLayout":
+        """
+        Unflatten a single dimension in the layout by splitting it into multiple dimensions.
+        It takes a dimension at position `dim` and splits it into multiple new dimensions
+        with the specified sizes.
+
+        Args:
+            dim (int): The index of the dimension to unflatten. Must be a valid dimension index.
+            unflatten_sizes (tuple[int, ...]): The new sizes for the dimensions that will replace
+                the original dimension at `dim`. The product of these sizes must equal the size
+                of the original dimension at `dim`.
+
+        Returns:
+            _MeshLayout: A new layout with the specified dimension unflattened.
+
+        Example:
+            Original: sizes=(8,), strides=(1,)  # 8 ranks in 1D
+            Call: unflatten(0, (2, 2, 2))  # Create 3D topology
+            Result: sizes=(2, 2, 2), strides=(4, 2, 1)  # 2*2*2 unflattened topology
+        """
+        # Check that dim is within valid range
+        if dim < 0 or dim >= len(self):
+            raise ValueError(
+                f"dim {dim} is out of range for layout with {len(self)} dimensions. "
+                f"Expected dim to be in range [0, {len(self) - 1}]."
+            )
+
+        # Check that the product of unflatten_sizes equals the original dimension size
+        original_size = self[dim].numel()
+        unflatten_product = math.prod(unflatten_sizes)
+        if unflatten_product != original_size:
+            raise ValueError(
+                f"The product of unflatten_sizes {unflatten_sizes} is {unflatten_product}, "
+                f"but the original dimension at dim={dim} has size {original_size}. "
+                f"These must be equal for unflatten to work correctly."
+            )
+
+        sizes = list(self.sizes)  # type: ignore[arg-type]
+        strides = list(self.strides)  # type: ignore[arg-type]
+        unflatten_layout = self[dim].composition(
+            _MeshLayout(tuple(unflatten_sizes), suffix_product(unflatten_sizes))
+        )
+        sizes[dim : dim + 1] = list(unflatten_layout.sizes)  # type: ignore[arg-type]
+        strides[dim : dim + 1] = list(unflatten_layout.strides)  # type: ignore[arg-type]
+        return _MeshLayout(tuple(sizes), tuple(strides))
+
     def all_ranks_from_zero(self) -> list[int]:
         """
         This function computes all the ranks specified by the layout starting from zero.
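
To make the layout bookkeeping concrete, the sketch below redoes the arithmetic for the simple case where the dimension being split is contiguous in the layout: the new strides are the suffix products of the new sizes scaled by the old stride. The helpers `suffix_product` and `unflatten_dim` here are standalone stand-ins written for illustration only; the real `_MeshLayout.unflatten` goes through `composition` and also handles non-contiguous strides.

    import math


    def suffix_product(sizes: tuple[int, ...]) -> tuple[int, ...]:
        # Row-major strides for the given sizes, e.g. (2, 2, 2) -> (4, 2, 1).
        out = [1] * len(sizes)
        for i in range(len(sizes) - 2, -1, -1):
            out[i] = out[i + 1] * sizes[i + 1]
        return tuple(out)


    def unflatten_dim(sizes, strides, dim, new_sizes):
        # Split sizes[dim] into new_sizes; the split strides are the suffix
        # products of new_sizes scaled by the stride of the original dim.
        assert math.prod(new_sizes) == sizes[dim]
        new_strides = tuple(p * strides[dim] for p in suffix_product(new_sizes))
        return (
            sizes[:dim] + new_sizes + sizes[dim + 1 :],
            strides[:dim] + new_strides + strides[dim + 1 :],
        )


    # Matches the docstring example: an 8-rank 1D layout split into a (2, 2, 2) topology.
    assert unflatten_dim((8,), (1,), 0, (2, 2, 2)) == ((2, 2, 2), (4, 2, 1))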

torch/distributed/device_mesh.py

Lines changed: 133 additions & 0 deletions
@@ -353,6 +353,10 @@ def _init_process_groups(
                 -1, self.mesh.size(dim)
             )
             backend, pg_options = backend_override[dim]
+            # We need to explicitly pass in timeout when specified in option, otherwise
+            # the default timeout will be used to override the timeout set in option.
+            # TODO: remove this once we have fixed inside c10d level.
+            timeout = pg_options._timeout if pg_options else None
 
             # If we have a 2D mesh with mesh_dim_names ("dp", "tp"), the group description
             # of the subgroups would be `mesh_dim_dp` and `mesh_name_tp`.
@@ -390,6 +394,7 @@ def _init_process_groups(
             ):
                 dim_group = split_group(
                     parent_pg=default_group,
+                    timeout=timeout,
                     pg_options=pg_options,
                     split_ranks=pg_ranks_by_dim.tolist(),
                     group_desc=group_desc,
@@ -410,6 +415,7 @@ def _init_process_groups(
             if bound_device_id is None or not has_split_group:
                 dim_group = new_group(
                     ranks=subgroup_ranks,
+                    timeout=timeout,
                     backend=backend,
                     pg_options=pg_options,
                     group_desc=group_desc,
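
Taken together, these three changes let a per-dim timeout carried in the backend options actually reach `split_group`/`new_group` instead of being replaced by the default. A hedged sketch of the call pattern, mirroring the NCCL portion of the test above (the 8-rank CUDA job with NCCL available is an assumption):

    from datetime import timedelta

    import torch.distributed as dist
    from torch.distributed.device_mesh import init_device_mesh

    opts = dist.ProcessGroupNCCL.Options()
    opts._timeout = timedelta(seconds=30)  # forwarded via the new `timeout=` plumbing

    world = init_device_mesh("cuda", (8,), mesh_dim_names=("world",))
    mesh_2d = world._unflatten(
        0,
        (1, 8),
        ("pp", "spmd"),
        backend_override={"pp": "fake", "spmd": ("nccl", opts)},
    )
    # The "spmd" group is an NCCL group whose timeout comes from `opts`, not the default.
    spmd_group = mesh_2d["spmd"].get_group()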
@@ -1093,6 +1099,133 @@ def _flatten(
 
         return self._create_flatten_mesh(mesh_dim_name, backend_override_tuple)
 
+    def _create_unflatten_mesh(
+        self,
+        dim: int,
+        mesh_sizes: tuple[int, ...],
+        mesh_dim_names: tuple[str, ...],
+        backend_override: tuple[
+            tuple[Optional[str], Optional[C10dBackend.Options]], ...
+        ] = ((None, None),),
+    ) -> "DeviceMesh":
+        root_mesh = self._get_root_mesh()
+        cur_rank = self.get_rank()
+        unflattened_layout = self._layout.unflatten(dim, mesh_sizes)
+        pg_ranks_by_dim = unflattened_layout.remap_to_tensor(
+            root_mesh.mesh,
+        )
+        unflattened_mesh_dim_names = list(not_none(self.mesh_dim_names))
+        unflattened_mesh_dim_names[dim : dim + 1] = list(mesh_dim_names)
+        res_mesh = DeviceMesh._create_mesh_from_ranks(
+            self.device_type,
+            pg_ranks_by_dim,
+            cur_rank,
+            tuple(unflattened_mesh_dim_names),
+            _init_backend=False,
+            _layout=unflattened_layout,
+            _root_mesh=root_mesh,
+        )
+
+        # If the original mesh has initialized its backend, we need to initialize the
+        # backend of the unflattened dims as well.
+        # TODO: Make backend init more efficient with the CuTe layout representation and
+        # support per-dim backend init.
+        if hasattr(self, "_dim_group_names"):
+            unflatten_length = len(mesh_sizes)
+            unflatten_layout = _MeshLayout(
+                tuple(unflattened_layout.sizes[dim : dim + unflatten_length]),  # type: ignore[index]
+                tuple(unflattened_layout.strides[dim : dim + unflatten_length]),  # type: ignore[index]
+            )
+            unflatten_pg_ranks_by_dim = unflatten_layout.remap_to_tensor(
+                root_mesh.mesh,
+            )
+            unflatten_submesh = DeviceMesh._create_mesh_from_ranks(
+                self.device_type,
+                unflatten_pg_ranks_by_dim,
+                cur_rank,
+                mesh_dim_names,
+                backend_override=backend_override,
+            )
+            dim_group_names = []
+            for idx in range(0, res_mesh.ndim):
+                if idx < dim:
+                    dim_group_names.append(self._dim_group_names[idx])
+                elif idx >= dim + unflatten_length:
+                    dim_group_names.append(
+                        self._dim_group_names[idx - unflatten_length + 1]
+                    )
+                else:
+                    dim_group_names.append(
+                        unflatten_submesh._dim_group_names[idx - dim]
+                    )
+            res_mesh._dim_group_names = dim_group_names
+
+        return res_mesh
+
+    def _unflatten(
+        self,
+        dim: Union[int, str],
+        mesh_sizes: tuple[int, ...],
+        mesh_dim_names: tuple[str, ...],
+        backend_override: Optional[
+            dict[
+                str,
+                Union[str, C10dBackend.Options, tuple[str, C10dBackend.Options]],
+            ]
+        ] = None,
+    ) -> "DeviceMesh":
+        """
+        Returns a DeviceMesh obtained by unflattening one dimension of the current DeviceMesh.
+
+        This API can be used to unflatten an N-D DeviceMesh into an (N - 1 + len(mesh_sizes))-D mesh.
+        The dim is the dimension to be unflattened, which can be either a string or an integer.
+
+        mesh_sizes is a tuple which specifies the shape that the given dim is unflattened into.
+        mesh_dim_names is a tuple of strings which specifies the names of the dimensions the given
+        dim is unflattened into. Its length must match the length of mesh_sizes.
+
+        For example, if we have a 1D mesh DeviceMesh([0, 1, 2, 3, 4, 5, 6, 7], mesh_dim_names=("world",)),
+        calling mesh_1d._unflatten(0, (2, 2, 2), ("dp", "cp", "tp")) will create a 3D mesh
+        DeviceMesh([[[0, 1], [2, 3]], [[4, 5], [6, 7]]], mesh_dim_names=("dp", "cp", "tp")).
+
+        Note that after calling the unflatten, there is no access to the unflattened dimensions in mesh_1d;
+        one can only use the newly unflattened mesh to slice out the unflattened mesh dims.
+        """
+        if isinstance(dim, int) and dim >= self.ndim:
+            raise ValueError(
+                f"dim {dim} specified in `_unflatten` is out of range {self.ndim}"
+            )
+        elif isinstance(dim, str) and dim not in not_none(self.mesh_dim_names):
+            raise ValueError(
+                f"dim {dim} specified in `_unflatten` is not in {self.mesh_dim_names}"
+            )
+
+        if len(mesh_sizes) != len(mesh_dim_names):
+            raise RuntimeError(
+                "mesh_dim_names must have same length as mesh_sizes in _unflatten!"
+            )
+
+        if isinstance(dim, str):
+            dim = not_none(self.mesh_dim_names).index(dim)
+
+        if backend_override is not None:
+            backend_override_tuple = tuple(
+                _normalize_backend_override(
+                    backend_override,  # type: ignore[arg-type]
+                    len(mesh_sizes),
+                    mesh_dim_names,
+                )
+            )
+        else:
+            backend_override_tuple = ((None, None),) * len(mesh_dim_names)
+
+        return self._create_unflatten_mesh(
+            dim,
+            mesh_sizes,
+            mesh_dim_names,
+            backend_override_tuple,
+        )
+
 def _normalize_backend_override(
     backend_override: dict[
         Union[int, str],
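
As a usage note for the docstring above, here is a short sketch of the two entry points it describes, under an assumed 4x2 mesh on 8 CUDA ranks (the string-dim form relies on the name lookup shown in the code above):

    from torch.distributed.device_mesh import init_device_mesh

    mesh_2d = init_device_mesh("cuda", (4, 2), mesh_dim_names=("dp", "tp"))

    # Unflatten the "dp" dim (by name or by index) into ("dp_shard", "dp_replicate").
    # The existing "tp" group is reused; new groups are created only for the new dims.
    mesh_3d = mesh_2d._unflatten("dp", (2, 2), ("dp_shard", "dp_replicate"))

    # mesh_sizes and mesh_dim_names must have the same length, otherwise a
    # RuntimeError is raised before any layout work happens.
    try:
        mesh_2d._unflatten("dp", (2, 2), ("dp_shard",))
    except RuntimeError:
        pass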
