
Commit 603e527

ZhangLirong-amd and root authored
Support mori in aiter (ROCm#1453)
* support mori in aiter
* format with copilot
* add prepare comm
* add gpu_per_node
* format some code

Co-authored-by: root <[email protected]>
1 parent 9691959 commit 603e527

File tree

5 files changed: +504 -10 lines changed
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
import torch
import importlib.util
from .base_device_communicator import All2AllManagerBase, Cache
from functools import cache
from aiter import logger


@cache
def _has_module(module_name: str) -> bool:
    """Return True if *module_name* can be found in the current environment.
    The result is cached so that subsequent queries for the same module incur
    no additional overhead.
    """
    return importlib.util.find_spec(module_name) is not None


def has_mori() -> bool:
    """Whether the optional `mori` package is available."""
    return _has_module("mori")


class MoriAll2AllManager(All2AllManagerBase):
    def __init__(self, cpu_group):
        assert has_mori(), (
            "MoRI kernels not found. Please follow https://github.com/ROCm/mori/blob/main/README.md"
            " to install MoRI kernels."
        )  # noqa
        import mori

        super().__init__(cpu_group)
        self.handle_cache = Cache()

        torch._C._distributed_c10d._register_process_group("mori", cpu_group)
        mori.shmem.shmem_torch_process_group_init("mori")

    def _make_all2all_kwargs(
        self,
        rank: int,
        num_ep_ranks: int,
        input_dtype: torch.dtype,
        quant_dtype: torch.dtype,
        token_hidden_size: int,
        scale_dim: int,
        scale_type_size: int,
        max_num_tokens_per_dp_rank: int,
        num_local_experts: int,
        num_experts_per_token: int,
        gpu_per_node: int,
    ):
        import mori  # type: ignore[import-not-found]

        if not self.internode:
            # single node
            kernel_type = mori.ops.EpDispatchCombineKernelType.IntraNode
            warp_num_per_block = 16
            block_num = 80
            rdma_block_num = 0
        else:
            # multi node
            kernel_type = mori.ops.EpDispatchCombineKernelType.InterNodeV1
            warp_num_per_block = 16
            block_num = 32
            rdma_block_num = 16

        return dict(
            rank=rank,
            world_size=num_ep_ranks,
            data_type=quant_dtype,
            hidden_dim=token_hidden_size,
            scale_dim=scale_dim,
            scale_type_size=scale_type_size,
            max_token_type_size=input_dtype.itemsize,
            max_num_inp_token_per_rank=max_num_tokens_per_dp_rank,
            num_experts_per_rank=num_local_experts,
            num_experts_per_token=num_experts_per_token,
            warp_num_per_block=warp_num_per_block,
            block_num=block_num,
            kernel_type=kernel_type,
            rdma_block_num=rdma_block_num,
            gpu_per_node=gpu_per_node,
        )

    def _make_handle(self, **kwargs):
        import mori  # type: ignore[import-not-found]

        mori_config = mori.ops.EpDispatchCombineConfig(**kwargs)
        handle = mori.ops.EpDispatchCombineOp(mori_config)
        return handle

    def get_handle(self, kwargs):
        import mori  # type: ignore[import-not-found]

        mori_kwargs = self._make_all2all_kwargs(**kwargs)
        logger.debug("MoRI all2all args %s", mori_kwargs)
        handle: mori.ops.EpDispatchCombineOp = self.handle_cache.get_or_create(
            mori_kwargs, self._make_handle
        )
        return handle
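For orientation, a hedged usage sketch of the new manager: a caller passes its expert-parallel shape parameters as a dict to get_handle, which expands them through _make_all2all_kwargs and caches one EpDispatchCombineOp per unique configuration. The helper name build_mori_handle, the parameter values, and the dtypes are illustrative assumptions, not part of this commit; the import path is inferred from the relative import used in communicator_cuda.py below.

import torch

# Import path assumed from this commit's file layout
# (communicator_cuda.py does `from .all2all import MoriAll2AllManager`).
from aiter.dist.device_communicators.all2all import MoriAll2AllManager


def build_mori_handle(cpu_group):
    """Hypothetical helper: cpu_group is the CPU process group of the EP ranks."""
    manager = MoriAll2AllManager(cpu_group)  # registers the "mori" shmem process group
    return manager.get_handle(
        dict(
            rank=torch.distributed.get_rank(cpu_group),
            num_ep_ranks=torch.distributed.get_world_size(cpu_group),
            input_dtype=torch.bfloat16,         # activation dtype (illustrative)
            quant_dtype=torch.float8_e4m3fnuz,  # assumed quantized wire dtype
            token_hidden_size=7168,             # model hidden size (illustrative)
            scale_dim=128,                      # per-128-element scales (illustrative)
            scale_type_size=4,                  # fp32 scales
            max_num_tokens_per_dp_rank=4096,
            num_local_experts=32,
            num_experts_per_token=8,
            gpu_per_node=8,
        )
    )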

aiter/dist/device_communicators/base_device_communicator.py

Lines changed: 3 additions & 2 deletions
@@ -123,8 +123,9 @@ def __init__(
         # all2all_backend = config.parallel_config.all2all_backend

         self.is_ep_communicator = "ep" in unique_name
-        self.use_all2all = self.is_ep_communicator and use_ep
-        self.all2all_backend = all2all_backend
+        self.use_all2all = self.is_ep_communicator
+        # self.all2all_backend = all2all_backend
+        self.all2all_backend = "mori"
         self.all2all_manager: All2AllManagerBase | None = None

     def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:

aiter/dist/device_communicators/communicator_cuda.py

Lines changed: 29 additions & 8 deletions
@@ -19,6 +19,9 @@ def __init__(
         device_group: ProcessGroup | None = None,
         unique_name: str = "",
     ):
+        self._all2all_manager = None
+        self._all2all_manager_created = False
+
         super().__init__(cpu_group, device, device_group, unique_name)
         if "tp" not in unique_name:
             # custom allreduce or torch symm mem can be used only by tp
@@ -84,39 +87,57 @@ def __init__(
             # # currently be an MI300 series.
             self.qr_comm = QuickAllReduce(group=self.cpu_group, device=self.device)

-        if self.use_all2all:
+    @property
+    def all2all_manager(self):
+        # Lazily create all2all_manager to avoid tp/dp/ep group haven't been created yet
+        if not self._all2all_manager_created and self.use_all2all:
+            self._all2all_manager_created = True
+
             if self.all2all_backend == "naive":
                 from .all2all import NaiveAll2AllManager

-                self.all2all_manager = NaiveAll2AllManager(self.cpu_group)
+                self._all2all_manager = NaiveAll2AllManager(self.cpu_group)
             elif self.all2all_backend == "allgather_reducescatter":
                 from .all2all import AgRsAll2AllManager

-                self.all2all_manager = AgRsAll2AllManager(self.cpu_group)
+                self._all2all_manager = AgRsAll2AllManager(self.cpu_group)
             elif self.all2all_backend == "pplx":
                 from .all2all import PPLXAll2AllManager

-                self.all2all_manager = PPLXAll2AllManager(self.cpu_group)
+                self._all2all_manager = PPLXAll2AllManager(self.cpu_group)
             elif self.all2all_backend == "deepep_high_throughput":
                 from .all2all import DeepEPHTAll2AllManager

-                self.all2all_manager = DeepEPHTAll2AllManager(self.cpu_group)
+                self._all2all_manager = DeepEPHTAll2AllManager(self.cpu_group)
             elif self.all2all_backend == "deepep_low_latency":
                 from .all2all import DeepEPLLAll2AllManager

-                self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group)
+                self._all2all_manager = DeepEPLLAll2AllManager(self.cpu_group)
+            elif self.all2all_backend == "mori":
+                from .all2all import MoriAll2AllManager
+
+                self._all2all_manager = MoriAll2AllManager(self.cpu_group)
             elif self.all2all_backend == "flashinfer_all2allv":
                 from .all2all import FlashInferAllToAllManager

-                self.all2all_manager = FlashInferAllToAllManager(self.cpu_group)
+                self._all2all_manager = FlashInferAllToAllManager(self.cpu_group)
             else:
                 raise ValueError(f"Unknown all2all backend: {self.all2all_backend}")

             if is_global_first_rank():
                 logger.info(
                     "Using %s all2all manager.",
-                    self.all2all_manager.__class__.__name__,
+                    self._all2all_manager.__class__.__name__,
                 )
+        # if self._all2all_manager is None:
+        #     raise ValueError(f"all2all_manager is None for {self.unique_name}")
+        return self._all2all_manager
+
+    @all2all_manager.setter
+    def all2all_manager(self, value):
+        self._all2all_manager = value
+        if value is not None:
+            self._all2all_manager_created = True

     def all_reduce(
         self, input_, use_new: bool = False, ca_fp8_quant: bool = False
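The property/setter pair above turns all2all_manager into a create-on-first-access attribute, so the manager is only built once the tp/dp/ep groups exist rather than eagerly in __init__. Below is a minimal, self-contained sketch of the same pattern; the class and attribute names are illustrative, not taken from the diff.

class LazyManagerHolder:
    """Minimal sketch of the lazy-initialization pattern used above."""

    def __init__(self, enabled: bool):
        self._manager = None
        self._created = False
        self.enabled = enabled

    @property
    def manager(self):
        # Build the manager on first access instead of in __init__, so any
        # process groups it depends on can be created after this object exists.
        if not self._created and self.enabled:
            self._created = True
            self._manager = object()  # stand-in for the real manager
        return self._manager

    @manager.setter
    def manager(self, value):
        self._manager = value
        if value is not None:
            self._created = True


holder = LazyManagerHolder(enabled=True)
assert holder.manager is holder.manager  # created once, then reused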

aiter/dist/parallel_state.py

Lines changed: 22 additions & 0 deletions
@@ -814,6 +814,10 @@ def recv(
         torch.distributed.recv(tensor, self.ranks[src], self.device_group)
         return tensor

+    def prepare_communication_buffer_for_model(self, model: torch.nn.Module):
+        if self.device_communicator is not None:
+            self.device_communicator.prepare_communication_buffer_for_model(model)
+
     def destroy(self):
         if hasattr(self, "device_group"):
             torch.distributed.destroy_process_group(self.device_group)
@@ -1359,3 +1363,21 @@ def _node_count(pg: ProcessGroup) -> int:
                 node_assignment[other_rank] = next_node_id

     return next_node_id
+
+
+def prepare_communication_buffer_for_model(model: torch.nn.Module):
+    """Prepare the communication buffer for the model.
+    Traditional communication libraries like NCCL are almost
+    model agnostic. However, emerging new communication libraries like
+    MoE all2all (DeepEP) usually allocate the communication buffer
+    based on the model shape for optimal performance.
+    """
+    logger.debug(f"prepare_communication_buffer_for_model: {_TP} {_PP} {_DP} {_EP}")
+    if _TP is not None:
+        _TP.prepare_communication_buffer_for_model(model)
+    if _PP is not None:
+        _PP.prepare_communication_buffer_for_model(model)
+    if _DP is not None:
+        _DP.prepare_communication_buffer_for_model(model)
+    if _EP is not None:
+        _EP.prepare_communication_buffer_for_model(model)
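A hedged sketch of how the new module-level hook would typically be used: once the parallel groups are initialized and the model has been constructed, the hook lets shape-aware backends such as mori or DeepEP size their communication buffers from the model. The call site, the helper name load_model_and_prepare_comm, and the import path are assumptions for illustration, not part of this commit.

import torch

# Import path assumed from the file location (aiter/dist/parallel_state.py).
from aiter.dist import parallel_state


def load_model_and_prepare_comm(build_model) -> torch.nn.Module:
    """Hypothetical call site: build the model, then size the all2all buffers."""
    model = build_model()
    # Fans out to _TP/_PP/_DP/_EP; each group coordinator forwards to its device
    # communicator, which in turn reaches the active all2all manager.
    parallel_state.prepare_communication_buffer_for_model(model)
    return model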
