
Commit 44124af

simplify a2a kernel dispatching
Signed-off-by: Sage Moore <[email protected]>
1 parent b6d162f commit 44124af

File tree: 4 files changed, +14 / -36 lines

vllm/distributed/device_communicators/all2all.py
vllm/distributed/device_communicators/base_device_communicator.py
vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
vllm/model_executor/layers/fused_moe/layer.py

vllm/distributed/device_communicators/all2all.py

Lines changed: 6 additions & 25 deletions
```diff
@@ -101,25 +101,14 @@ def __init__(self, cpu_group):
         logger.debug("PPLX NVSHMEM UID = %s", uid)
         nvshmem_init(uid, self.rank, self.world_size)
 
-        # self.handle_cache = Cache()
-        self.handle_caches = [Cache(), Cache()]
+        self.handle_cache = Cache()
 
     def get_handle(self, kwargs):
         import pplx_kernels as pplx
-        return self.handle_caches[0].get_or_create(
+        return self.handle_cache.get_or_create(
             kwargs, pplx.AllToAll.internode
             if self.internode else pplx.AllToAll.intranode)
 
-    def get_handles(self, kwargs):
-        import pplx_kernels as pplx
-        first_handle = self.handle_caches[0].get_or_create(
-            kwargs, pplx.AllToAll.internode
-            if self.internode else pplx.AllToAll.intranode)
-        second_handle = self.handle_caches[1].get_or_create(
-            kwargs, pplx.AllToAll.internode
-            if self.internode else pplx.AllToAll.intranode)
-        return [first_handle, second_handle]
-
     def dispatch(self, hidden_states: torch.Tensor,
                  router_logits: torch.Tensor):
         raise NotImplementedError
@@ -128,10 +117,9 @@ def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
         raise NotImplementedError
 
     def destroy(self):
-        for handle_cache in self.handle_caches:
-            with handle_cache._lock:
-                for _, handle in handle_cache._cache.items():
-                    handle.destroy()
+        with self.handle_cache._lock:
+            for _, handle in self.handle_cache._cache.items():
+                handle.destroy()
 
         if self.internode:
             from pplx_kernels.nvshmem import nvshmem_finalize
@@ -148,7 +136,7 @@ def __init__(self, cpu_group):
         assert has_deep_ep(
         ), "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install DeepEP kernels."  # noqa
         super().__init__(cpu_group)
-        self.handle_caches = [Cache(), Cache()]
+        self.handle_cache = Cache()
 
         # This is the DeepEP default. Stick to it till we can establish
         # reasonable defaults based on profiling.
@@ -175,7 +163,6 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
 
     def __init__(self, cpu_group):
         super().__init__(cpu_group)
-        self.handle_cache = self.handle_caches[0]
 
     def _make_all2all_kwargs(self) -> dict[Any, Any]:
         # Defaults for internode and intranode are taken from DeepEP tests.
@@ -224,7 +211,6 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
 
     def __init__(self, cpu_group):
         super().__init__(cpu_group)
-        self.handle_cache = self.handle_caches[0]
 
     def _make_all2all_kwargs(
         self,
@@ -271,8 +257,3 @@ def get_handle(self, kwargs):
         handle: deep_ep.Buffer = self.handle_cache.get_or_create(
             buffer_kwargs, deep_ep.Buffer)
         return handle
-
-    def get_handles(self, kwargs):
-        handle = self.get_handle(kwargs)
-        # For DeepEP we use the same handle for microbatching
-        return [handle, handle]
```
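
For context on the pattern this file converges on: each manager now keeps a single `Cache` keyed by the all-to-all config, and `get_handle()` / `destroy()` operate on that one cache. The sketch below illustrates the idea with toy stand-ins; the `Cache` internals, `FakeHandle`, and `ToyAll2AllManager` are simplified assumptions for illustration, not the real vLLM or pplx classes.

```python
import threading
from typing import Any, Callable


class Cache:
    """Simplified stand-in for the handle cache used by the managers above."""

    def __init__(self):
        self._lock = threading.Lock()
        self._cache: dict[Any, Any] = {}

    def get_or_create(self, kwargs: dict[str, Any], func: Callable[..., Any]):
        # One handle per config: build it on first use, then reuse it.
        key = tuple(sorted(kwargs.items()))
        with self._lock:
            if key not in self._cache:
                self._cache[key] = func(**kwargs)
            return self._cache[key]


class FakeHandle:
    """Stand-in for a pplx AllToAll kernel handle."""

    def __init__(self, **kwargs):
        self.kwargs = kwargs
        self.destroyed = False

    def destroy(self):
        self.destroyed = True


class ToyAll2AllManager:
    """Post-commit shape: one cache, one get_handle(), one destroy()."""

    def __init__(self):
        self.handle_cache = Cache()

    def get_handle(self, kwargs: dict[str, Any]) -> FakeHandle:
        return self.handle_cache.get_or_create(kwargs, FakeHandle)

    def destroy(self):
        # Tear down every cached handle, as in the diff above.
        with self.handle_cache._lock:
            for _, handle in self.handle_cache._cache.items():
                handle.destroy()


manager = ToyAll2AllManager()
h1 = manager.get_handle({"max_num_tokens": 256, "num_experts": 8})
h2 = manager.get_handle({"max_num_tokens": 256, "num_experts": 8})
assert h1 is h2  # same config -> same cached handle
manager.destroy()
assert h1.destroyed
```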

vllm/distributed/device_communicators/base_device_communicator.py

Lines changed: 0 additions & 3 deletions
```diff
@@ -60,9 +60,6 @@ def get_handle(self, kwargs):
         # and reuse it for the same config.
         raise NotImplementedError
 
-    def get_handles(self, kwargs):
-        raise NotImplementedError
-
     def dispatch(self, hidden_states: torch.Tensor,
                  router_logits: torch.Tensor):
         raise NotImplementedError
```
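
With `get_handles()` gone, the base manager contract is just the single-handle accessor plus the dispatch/combine hooks. A rough sketch of that surface is below; the class name and any method not visible in the diffs are assumptions, not the exact vLLM definitions.

```python
import torch


class ToyAll2AllManagerBase:
    """Sketch of the remaining interface after this change: no get_handles()."""

    def get_handle(self, kwargs):
        # Concrete managers build (or fetch a cached) kernel handle for this
        # config and reuse it for the same config.
        raise NotImplementedError

    def dispatch(self, hidden_states: torch.Tensor,
                 router_logits: torch.Tensor):
        raise NotImplementedError

    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
        raise NotImplementedError

    def destroy(self):
        # Optional cleanup hook; concrete managers override it (assumption).
        pass
```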

vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -48,13 +48,13 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     SUPPORTED_HIDDEN_SIZES = [2048, 2560, 4096, 5120, 6144, 7168]
 
     def __init__(self,
-                 buffers: list[deep_ep.Buffer],
+                 buffer: deep_ep.Buffer,
                  max_tokens_per_rank: int,
                  num_dispatchers: int,
                  use_fp8_dispatch: bool = False):
         super().__init__()
 
-        self.buffers = buffers
+        self.buffer = buffer
         self.max_tokens_per_rank = max_tokens_per_rank
         self.use_fp8_dispatch = use_fp8_dispatch
         # The dispatch function returns a handle that the combine function
@@ -154,7 +154,7 @@ def prepare(
         # Dispatch
         dbo_maybe_run_recv_hook()
         expert_x, expert_num_tokens, handle, _, recv_hook = \
-            self.buffers[a2a_idx].low_latency_dispatch(a1,
+            self.buffer.low_latency_dispatch(a1,
                                                  topk_ids,
                                                  self.max_tokens_per_rank,
                                                  num_experts,
@@ -200,7 +200,7 @@ def finalize(
 
         # TODO (varun) : Enable zero copy mode
         dbo_maybe_run_recv_hook()
-        _, _, recv_hook = self.buffers[a2a_idx].low_latency_combine(fused_expert_output,
+        _, _, recv_hook = self.buffer.low_latency_combine(fused_expert_output,
                                                           topk_ids,
                                                           combine_topk_weights,
                                                           handle,
```
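
The prepare/finalize side mirrors the manager change: the class stores one `deep_ep.Buffer` and calls `low_latency_dispatch` / `low_latency_combine` on it directly instead of indexing `self.buffers[a2a_idx]`. A minimal sketch of that shape follows, using a fake buffer and heavily trimmed signatures; the argument lists and return handling are illustrative assumptions, not the real DeepEP API.

```python
def _noop():
    """Placeholder recv hook."""


class FakeBuffer:
    """Stand-in for deep_ep.Buffer, exposing the two calls seen in the diff."""

    def low_latency_dispatch(self, a1, topk_ids, max_tokens_per_rank,
                             num_experts):
        # Pretend dispatch: echo the input back with a dummy handle and hook.
        return a1, len(topk_ids), object(), None, _noop

    def low_latency_combine(self, fused_expert_output, topk_ids,
                            combine_topk_weights, handle):
        return None, None, _noop


class ToyDeepEPLLPrepareAndFinalize:
    """Post-commit shape: a single buffer, no per-microbatch indexing."""

    def __init__(self, buffer, max_tokens_per_rank: int,
                 num_dispatchers: int, use_fp8_dispatch: bool = False):
        self.buffer = buffer
        self.max_tokens_per_rank = max_tokens_per_rank
        self.num_dispatchers = num_dispatchers
        self.use_fp8_dispatch = use_fp8_dispatch

    def prepare(self, a1, topk_ids, num_experts):
        # Dispatch now goes straight through self.buffer.
        expert_x, expert_num_tokens, handle, _, recv_hook = \
            self.buffer.low_latency_dispatch(a1, topk_ids,
                                             self.max_tokens_per_rank,
                                             num_experts)
        return expert_x, expert_num_tokens, handle, recv_hook

    def finalize(self, fused_expert_output, topk_ids, combine_topk_weights,
                 handle):
        # Combine uses the handle returned by the earlier dispatch.
        _, _, recv_hook = self.buffer.low_latency_combine(
            fused_expert_output, topk_ids, combine_topk_weights, handle)
        return recv_hook


pf = ToyDeepEPLLPrepareAndFinalize(FakeBuffer(), max_tokens_per_rank=128,
                                   num_dispatchers=2)
expert_x, _, handle, _ = pf.prepare(a1=[1.0, 2.0], topk_ids=[0, 1],
                                    num_experts=8)
pf.finalize(expert_x, [0, 1], [1.0, 1.0], handle)
```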

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -142,10 +142,10 @@ def _maybe_make_prepare_finalize(
             all_to_all_args[
                 "group_name"] = all2all_manager.cpu_group.group_name
 
-            handles = all2all_manager.get_handles(all_to_all_args)
+            handle = all2all_manager.get_handle(all_to_all_args)
 
             prepare_finalize = PplxPrepareAndFinalize(
-                handles,
+                handle,
                 max_num_tokens=moe.max_num_tokens,
                 num_local_experts=moe.num_local_experts,
                 num_dispatchers=num_dispatchers,
@@ -171,7 +171,7 @@ def _maybe_make_prepare_finalize(
                 num_global_experts=moe.num_experts,
                 num_local_experts=moe.num_experts //
                 all2all_manager.world_size)
-            handles = all2all_manager.get_handles(all_to_all_args)
+            handle = all2all_manager.get_handle(all_to_all_args)
 
             # Note : We may want to use FP8 dispatch even otherwise just to
             # reduce datamovement
@@ -182,7 +182,7 @@ def _maybe_make_prepare_finalize(
                 == DEEPEP_QUANT_BLOCK_SHAPE)
 
             prepare_finalize = DeepEPLLPrepareAndFinalize(
-                handles,
+                handle,
                 max_tokens_per_rank=moe.max_num_tokens,
                 num_dispatchers=all2all_manager.world_size,
                 use_fp8_dispatch=use_fp8_dispatch,
```
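
On the caller side, both backends now follow the same flow: ask the manager for one handle and pass it as the first argument to the prepare/finalize constructor. A hedged sketch of that flow is below; the `Stub*` classes are placeholders, not vLLM types.

```python
class StubManager:
    """Stand-in for an all2all manager that only exposes get_handle()."""

    def get_handle(self, all_to_all_args: dict):
        # A real manager returns a cached kernel handle for this config.
        return ("handle", tuple(sorted(all_to_all_args.items())))


class StubPrepareAndFinalize:
    """Stand-in for PplxPrepareAndFinalize / DeepEPLLPrepareAndFinalize:
    both now take a single handle (or buffer) as their first argument."""

    def __init__(self, handle, **kwargs):
        self.handle = handle
        self.kwargs = kwargs


def make_prepare_finalize(manager: StubManager, all_to_all_args: dict,
                          max_num_tokens: int) -> StubPrepareAndFinalize:
    # One handle in, one prepare/finalize object out, for either backend.
    handle = manager.get_handle(all_to_all_args)
    return StubPrepareAndFinalize(handle, max_num_tokens=max_num_tokens)


pf = make_prepare_finalize(StubManager(),
                           {"max_num_tokens": 256, "num_experts": 8},
                           max_num_tokens=256)
assert pf.handle[0] == "handle"
```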
