
Commit 5662826

Author: Robert Shaw (committed)
Commit message: updated
Signed-off-by: Robert Shaw <rshaw@neuralmagic.com>
1 parent 7d5adbe commit 5662826

2 files changed: 11 additions & 26 deletions


vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py

Lines changed: 9 additions & 23 deletions
@@ -480,24 +480,6 @@ def _allocate_dp_chunking_outputs(
 
         return final_shared_hidden_states, final_fused_hidden_states
 
-    def _maybe_overlap_gate_with_shared_experts(
-        self,
-        hidden_states: torch.Tensor,
-        router_logits: torch.Tensor,
-        shared_experts_input: torch.Tensor | None,
-    ) -> torch.Tensor:
-        # If router/gate provided, then apply it here.
-        # (Note: This code runs only when "overlapped mode" is on to allow
-        # parallel execution of shared experts with the FusedMoE via
-        # separate cuda stream)
-        if self.shared_experts is not None:
-            self.shared_experts.maybe_setup_shared_experts_stream(shared_experts_input)
-
-        if self.gate is not None:
-            router_logits, _ = self.gate(hidden_states)
-
-        return router_logits
-
     @property
     def do_naive_dispatch_combine(self) -> bool:
         return (
@@ -621,11 +603,15 @@ def forward_dispatch(
         # TODO(bnell): this can be removed after MK migration is complete.
         layer.ensure_moe_quant_config_init()
 
-        router_logits = self._maybe_overlap_gate_with_shared_experts(
-            hidden_states,
-            router_logits,
-            shared_experts_input,
-        )
+        # Sync aux and main stream for shared expert multi-stream overlap.
+        if self.shared_experts is not None:
+            self.shared_experts.maybe_setup_shared_experts_stream(shared_experts_input)
+
+        # If the Runner holds the gate, apply it after the stream sync,
+        # so it can run overlapped with the shared experts.
+        # NOTE: in future PR, MoE runner will always hold the gate.
+        if self.gate is not None:
+            router_logits, _ = self.gate(hidden_states)
 
         self._maybe_apply_shared_experts(
             shared_experts_input,
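The effect of the inlined version is that the gate GEMM, issued on the main stream right after maybe_setup_shared_experts_stream, can execute concurrently with the shared experts running on the auxiliary stream. Below is a minimal sketch of that overlap pattern, assuming simple gate, shared_expert, and fused_moe callables that stand in for the vLLM modules; only the torch.cuda stream APIs are real, and this is not the DefaultMoERunner implementation.

```python
import torch


def overlapped_moe_forward(hidden_states, gate, shared_expert, fused_moe, aux_stream):
    """Run shared experts on aux_stream while the gate runs on the main stream."""
    main_stream = torch.cuda.current_stream()

    # Allocator safety: hidden_states was produced on the main stream but will
    # be read on the aux stream, so keep its memory from being reused early.
    hidden_states.record_stream(aux_stream)
    # Ordering: aux-stream work may only start once everything already queued
    # on the main stream (the producer of hidden_states) has finished.
    aux_stream.wait_stream(main_stream)

    with torch.cuda.stream(aux_stream):
        # Shared experts execute here, concurrently with the gate GEMM below.
        shared_out = shared_expert(hidden_states)

    # Issued on the main stream; overlaps with the aux-stream work above.
    router_logits = gate(hidden_states)
    fused_out = fused_moe(hidden_states, router_logits)

    # Do not read shared_out until the aux stream has finished producing it.
    main_stream.wait_stream(aux_stream)
    return fused_out + shared_out


if __name__ == "__main__" and torch.cuda.is_available():
    dev = "cuda"
    gate = torch.nn.Linear(64, 8, device=dev)      # produces router logits
    shared = torch.nn.Linear(64, 64, device=dev)   # stand-in shared expert

    def moe(x, logits):
        # Placeholder for the fused MoE kernel.
        return x * torch.sigmoid(logits).mean(dim=-1, keepdim=True)

    x = torch.randn(16, 64, device=dev)
    out = overlapped_moe_forward(x, gate, shared, moe, torch.cuda.Stream())
    print(out.shape)
```

Because wait_stream is issued on the aux stream, it never stalls the main stream, so placing the gate call after the stream setup (as this commit does) lets the gate launch immediately and overlap with the shared-expert work.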

vllm/model_executor/layers/fused_moe/runner/shared_experts.py

Lines changed: 2 additions & 3 deletions
@@ -137,9 +137,8 @@ def maybe_setup_shared_experts_stream(
         # because we synch the streams before using shared_output.
         shared_experts_input.record_stream(self._stream)
 
-        # Mark sync start point for the separate shared experts
-        # stream here since we want to run in parallel with the
-        # router/gate (next op below)
+        # Mark sync start point for the aux stream since we will
+        # run in parallel with router/gate.
         self._stream.wait_stream(current_stream())
 
     def _run_in_aux_stream(
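The two calls kept in this hunk play different roles: record_stream tells PyTorch's caching allocator that shared_experts_input will be read on the aux stream, so its memory is not recycled once the main stream is done with it, while wait_stream marks the point after which aux-stream work may begin. A hypothetical sketch of that setup/run split follows; the AuxStreamRunner class and its method names are illustrative, not the vLLM shared_experts.py API.

```python
import torch


class AuxStreamRunner:
    """Illustrative only: mirrors the setup/run split, not the vLLM class."""

    def __init__(self, shared_experts: torch.nn.Module):
        self.shared_experts = shared_experts
        self._stream = torch.cuda.Stream()

    def setup_stream(self, shared_experts_input: torch.Tensor) -> None:
        # Allocator safety: the input was allocated on the main stream but is
        # read on self._stream, so register that use with the caching allocator.
        shared_experts_input.record_stream(self._stream)
        # Sync start point: aux-stream work begins only after everything the
        # main stream has queued so far, including the input's producer.
        self._stream.wait_stream(torch.cuda.current_stream())

    def run_in_aux_stream(self, shared_experts_input: torch.Tensor) -> torch.Tensor:
        with torch.cuda.stream(self._stream):
            return self.shared_experts(shared_experts_input)

    def finish(self) -> None:
        # The consumer must wait on the aux stream before reading the output,
        # matching the "synch the streams before using shared_output" comment.
        torch.cuda.current_stream().wait_stream(self._stream)
```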
