@@ -144,12 +144,13 @@ def prepare(
                 "apply_router_weight_on_input is only implemented for topk=1")
             a1 = a1 * topk_weights.to(a1.dtype)
 
-        if quant_config.per_act_token_quant:
+        if quant_config.is_block_quantized:
+            # Quant and Dispatch
             a1q, a1q_scale = moe_kernel_quantize_input(
                 a1,
                 a1_scale,
                 quant_dtype=quant_config.quant_dtype,
-                per_act_token_quant=True,
+                per_act_token_quant=quant_config.per_act_token_quant,
                 block_shape=quant_config.block_shape,
             )
             if a1q_scale is not None and a1q_scale.numel() == 1:
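The hunk above widens the quantize-before-dispatch path: it is now taken for any block-quantized config rather than only for per-token quantization, and the hardcoded per_act_token_quant=True is replaced with the value carried by quant_config. A minimal sketch of the resulting branch selection, using a hypothetical QuantCfg stand-in for the real quant config object:

# Minimal sketch of the path selection this hunk encodes. QuantCfg is a
# hypothetical stand-in for quant_config, not vLLM's actual config class.
from dataclasses import dataclass
from typing import Optional

@dataclass
class QuantCfg:
    per_act_token_quant: bool = False
    block_shape: Optional[list[int]] = None  # e.g. [128, 128]

    @property
    def is_block_quantized(self) -> bool:
        # Block-quantized means a block shape was configured.
        return self.block_shape is not None

def quantizes_before_dispatch(cfg: QuantCfg) -> bool:
    # After this change: block-quantized activations are quantized before
    # dispatch; everything else is dispatched in bfloat16 and quantized after.
    return cfg.is_block_quantized

assert quantizes_before_dispatch(QuantCfg(block_shape=[128, 128]))
assert not quantizes_before_dispatch(QuantCfg(per_act_token_quant=True))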
@@ -162,16 +163,18 @@ def prepare(
                  rank_topk_weights=topk_weights,
                  num_experts=num_experts)
         else:
-            # DeepEP kernels only support dispatching per-token-quant
-            # quantization. dispatch in bfloat16.
+            # Dispatch and Quant
+            # DeepEP kernels only support dispatching block-quantized
+            # activation scales.
+            # Dispatch in bfloat16
             (expert_x, _, expert_tokens_meta, expert_topk_ids,
              expert_topk_weights) = self._do_dispatch(
                  tokens=a1,
                  token_scales=None,
                  rank_topk_ids=topk_ids,
                  rank_topk_weights=topk_weights,
                  num_experts=num_experts)
-            # quantize now
+            # Quantize after dispatch.
             expert_x_scale = None
             if expert_x.numel() != 0:
                 expert_x, expert_x_scale = moe_kernel_quantize_input(
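In the else branch, activations travel through the DeepEP dispatch in bfloat16 and each rank quantizes only the rows it actually received. A toy illustration of that post-dispatch, per-token quantization, assuming an fp8 e4m3 target (max representable value 448); per_token_quant here is an illustrative stand-in, not the real moe_kernel_quantize_input:

# Toy post-dispatch per-token quantization: one scale per received row.
# Assumes an fp8 e4m3 target; this is not vLLM's actual kernel code.
import torch

FP8_MAX = 448.0  # max representable value of float8_e4m3fn

def per_token_quant(x: torch.Tensor):
    # One scale per token (row), mirroring per-act-token quantization.
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / FP8_MAX
    xq = (x / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return xq, scale

expert_x = torch.randn(16, 128, dtype=torch.bfloat16)  # rows after dispatch
expert_x_scale = None
if expert_x.numel() != 0:  # empty ranks skip quantization, as in the hunk
    expert_x, expert_x_scale = per_token_quant(expert_x.float())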