fix after flashinfer autotuner

IwakuraRein · IwakuraRein · commit 35a24b32cd68 · 2025-08-20T23:39:18.000-07:00
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -113,6 +113,8 @@ def __init__(self, moe: FusedMoEConfig):
         self.topk_indices_dtype = None
         self.moe = moe
         self.use_marlin = self._should_use_marlin()
+        self.device_support_pdl = current_platform.is_cuda(
+        ) and current_platform.has_device_capability(90)
 
         if current_platform.is_device_capability(100) and not has_flashinfer():
             logger.warning_once(
@@ -520,7 +522,8 @@ def apply(
                 x_scale = None
             else:
                 x_quant, x_scale = mxfp8_quantize(x, False)  # to mxfp8
-                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
+                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
+                    *x.shape[:-1], -1)
             trtllm_gen_output = trtllm_fp4_block_scale_moe(
                 router_logits.to(torch.bfloat16),
                 None,  # routing_bias
@@ -549,6 +552,10 @@ def apply(
                 self._get_tile_tokens_dim(x, top_k),
                 1 if renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
+                self.device_support_pdl,
+                None,  # output
+                # TODO: use the maximum number in the cudagraph_batch_sizes
+                8192,  # tune_max_num_tokens.
             )[0]
             return trtllm_gen_output
         else:
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
@@ -312,6 +312,9 @@ def compile_or_warm_up_model(self) -> None:
             logger.info("Compile and warming up model for size %d", size)
             self.model_runner._dummy_run(size, skip_eplb=True)
 
+        # run autotuner before cuda graph capture.
+        kernel_warmup(self)
+
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
 
@@ -336,9 +339,6 @@ def compile_or_warm_up_model(self) -> None:
                 self.model_runner._dummy_sampler_run(
                     hidden_states=last_hidden_states)
 
-        # Warmup kernels used during model execution
-        kernel_warmup(self)
-
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)