vllm-project · IwakuraRein · Aug 19, 2025 · Aug 19, 2025 · Aug 20, 2025 · Aug 21, 2025
diff --git a/setup.py b/setup.py
@@ -685,7 +685,7 @@ def _read_requirements(filename: str) -> list[str]:
                   "mistral_common[audio]"],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
         # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.2.12"],
+        "flashinfer": ["flashinfer-python==0.2.13"],
     },
     cmdclass=cmdclass,
     package_data=package_data,

@@ -113,6 +113,8 @@ def __init__(self, moe: FusedMoEConfig):
         self.topk_indices_dtype = None
         self.moe = moe
         self.use_marlin = self._should_use_marlin()
+        self.device_support_pdl = current_platform.is_cuda(
+        ) and current_platform.has_device_capability(90)
 
         if current_platform.is_device_capability(100) and not has_flashinfer():
             logger.warning_once(
@@ -520,7 +522,8 @@ def apply(
                 x_scale = None
             else:
                 x_quant, x_scale = mxfp8_quantize(x, False)  # to mxfp8
-                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
+                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
+                    *x.shape[:-1], -1)
             trtllm_gen_output = trtllm_fp4_block_scale_moe(
                 router_logits.to(torch.bfloat16),
                 None,  # routing_bias
@@ -549,6 +552,10 @@ def apply(
                 self._get_tile_tokens_dim(x, top_k),
                 1 if renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
+                self.device_support_pdl,
+                None,  # output
+                # TODO: use the maximum number in the cudagraph_batch_sizes
+                8192,  # tune_max_num_tokens.
             )[0]
             return trtllm_gen_output
         else:

@@ -312,6 +312,10 @@ def compile_or_warm_up_model(self) -> None:
             logger.info("Compile and warming up model for size %d", size)
             self.model_runner._dummy_run(size, skip_eplb=True)
 
+        # Warmup and tune the kernels used during model execution before
+        # cuda graph capture.
+        kernel_warmup(self)
+
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
 
@@ -336,9 +340,6 @@ def compile_or_warm_up_model(self) -> None:
                 self.model_runner._dummy_sampler_run(
                     hidden_states=last_hidden_states)
 
-        # Warmup kernels used during model execution
-        kernel_warmup(self)
-
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)