2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -373,7 +373,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# Install FlashInfer from source
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with "flashinfer" extra in setup.py
ARG FLASHINFER_GIT_REF="v0.2.12"
ARG FLASHINFER_GIT_REF="v0.2.13"
# Flag to control whether to compile FlashInfer AOT kernels
# Set to "true" to enable AOT compilation:
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
2 changes: 1 addition & 1 deletion setup.py
@@ -694,7 +694,7 @@ def _read_requirements(filename: str) -> list[str]:
"mistral_common[audio]"], # Required for audio processing
"video": [], # Kept for backwards compatibility
# FlashInfer should be updated together with the Dockerfile
"flashinfer": ["flashinfer-python==0.2.12"],
"flashinfer": ["flashinfer-python==0.2.13"],
},
cmdclass=cmdclass,
package_data=package_data,
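As the Dockerfile comment notes, the FlashInfer pin must stay in sync between `FLASHINFER_GIT_REF` in docker/Dockerfile and the `"flashinfer"` extra in setup.py. A minimal sketch of a consistency check, not part of this PR, assuming the repository-root paths and the quoting style shown in the hunks above:

```python
import re
from pathlib import Path

# Hypothetical consistency check; paths assume the vLLM repository root.
dockerfile_ref = re.search(
    r'FLASHINFER_GIT_REF="v([\d.]+)"',
    Path("docker/Dockerfile").read_text(),
).group(1)
setup_pin = re.search(
    r"flashinfer-python==([\d.]+)",
    Path("setup.py").read_text(),
).group(1)

assert dockerfile_ref == setup_pin, (
    f"FlashInfer out of sync: Dockerfile v{dockerfile_ref} vs setup.py {setup_pin}"
)
```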
5 changes: 4 additions & 1 deletion vllm/model_executor/layers/quantization/mxfp4.py
@@ -520,7 +520,8 @@ def apply(
x_scale = None
else:
x_quant, x_scale = mxfp8_quantize(x, False) # to mxfp8
-x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
+x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
+    *x.shape[:-1], -1)
trtllm_gen_output = trtllm_fp4_block_scale_moe(
router_logits.to(torch.bfloat16),
None, # routing_bias
@@ -549,6 +550,8 @@ def apply(
self._get_tile_tokens_dim(x, top_k),
1 if renormalize else 0, # routing_method_type, renormalize
True, # do finalize
+# TODO: use the maximum number in the cudagraph_batch_sizes
+tune_max_num_tokens=8192,
)[0]
return trtllm_gen_output
else:
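In the mxfp4 path, the MXFP8 activation scales are now reshaped to keep the batch/token dimensions instead of being flattened to 1-D before the `trtllm_fp4_block_scale_moe` call, and `tune_max_num_tokens=8192` is passed for autotuning. A minimal sketch of the layout difference, with hypothetical sizes and `torch.zeros` standing in for the real `mxfp8_quantize` output:

```python
import torch

# Hypothetical sizes; one scale byte per 32-element block of the hidden dim.
num_tokens, hidden, block = 4, 128, 32
x = torch.randn(num_tokens, hidden, dtype=torch.bfloat16)
x_scale = torch.zeros(num_tokens * hidden // block, dtype=torch.uint8)

# Old layout: everything collapsed into one dimension.
flat = x_scale.view(torch.float8_e4m3fn).reshape(-1)
print(flat.shape)        # torch.Size([16])

# New layout: batch/token dims preserved, only the per-block axis flattened.
per_token = x_scale.view(torch.float8_e4m3fn).reshape(*x.shape[:-1], -1)
print(per_token.shape)   # torch.Size([4, 4])
```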
7 changes: 4 additions & 3 deletions vllm/v1/worker/gpu_worker.py
@@ -312,6 +312,10 @@ def compile_or_warm_up_model(self) -> None:
logger.info("Compile and warming up model for size %d", size)
self.model_runner._dummy_run(size, skip_eplb=True)

+# Warm up and tune the kernels used during model execution before
+# CUDA graph capture.
+kernel_warmup(self)

if not self.model_config.enforce_eager:
self.model_runner.capture_model()

@@ -336,9 +340,6 @@ def compile_or_warm_up_model(self) -> None:
self.model_runner._dummy_sampler_run(
hidden_states=last_hidden_states)

-# Warmup kernels used during model execution
-kernel_warmup(self)

# Reset the seed to ensure that the random state is not affected by
# the model initialization and profiling.
set_random_seed(self.model_config.seed)
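With this change, `kernel_warmup` runs before CUDA graph capture rather than after the sampler warmup, so any kernel autotuning happens before the graphs are recorded. A simplified, self-contained sketch of the new ordering in `compile_or_warm_up_model` (all classes and sizes below are stubs for illustration, not the real vLLM objects):

```python
class _Runner:
    """Stub for the model runner; only the call order matters here."""

    def _dummy_run(self, size: int, skip_eplb: bool = True) -> None:
        print(f"dummy run for size {size}")

    def capture_model(self) -> None:
        print("capturing CUDA graphs")


def kernel_warmup(worker: "Worker") -> None:
    print("warming up / autotuning kernels")


class Worker:
    def __init__(self, enforce_eager: bool = False) -> None:
        self.model_runner = _Runner()
        self.enforce_eager = enforce_eager

    def compile_or_warm_up_model(self) -> None:
        for size in (1, 8, 64):                  # placeholder warmup sizes
            self.model_runner._dummy_run(size, skip_eplb=True)

        kernel_warmup(self)                      # tune/warm kernels first...

        if not self.enforce_eager:
            self.model_runner.capture_model()    # ...then capture CUDA graphs


Worker().compile_or_warm_up_model()
```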