
Commit 23055b8

address comment
Signed-off-by: Siyuan Fu <[email protected]>
1 parent 6cc1f6e

File tree

2 files changed: +7 lines, −10 lines

vllm/model_executor/warmup/kernel_warmup.py

Lines changed: 4 additions & 5 deletions
@@ -20,7 +20,7 @@
 from vllm.v1.worker.gpu_worker import Worker


-def kernel_warmup(worker: "Worker", do_autotune: bool = False):
+def kernel_warmup(worker: "Worker"):
     # Deep GEMM warmup
     do_deep_gemm_warmup = (envs.VLLM_USE_DEEP_GEMM
                            and is_deep_gemm_supported()
@@ -32,11 +32,10 @@ def kernel_warmup(worker: "Worker", do_autotune: bool = False):

     # FlashInfer autotune for Blackwell (SM 10.0) GPUs
     if has_flashinfer() and current_platform.is_device_capability(100):
-        flashinfer_autotune(worker.model_runner, do_autotune)
+        flashinfer_autotune(worker.model_runner)


-def flashinfer_autotune(runner: "GPUModelRunner",
-                        do_autotune: bool = True) -> None:
+def flashinfer_autotune(runner: "GPUModelRunner") -> None:
     """
     Autotune FlashInfer operations.
     FlashInfer have many implementations for the same operation,
@@ -48,7 +47,7 @@ def flashinfer_autotune(runner: "GPUModelRunner",
     """
     from vllm.utils.flashinfer import autotune

-    with torch.inference_mode(), autotune(do_autotune):
+    with torch.inference_mode(), autotune():
         # We skip EPLB here since we don't want to record dummy metrics
         # When autotuning with number of tokens m, flashinfer will autotune
         # operations for all number of tokens up to m.
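In short, the commit drops the do_autotune flag: kernel_warmup now has a single call site that always wants tuning, so flashinfer_autotune tunes unconditionally. Below is a minimal runnable sketch of the resulting flow. Every vLLM dependency is a stand-in, not the real class or function, and the token count 8192 is an illustrative value only; the real code lives in vllm/model_executor/warmup/kernel_warmup.py.

    # Sketch only: stand-ins for the vLLM internals touched by this commit.
    import contextlib


    class GPUModelRunner:
        # Stand-in for vllm.v1.worker.gpu_model_runner.GPUModelRunner.
        def _dummy_run(self, num_tokens: int, skip_eplb: bool = False) -> None:
            print(f"dummy run: {num_tokens} tokens, skip_eplb={skip_eplb}")


    class Worker:
        # Stand-in for vllm.v1.worker.gpu_worker.Worker.
        def __init__(self) -> None:
            self.model_runner = GPUModelRunner()


    @contextlib.contextmanager
    def autotune():
        # Stand-in for vllm.utils.flashinfer.autotune; after this commit it
        # is entered with no argument, i.e. tuning is always enabled.
        print("flashinfer autotune: on")
        yield
        print("flashinfer autotune: off")


    def kernel_warmup(worker: Worker) -> None:
        # The do_autotune flag is gone: warmup now implies autotune.
        flashinfer_autotune(worker.model_runner)


    def flashinfer_autotune(runner: GPUModelRunner) -> None:
        with autotune():
            # Tuning at token count m covers all token counts up to m, so a
            # single dummy run at a large m suffices (8192 is illustrative).
            runner._dummy_run(8192, skip_eplb=True)


    kernel_warmup(Worker())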

vllm/v1/worker/gpu_worker.py

Lines changed: 3 additions & 5 deletions
@@ -312,8 +312,9 @@ def compile_or_warm_up_model(self) -> None:
             logger.info("Compile and warming up model for size %d", size)
             self.model_runner._dummy_run(size, skip_eplb=True)

-        # run autotuner before cuda graph capture.
-        kernel_warmup(self, do_autotune=True)
+        # Warmup and tune the kernels used during model execution before
+        # cuda graph capture.
+        kernel_warmup(self)

         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
@@ -339,9 +340,6 @@ def compile_or_warm_up_model(self) -> None:
             self.model_runner._dummy_sampler_run(
                 hidden_states=last_hidden_states)

-        # Warmup kernels used during model execution
-        kernel_warmup(self, do_autotune=False)
-
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
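This second file is what made the flag removable: compile_or_warm_up_model previously called kernel_warmup twice (with tuning before cuda graph capture, without it after the sampler warmup), and now calls it exactly once, before capture. A condensed, hypothetical sketch of the resulting ordering follows; only the sequence of steps is taken from the diff, and all names and batch sizes are stand-ins.

    # Sketch only: the ordering dummy runs -> kernel_warmup -> graph capture
    # -> seed reset is from the diff; everything else is illustrative.
    import random


    def kernel_warmup(worker) -> None:
        # Stand-in for the warmup entry point shown in the first file.
        print("warm up + autotune execution kernels")


    class FakeWorker:
        enforce_eager = False
        seed = 0

        def compile_or_warm_up_model(self) -> None:
            # 1. Compile/warm the model at each warmup batch size.
            for size in (1, 8, 64):  # illustrative sizes
                print(f"dummy run, size={size}")

            # 2. Tune kernels BEFORE cuda graph capture, so the tuned
            #    kernel choices are the ones recorded inside the graphs.
            kernel_warmup(self)

            # 3. Capture cuda graphs unless eager execution is forced.
            if not self.enforce_eager:
                print("capture cuda graphs")

            # 4. Reset the seed so warmup does not perturb later random state.
            random.seed(self.seed)


    FakeWorker().compile_or_warm_up_model()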
