add warmup

IwakuraRein · IwakuraRein · commit 6cc1f6e5b69b · 2025-08-20T23:39:18.000-07:00
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py
@@ -20,7 +20,7 @@
     from vllm.v1.worker.gpu_worker import Worker
 
 
-def kernel_warmup(worker: "Worker"):
+def kernel_warmup(worker: "Worker", do_autotune: bool = False):
     # Deep GEMM warmup
     do_deep_gemm_warmup = (envs.VLLM_USE_DEEP_GEMM
                            and is_deep_gemm_supported()
@@ -32,10 +32,11 @@ def kernel_warmup(worker: "Worker"):
 
     # FlashInfer autotune for Blackwell (SM 10.0) GPUs
     if has_flashinfer() and current_platform.is_device_capability(100):
-        flashinfer_autotune(worker.model_runner)
+        flashinfer_autotune(worker.model_runner, do_autotune)
 
 
-def flashinfer_autotune(runner: "GPUModelRunner") -> None:
+def flashinfer_autotune(runner: "GPUModelRunner",
+                        do_autotune: bool = True) -> None:
     """
     Autotune FlashInfer operations.
     FlashInfer have many implementations for the same operation,
@@ -47,7 +48,7 @@ def flashinfer_autotune(runner: "GPUModelRunner") -> None:
     """
     from vllm.utils.flashinfer import autotune
 
-    with torch.inference_mode(), autotune():
+    with torch.inference_mode(), autotune(do_autotune):
         # We skip EPLB here since we don't want to record dummy metrics
         # When autotuning with number of tokens m, flashinfer will autotune
         # operations for all number of tokens up to m.
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
@@ -313,7 +313,7 @@ def compile_or_warm_up_model(self) -> None:
             self.model_runner._dummy_run(size, skip_eplb=True)
 
         # run autotuner before cuda graph capture.
-        kernel_warmup(self)
+        kernel_warmup(self, do_autotune=True)
 
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
@@ -339,6 +339,9 @@ def compile_or_warm_up_model(self) -> None:
                 self.model_runner._dummy_sampler_run(
                     hidden_states=last_hidden_states)
 
+        # Warmup kernels used during model execution
+        kernel_warmup(self, do_autotune=False)
+
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)