
Commit 23055b8

address comment
Signed-off-by: Siyuan Fu <[email protected]>
1 parent 6cc1f6e

File tree

2 files changed: +7 lines, −10 lines

vllm/model_executor/warmup/kernel_warmup.py

Lines changed: 4 additions & 5 deletions
@@ -20,7 +20,7 @@
 from vllm.v1.worker.gpu_worker import Worker


-def kernel_warmup(worker: "Worker", do_autotune: bool = False):
+def kernel_warmup(worker: "Worker"):
     # Deep GEMM warmup
     do_deep_gemm_warmup = (envs.VLLM_USE_DEEP_GEMM
                            and is_deep_gemm_supported()
@@ -32,11 +32,10 @@ def kernel_warmup(worker: "Worker", do_autotune: bool = False):

     # FlashInfer autotune for Blackwell (SM 10.0) GPUs
     if has_flashinfer() and current_platform.is_device_capability(100):
-        flashinfer_autotune(worker.model_runner, do_autotune)
+        flashinfer_autotune(worker.model_runner)


-def flashinfer_autotune(runner: "GPUModelRunner",
-                        do_autotune: bool = True) -> None:
+def flashinfer_autotune(runner: "GPUModelRunner") -> None:
     """
     Autotune FlashInfer operations.
     FlashInfer have many implementations for the same operation,
@@ -48,7 +47,7 @@ def flashinfer_autotune(runner: "GPUModelRunner",
     """
     from vllm.utils.flashinfer import autotune

-    with torch.inference_mode(), autotune(do_autotune):
+    with torch.inference_mode(), autotune():
         # We skip EPLB here since we don't want to record dummy metrics
         # When autotuning with number of tokens m, flashinfer will autotune
         # operations for all number of tokens up to m.
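In short, the commit drops the do_autotune flag: kernel_warmup now has a single call site that always wants tuning, so flashinfer_autotune tunes unconditionally. Below is a minimal runnable sketch of the resulting flow. Every vLLM dependency is a stand-in, not the real class or function, and the token count 8192 is an illustrative value only; the real code lives in vllm/model_executor/warmup/kernel_warmup.py.

    # Sketch only: stand-ins for the vLLM internals touched by this commit.
    import contextlib


    class GPUModelRunner:
        # Stand-in for vllm.v1.worker.gpu_model_runner.GPUModelRunner.
        def _dummy_run(self, num_tokens: int, skip_eplb: bool = False) -> None:
            print(f"dummy run: {num_tokens} tokens, skip_eplb={skip_eplb}")


    class Worker:
        # Stand-in for vllm.v1.worker.gpu_worker.Worker.
        def __init__(self) -> None:
            self.model_runner = GPUModelRunner()


    @contextlib.contextmanager
    def autotune():
        # Stand-in for vllm.utils.flashinfer.autotune; after this commit it
        # is entered with no argument, i.e. tuning is always enabled.
        print("flashinfer autotune: on")
        yield
        print("flashinfer autotune: off")


    def kernel_warmup(worker: Worker) -> None:
        # The do_autotune flag is gone: warmup now implies autotune.
        flashinfer_autotune(worker.model_runner)


    def flashinfer_autotune(runner: GPUModelRunner) -> None:
        with autotune():
            # Tuning at token count m covers all token counts up to m, so a
            # single dummy run at a large m suffices (8192 is illustrative).
            runner._dummy_run(8192, skip_eplb=True)


    kernel_warmup(Worker())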

vllm/v1/worker/gpu_worker.py

Lines changed: 3 additions & 5 deletions
@@ -312,8 +312,9 @@ def compile_or_warm_up_model(self) -> None:
             logger.info("Compile and warming up model for size %d", size)
             self.model_runner._dummy_run(size, skip_eplb=True)

-        # run autotuner before cuda graph capture.
-        kernel_warmup(self, do_autotune=True)
+        # Warmup and tune the kernels used during model execution before
+        # cuda graph capture.
+        kernel_warmup(self)

         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
@@ -339,9 +340,6 @@ def compile_or_warm_up_model(self) -> None:
             self.model_runner._dummy_sampler_run(
                 hidden_states=last_hidden_states)

-        # Warmup kernels used during model execution
-        kernel_warmup(self, do_autotune=False)
-
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
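This second file is what made the flag removable: compile_or_warm_up_model previously called kernel_warmup twice (with tuning before cuda graph capture, without it after the sampler warmup), and now calls it exactly once, before capture. A condensed, hypothetical sketch of the resulting ordering follows; only the sequence of steps is taken from the diff, and all names and batch sizes are stand-ins.

    # Sketch only: the ordering dummy runs -> kernel_warmup -> graph capture
    # -> seed reset is from the diff; everything else is illustrative.
    import random


    def kernel_warmup(worker) -> None:
        # Stand-in for the warmup entry point shown in the first file.
        print("warm up + autotune execution kernels")


    class FakeWorker:
        enforce_eager = False
        seed = 0

        def compile_or_warm_up_model(self) -> None:
            # 1. Compile/warm the model at each warmup batch size.
            for size in (1, 8, 64):  # illustrative sizes
                print(f"dummy run, size={size}")

            # 2. Tune kernels BEFORE cuda graph capture, so the tuned
            #    kernel choices are the ones recorded inside the graphs.
            kernel_warmup(self)

            # 3. Capture cuda graphs unless eager execution is forced.
            if not self.enforce_eager:
                print("capture cuda graphs")

            # 4. Reset the seed so warmup does not perturb later random state.
            random.seed(self.seed)


    FakeWorker().compile_or_warm_up_model()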
