 from vllm.v1.worker.gpu_worker import Worker


-def kernel_warmup(worker: "Worker", do_autotune: bool = False):
+def kernel_warmup(worker: "Worker"):
     # Deep GEMM warmup
     do_deep_gemm_warmup = (envs.VLLM_USE_DEEP_GEMM
                            and is_deep_gemm_supported()
@@ -32,11 +32,10 @@ def kernel_warmup(worker: "Worker", do_autotune: bool = False):

     # FlashInfer autotune for Blackwell (SM 10.0) GPUs
     if has_flashinfer() and current_platform.is_device_capability(100):
-        flashinfer_autotune(worker.model_runner, do_autotune)
+        flashinfer_autotune(worker.model_runner)


-def flashinfer_autotune(runner: "GPUModelRunner",
-                        do_autotune: bool = True) -> None:
+def flashinfer_autotune(runner: "GPUModelRunner") -> None:
     """
     Autotune FlashInfer operations.
     FlashInfer have many implementations for the same operation,
@@ -48,7 +47,7 @@ def flashinfer_autotune(runner: "GPUModelRunner",
     """
     from vllm.utils.flashinfer import autotune

-    with torch.inference_mode(), autotune(do_autotune):
+    with torch.inference_mode(), autotune():
         # We skip EPLB here since we don't want to record dummy metrics
         # When autotuning with number of tokens m, flashinfer will autotune
         # operations for all number of tokens up to m.
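Below is a minimal, self-contained sketch of the pattern this change settles on: dummy forward passes run under torch.inference_mode() composed with an autotune() context. The autotune shown here is a hypothetical stand-in for vllm.utils.flashinfer.autotune (the real one drives FlashInfer's autotuner); it only illustrates why the threaded do_autotune flag can be dropped once every call site opts in and the context manager defaults to enabled.

import contextlib
import torch

@contextlib.contextmanager
def autotune(enable: bool = True):
    # Hypothetical stand-in for vllm.utils.flashinfer.autotune: entering
    # the context is what switches kernels into tuning mode, so a caller
    # that always wants tuning needs no boolean to forward.
    print(f"autotune enabled={enable}")
    try:
        yield
    finally:
        print("autotune finished")

def warmup_sketch(model: torch.nn.Module) -> None:
    # A dummy forward pass under inference_mode + autotune(), mirroring
    # the shape of flashinfer_autotune() after the flag removal.
    with torch.inference_mode(), autotune():
        model(torch.randn(8, 16))

warmup_sketch(torch.nn.Linear(16, 16))

As the diff shows, the removed do_autotune parameter was only ever forwarded into autotune(...); with autotuning unconditionally desired on this path, calling autotune() directly is equivalent and removes a plumbing argument from both functions.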