vllm/model_executor/models (1 file changed, +6 −0 lines)

@@ -296,6 +296,7 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         cache_config = vllm_config.cache_config
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
+        compilation_config = vllm_config.compilation_config
 
         if cache_config.cache_dtype == "auto":
             kv_cache_dtype = model_config.dtype
@@ -361,6 +362,11 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                 "that mamba page size and attention page size are "
                 "exactly equal.", mamba_padding_pct)
 
+        # enable full cuda graphs for decode-only batches
+        # note (tdoublep): this is currently necessary to
+        # match V0 performance
+        compilation_config.full_cuda_graph = True
+
 
 MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
     "GteModel": SnowflakeGteNewModelConfig,