Commit 6f41f0e

Disable CUDA graph for SqueezeLLM (#2161)
1 parent 2c9b638

File tree

1 file changed (+4, -3 lines)


vllm/config.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -185,10 +185,11 @@ def _verify_cuda_graph(self) -> None:
             self.max_context_len_to_capture = self.max_model_len
         self.max_context_len_to_capture = min(self.max_context_len_to_capture,
                                               self.max_model_len)
-        if self.quantization == "gptq" and not self.enforce_eager:
+        if (self.quantization in ["gptq", "squeezellm"]
+                and not self.enforce_eager):
             # Related issue: https://github.com/vllm-project/vllm/issues/2147
-            logger.warning("GPTQ does not support CUDA graph yet. Disabling "
-                           "CUDA graph.")
+            logger.warning(f"{self.quantization} does not support CUDA graph "
+                           "yet. Disabling CUDA graph.")
             self.enforce_eager = True
 
     def verify_with_parallel_config(
```
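To see the change's behavior in isolation, here is a minimal, self-contained sketch of the check this commit extends. `effective_enforce_eager` and `CUDA_GRAPH_UNSUPPORTED_QUANT` are illustrative names invented for this sketch, not part of vLLM's API; the real logic lives in `ModelConfig._verify_cuda_graph` in vllm/config.py.

```python
from typing import Optional
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("vllm.config.sketch")

# Quantization methods that cannot run under CUDA graph capture yet;
# this commit adds "squeezellm" alongside the existing "gptq" entry.
CUDA_GRAPH_UNSUPPORTED_QUANT = ["gptq", "squeezellm"]


def effective_enforce_eager(quantization: Optional[str],
                            enforce_eager: bool) -> bool:
    """Mirror the patched check: force eager mode when the quantization
    method does not support CUDA graphs."""
    if quantization in CUDA_GRAPH_UNSUPPORTED_QUANT and not enforce_eager:
        # Related issue: https://github.com/vllm-project/vllm/issues/2147
        logger.warning(f"{quantization} does not support CUDA graph "
                       "yet. Disabling CUDA graph.")
        return True
    return enforce_eager


# SqueezeLLM now falls back to eager execution, just like GPTQ.
assert effective_enforce_eager("squeezellm", enforce_eager=False) is True
# Unquantized models keep whatever the caller requested.
assert effective_enforce_eager(None, enforce_eager=False) is False
```

In the real config this is a silent fallback rather than an error: the user's `enforce_eager=False` request is overridden to `True` with a warning, so affected models still run, just without CUDA graph capture.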

0 commit comments