Commit 3a765bd

Temporarily enforce eager mode for GPTQ models (#2154)
1 parent 26c52a5 commit 3a765bd

File tree

1 file changed: +5 -0 lines changed


vllm/config.py

Lines changed: 5 additions & 0 deletions
@@ -185,6 +185,11 @@ def _verify_cuda_graph(self) -> None:
             self.max_context_len_to_capture = self.max_model_len
         self.max_context_len_to_capture = min(self.max_context_len_to_capture,
                                               self.max_model_len)
+        if self.quantization == "gptq" and not self.enforce_eager:
+            # Related issue: https://github.com/vllm-project/vllm/issues/2147
+            logger.warning("GPTQ does not support CUDA graph yet. Disabling "
+                           "CUDA graph.")
+            self.enforce_eager = True
 
     def verify_with_parallel_config(
         self,
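
For context, below is a minimal standalone sketch of the guard this commit adds: when quantization is "gptq" and CUDA graph capture is requested (enforce_eager=False), the config falls back to eager execution and logs a warning instead of failing later at graph-capture time. MiniModelConfig is a hypothetical stand-in that only mirrors the fields the guard reads; it is not vLLM's actual ModelConfig class.

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("vllm-sketch")


class MiniModelConfig:
    """Hypothetical stand-in mirroring only the fields the new guard touches."""

    def __init__(self, quantization=None, enforce_eager=False,
                 max_model_len=2048, max_context_len_to_capture=None):
        self.quantization = quantization
        self.enforce_eager = enforce_eager
        self.max_model_len = max_model_len
        self.max_context_len_to_capture = max_context_len_to_capture
        self._verify_cuda_graph()

    def _verify_cuda_graph(self):
        # Default the capture length to the model length, then clamp it.
        if self.max_context_len_to_capture is None:
            self.max_context_len_to_capture = self.max_model_len
        self.max_context_len_to_capture = min(self.max_context_len_to_capture,
                                              self.max_model_len)
        # The new guard: GPTQ kernels cannot be captured into CUDA graphs yet,
        # so force eager mode and warn instead of erroring later.
        if self.quantization == "gptq" and not self.enforce_eager:
            logger.warning("GPTQ does not support CUDA graph yet. Disabling "
                           "CUDA graph.")
            self.enforce_eager = True


config = MiniModelConfig(quantization="gptq", enforce_eager=False)
print(config.enforce_eager)  # True: eager mode was forced for the GPTQ model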
