1 parent 01085b1 · commit 89da8d9
vllm/v1/attention/backends/gdn_attn.py
@@ -209,7 +209,8 @@ def build( # type: ignore[override]
 
         # prepare tensors for cudagraph
         if (self.use_full_cuda_graph and num_prefills == 0 and num_decodes == 0
-                and num_spec_decodes <= self.decode_cudagraph_max_bs):
+                and num_spec_decodes <= self.decode_cudagraph_max_bs
+                and m.num_actual_tokens <= self.decode_cudagraph_max_bs):
             num_total_tokens = self.vllm_config.pad_for_cudagraph(
                 m.num_actual_tokens)
             batch_size = num_total_tokens // (self.num_spec + 1)
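
For context, the condition this diff tightens decides whether the batch may run under a captured full CUDA graph: with speculative decoding each request contributes num_spec + 1 tokens, so m.num_actual_tokens can exceed decode_cudagraph_max_bs even when num_spec_decodes alone fits, and the added clause presumably rejects that case before pad_for_cudagraph is called. The sketch below is a minimal, hypothetical restatement of the guard as a standalone function; the function name and flat argument list are assumptions for illustration, not vLLM's actual API.

# Hypothetical sketch of the eligibility check, not the vLLM implementation.
def can_use_full_cuda_graph(
    use_full_cuda_graph: bool,
    num_prefills: int,
    num_decodes: int,
    num_spec_decodes: int,
    num_actual_tokens: int,
    decode_cudagraph_max_bs: int,
) -> bool:
    """Return True only if the batch fits the captured cudagraph buffers."""
    return (
        use_full_cuda_graph
        # only pure speculative-decode batches are eligible
        and num_prefills == 0
        and num_decodes == 0
        # number of speculative-decode requests fits the captured batch size
        and num_spec_decodes <= decode_cudagraph_max_bs
        # clause added by this commit: the total token count must also fit,
        # since each request carries num_spec + 1 tokens
        and num_actual_tokens <= decode_cudagraph_max_bs
    )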