                         is_pin_memory_available, round_up, supports_dynamo)
 from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend
 from vllm.v1.attention.backends.utils import (
-    AttentionMetadataBuilder, CommonAttentionMetadata,
+    AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata,
     make_kv_sharing_fast_prefill_attention_metadata,
     make_local_attention_virtual_batches)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
@@ -2619,12 +2619,22 @@ def _initialize_single_attn_backend(
             self.device,
         )
 
-        if (self.full_cuda_graph
-                and not attn_metadata_builder_i.full_cudagraph_supported):
-            raise ValueError(
-                f"Full CUDAGraph not supported for "
-                f"{attn_backend_i.__name__}. Turn off CompilationConfig."
-                f"full_cuda_graph or use a different attention backend.")
+        if self.full_cuda_graph:
+            if attn_metadata_builder_i.attn_cudagraph_support == \
+                    AttentionCGSupport.NEVER:
+                raise ValueError(f"Full CUDAGraph not supported for "
+                                 f"{attn_backend_i.__name__}. Turn off "
+                                 f"CompilationConfig.full_cuda_graph or use a "
+                                 f"different attention backend.")
+            if attn_metadata_builder_i.attn_cudagraph_support == \
+                    AttentionCGSupport.PURE_DECODE_ONLY:
+                # Limit the max cudagraph size to the max number of
+                # sequences for pure-decode-only cudagraph backends,
+                # whose max_query_len is 1.
+                self.cudagraph_batch_sizes = [
+                    size for size in self.cudagraph_batch_sizes
+                    if size <= self.scheduler_config.max_num_seqs
+                ]
         return attn_backend_i, attn_metadata_builder_i
 
     def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
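For context on how the new branch behaves at runtime, here is a minimal, self-contained sketch of the same dispatch. The AttentionCGSupport members other than NEVER and PURE_DECODE_ONLY, the standalone resolve_cudagraph_sizes helper, and the backend name in the demo are illustrative assumptions, not part of the vLLM API shown in this diff.

import enum
from typing import List


class AttentionCGSupport(enum.Enum):
    # Hypothetical stand-in for the enum imported from
    # vllm.v1.attention.backends.utils in the hunk above.
    NEVER = 0
    PURE_DECODE_ONLY = 1
    ALWAYS = 2


def resolve_cudagraph_sizes(support: AttentionCGSupport,
                            cudagraph_batch_sizes: List[int],
                            max_num_seqs: int,
                            backend_name: str) -> List[int]:
    # NEVER: full CUDA graphs are unsupported, so reject the configuration.
    if support == AttentionCGSupport.NEVER:
        raise ValueError(f"Full CUDAGraph not supported for {backend_name}. "
                         "Turn off CompilationConfig.full_cuda_graph or use a "
                         "different attention backend.")
    # PURE_DECODE_ONLY: capture sizes above max_num_seqs can never occur in a
    # pure-decode batch (max_query_len == 1), so drop them.
    if support == AttentionCGSupport.PURE_DECODE_ONLY:
        return [size for size in cudagraph_batch_sizes
                if size <= max_num_seqs]
    # Otherwise keep the configured capture sizes unchanged.
    return cudagraph_batch_sizes


if __name__ == "__main__":
    sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
    print(resolve_cudagraph_sizes(AttentionCGSupport.PURE_DECODE_ONLY, sizes,
                                  max_num_seqs=256,
                                  backend_name="DemoBackend"))
    # -> [1, 2, 4, 8, 16, 32, 64, 128, 256]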