
Commit 20228cb

[3/N][Attention] Move AttentionMetadata-related code from utils.py to backend.py (vllm-project#32054)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
1 parent 7c0d3c5 · commit 20228cb

37 files changed: +374, -370 lines
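Every file below follows the same pattern: metadata-related classes such as `CommonAttentionMetadata`, `AttentionMetadataBuilder`, and `AttentionCGSupport` are now imported from `vllm.v1.attention.backend`, while helper functions such as `subclass_attention_backend` and `set_kv_cache_layout` stay in `vllm.v1.attention.backends.utils`. As a rough sketch (the exact combination of names is illustrative, assembled from the diffs below), caller imports after this commit look like:

```python
# Illustrative only: names assembled from the diffs below.
# Metadata-related classes now live in vllm.v1.attention.backend ...
from vllm.v1.attention.backend import (
    AttentionBackend,
    AttentionMetadataBuilder,
    CommonAttentionMetadata,
)
# ... while helper functions remain in vllm.v1.attention.backends.utils.
from vllm.v1.attention.backends.utils import subclass_attention_backend
```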

docs/design/cuda_graphs.md

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ The CUDA Graphs wrapper no longer manages the warm-up logic. The warm-up process
 
 ## CUDA Graphs Compatibility of Attention Backends
 
-To signal the CUDA Graphs compatibility of the attention backends, we introduce a new enum type [AttentionCGSupport][vllm.v1.attention.backends.utils.AttentionCGSupport], which is an enum type that tracks the capability of the attention backend to support CUDA Graphs. The value is sorted in the order of the capability, i.e., `ALWAYS`> `UNIFORM_BATCH`> `UNIFORM_SINGLE_TOKEN_DECODE`> `NEVER`.
+To signal the CUDA Graphs compatibility of the attention backends, we introduce a new enum type [AttentionCGSupport][vllm.v1.attention.backend.AttentionCGSupport], which is an enum type that tracks the capability of the attention backend to support CUDA Graphs. The value is sorted in the order of the capability, i.e., `ALWAYS`> `UNIFORM_BATCH`> `UNIFORM_SINGLE_TOKEN_DECODE`> `NEVER`.
 
 ```python
 class AttentionCGSupport(enum.Enum):
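The snippet above is truncated in the diff context; for reference, here is a minimal sketch of what the enum body could look like, assuming integer member values chosen only to encode the stated ordering `ALWAYS` > `UNIFORM_BATCH` > `UNIFORM_SINGLE_TOKEN_DECODE` > `NEVER` (the actual definition, including docstrings, lives in `vllm/v1/attention/backend.py` after this commit):

```python
import enum


class AttentionCGSupport(enum.Enum):
    # Sketch only: values are assumed and merely encode the documented ordering.
    ALWAYS = 3                       # CUDA Graphs supported for any batch composition
    UNIFORM_BATCH = 2                # supported when all requests have uniform query length
    UNIFORM_SINGLE_TOKEN_DECODE = 1  # supported only for single-token decode batches
    NEVER = 0                        # no CUDA Graphs support
```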

tests/v1/attention/test_attention_backends.py

Lines changed: 1 addition & 2 deletions
@@ -23,10 +23,9 @@
     is_torch_equal_or_newer,
     set_random_seed,
 )
-from vllm.v1.attention.backend import AttentionType
+from vllm.v1.attention.backend import AttentionType, CommonAttentionMetadata
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.attention.backends.utils import (
-    CommonAttentionMetadata,
     set_kv_cache_layout,
 )
 from vllm.v1.kv_cache_interface import FullAttentionSpec

tests/v1/attention/test_mla_backends.py

Lines changed: 1 addition & 1 deletion
@@ -22,10 +22,10 @@
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.v1.attention.backend import CommonAttentionMetadata
 from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla
 from vllm.v1.attention.backends.mla.common import QueryLenSupport
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported
 from vllm.v1.kv_cache_interface import FullAttentionSpec

tests/v1/attention/utils.py

Lines changed: 3 additions & 3 deletions
@@ -18,12 +18,12 @@
     VllmConfig,
 )
 from vllm.config.model import ModelDType
-from vllm.v1.attention.backend import AttentionImpl
-from vllm.v1.attention.backends.registry import AttentionBackendEnum
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionImpl,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
 )
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.kv_cache_interface import FullAttentionSpec

tests/v1/e2e/test_async_spec_decode.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ def sync_tracker():
     Fixture that patches CommonAttentionMetadata.seq_lens_cpu to detect
     lazy init syncs. Prints stack traces immediately when syncs occur.
     """
-    from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+    from vllm.v1.attention.backend import CommonAttentionMetadata
 
     # Shared counter for cross-process communication (inherited by fork)
     sync_count = multiprocessing.Value("i", 0)
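Per the docstring above, the fixture works by intercepting accesses to `CommonAttentionMetadata.seq_lens_cpu` and recording each one in a shared counter. A minimal illustrative sketch of that idea, assuming `seq_lens_cpu` can be wrapped as a class-level property and that forked workers inherit the shared `multiprocessing.Value` (this is not the repository's fixture):

```python
import multiprocessing
import traceback

import pytest

from vllm.v1.attention.backend import CommonAttentionMetadata


@pytest.fixture
def sync_tracker(monkeypatch):
    # Shared counter, inherited by forked worker processes.
    sync_count = multiprocessing.Value("i", 0)
    original = CommonAttentionMetadata.seq_lens_cpu  # assumed to be a property

    def tracked(self):
        with sync_count.get_lock():
            sync_count.value += 1
        traceback.print_stack()  # show where the lazy sync was triggered
        return original.fget(self)

    monkeypatch.setattr(CommonAttentionMetadata, "seq_lens_cpu", property(tracked))
    yield sync_count
```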

tests/v1/spec_decode/test_tree_attention.py

Lines changed: 1 addition & 1 deletion
@@ -12,9 +12,9 @@
     try_get_attention_backend,
 )
 from vllm.config import ParallelConfig, SpeculativeConfig
+from vllm.v1.attention.backend import CommonAttentionMetadata
 from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 
 if not is_flash_attn_varlen_func_available():
     pytest.skip(

vllm/model_executor/layers/attention/chunked_local_attention.py

Lines changed: 4 additions & 2 deletions
@@ -8,11 +8,13 @@
 from vllm.config import CacheConfig
 from vllm.config.vllm import VllmConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.v1.attention.backend import AttentionBackend
-from vllm.v1.attention.backends.utils import (
+from vllm.v1.attention.backend import (
+    AttentionBackend,
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
+)
+from vllm.v1.attention.backends.utils import (
     make_local_attention_virtual_batches,
     subclass_attention_backend,
 )

vllm/model_executor/layers/attention/cross_attention.py

Lines changed: 1 addition & 1 deletion
@@ -14,9 +14,9 @@
     AttentionBackend,
     AttentionMetadata,
     AttentionType,
+    CommonAttentionMetadata,
 )
 from vllm.v1.attention.backends.utils import (
-    CommonAttentionMetadata,
     subclass_attention_backend,
 )
 from vllm.v1.attention.selector import get_attn_backend

vllm/model_executor/layers/attention/encoder_only_attention.py

Lines changed: 1 addition & 1 deletion
@@ -12,9 +12,9 @@
     AttentionBackend,
     AttentionMetadata,
     AttentionType,
+    CommonAttentionMetadata,
 )
 from vllm.v1.attention.backends.utils import (
-    CommonAttentionMetadata,
     subclass_attention_backend,
 )
 from vllm.v1.attention.selector import get_attn_backend

vllm/model_executor/layers/attention/static_sink_attention.py

Lines changed: 1 addition & 1 deletion
@@ -15,9 +15,9 @@
     AttentionBackend,
     AttentionMetadata,
     AttentionType,
+    CommonAttentionMetadata,
 )
 from vllm.v1.attention.backends.utils import (
-    CommonAttentionMetadata,
     subclass_attention_backend,
 )
 from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
