From fceaa692d37fc36085576b09be99860417542e40 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 6 Oct 2025 14:41:13 -0400 Subject: [PATCH 1/9] Add AttentionConfig and --attention-backend CLI argument This commit consolidates attention-related configuration into a new AttentionConfig class and exposes the attention backend as a CLI argument. Changes: - Created new AttentionConfig class in vllm/config/attention.py - Added AttentionConfig to VllmConfig - Added --attention-backend CLI argument in dedicated argument group - Updated imports and exports This sets the foundation for making more attention-related settingsconfigurable via CLI arguments in future work. Signed-off-by: Matthew Bonanni --- vllm/config/__init__.py | 3 ++ vllm/config/attention.py | 83 ++++++++++++++++++++++++++++++++++++++++ vllm/config/vllm.py | 7 ++++ vllm/engine/arg_utils.py | 36 ++++++++++++++++- 4 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 vllm/config/attention.py diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 7c5052c822f8..2201dd5898f3 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.config.attention import AttentionConfig from vllm.config.cache import ( BlockSize, CacheConfig, @@ -57,6 +58,8 @@ ) __all__ = [ + # From vllm.config.attention + "AttentionConfig", # From vllm.config.cache "BlockSize", "CacheConfig", diff --git a/vllm/config/attention.py b/vllm/config/attention.py new file mode 100644 index 000000000000..3285a9a36084 --- /dev/null +++ b/vllm/config/attention.py @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from typing import Any, Optional + +from pydantic import ConfigDict +from pydantic.dataclasses import dataclass + +import vllm.envs as envs +from vllm.config.utils import config + + +@config +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) +class AttentionConfig: + """Configuration for attention mechanisms in vLLM.""" + + backend: Optional[str] = None + """Attention backend to use. If None, will be selected automatically. + Example options: TORCH_SDPA, FLASH_ATTN, XFORMERS, FLASHINFER, etc.""" + + use_triton_flash_attn: bool = True + """Whether to use triton flash attention.""" + + flash_attn_version: Optional[int] = None + """Force vllm to use a specific flash-attention version (2 or 3). + Only valid when using the flash-attention backend.""" + + v1_use_prefill_decode_attention: bool = False + """Use separate prefill and decode kernels for V1 attention instead of + the unified triton kernel.""" + + use_aiter_unified_attention: bool = False + """Use AITER triton unified attention for V1 attention.""" + + flash_attn_max_num_splits_for_cuda_graph: int = 32 + """Flash Attention max number splits for cuda graph decode.""" + + use_cudnn_prefill: bool = False + """Whether to use cudnn prefill.""" + + use_trtllm_attention: Optional[bool] = None + """If set to True/False, use or don't use the TRTLLM attention backend + in flashinfer. 
If None, auto-detect the attention backend in flashinfer.""" + + disable_flashinfer_prefill: bool = False + """Whether to disable flashinfer prefill.""" + + flashinfer_disable_q_quantization: bool = False + """If set, when using fp8 kv, do not quantize Q to fp8.""" + + def __post_init__(self): + # If backend is not set, use environment variable + if self.backend is None and envs.VLLM_ATTENTION_BACKEND is not None: + self.backend = envs.VLLM_ATTENTION_BACKEND + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + factors: list[Any] = [ + self.backend, + self.use_triton_flash_attn, + self.flash_attn_version, + self.v1_use_prefill_decode_attention, + self.use_aiter_unified_attention, + self.flash_attn_max_num_splits_for_cuda_graph, + self.use_cudnn_prefill, + self.use_trtllm_attention, + self.disable_flashinfer_prefill, + self.flashinfer_disable_q_quantization, + ] + hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + return hash_str diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index b5856958ce2e..dce8a05f0098 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -20,6 +20,7 @@ from vllm.transformers_utils.runai_utils import is_runai_obj_uri from vllm.utils import random_uuid +from .attention import AttentionConfig from .cache import CacheConfig from .compilation import CompilationConfig, CompilationLevel, CUDAGraphMode from .device import DeviceConfig @@ -68,6 +69,8 @@ class VllmConfig: """Device configuration.""" load_config: LoadConfig = field(default_factory=LoadConfig) """Load configuration.""" + attention_config: AttentionConfig = field(default_factory=AttentionConfig) + """Attention configuration.""" lora_config: Optional[LoRAConfig] = None """LoRA configuration.""" speculative_config: Optional[SpeculativeConfig] = None @@ -153,6 +156,10 @@ def compute_hash(self) -> str: vllm_factors.append(self.load_config.compute_hash()) else: vllm_factors.append("None") + if self.attention_config: + vllm_factors.append(self.attention_config.compute_hash()) + else: + vllm_factors.append("None") if self.lora_config: vllm_factors.append(self.lora_config.compute_hash()) # LoRA creates static buffers based on max_num_batched_tokens. 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a94ef598f2de..bff15972a4db 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -31,6 +31,7 @@ import vllm.envs as envs from vllm.config import ( + AttentionConfig, BlockSize, CacheConfig, CacheDType, @@ -494,6 +495,7 @@ class EngineArgs: ) model_impl: str = ModelConfig.model_impl override_attention_dtype: str = ModelConfig.override_attention_dtype + attention_backend: Optional[str] = AttentionConfig.backend calculate_kv_scales: bool = CacheConfig.calculate_kv_scales mamba_cache_dtype: MambaDType = CacheConfig.mamba_cache_dtype @@ -655,6 +657,20 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--pt-load-map-location", **load_kwargs["pt_load_map_location"] ) + # Attention arguments + attention_group = parser.add_argument_group( + title="AttentionConfig", + description=AttentionConfig.__doc__, + ) + attention_group.add_argument( + "--attention-backend", + type=str, + default=EngineArgs.attention_backend, + help="Attention backend to use. If not specified, will be selected " + "automatically. Example options: TORCH_SDPA, FLASH_ATTN, XFORMERS, " + "FLASHINFER, FLASHMLA, etc.", + ) + # Structured outputs arguments structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig) structured_outputs_group = parser.add_argument_group( @@ -1201,6 +1217,21 @@ def create_speculative_config( ) return SpeculativeConfig(**self.speculative_config) + def create_attention_config(self) -> AttentionConfig: + """Create attention configuration.""" + return AttentionConfig( + backend=self.attention_backend, + use_triton_flash_attn=envs.VLLM_USE_TRITON_FLASH_ATTN, + flash_attn_version=envs.VLLM_FLASH_ATTN_VERSION, + v1_use_prefill_decode_attention=envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION, + use_aiter_unified_attention=envs.VLLM_USE_AITER_UNIFIED_ATTENTION, + flash_attn_max_num_splits_for_cuda_graph=envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH, + use_cudnn_prefill=envs.VLLM_USE_CUDNN_PREFILL, + use_trtllm_attention=envs.VLLM_USE_TRTLLM_ATTENTION, + disable_flashinfer_prefill=envs.VLLM_DISABLE_FLASHINFER_PREFILL, + flashinfer_disable_q_quantization=envs.VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION, + ) + def create_engine_config( self, usage_context: Optional[UsageContext] = None, @@ -1543,15 +1574,18 @@ def create_engine_config( collect_detailed_traces=self.collect_detailed_traces, ) + attention_config = self.create_attention_config() + config = VllmConfig( model_config=model_config, cache_config=cache_config, parallel_config=parallel_config, scheduler_config=scheduler_config, device_config=device_config, + load_config=load_config, + attention_config=attention_config, lora_config=lora_config, speculative_config=speculative_config, - load_config=load_config, structured_outputs_config=self.structured_outputs_config, observability_config=observability_config, compilation_config=self.compilation_config, From 4037da2f90521b8dd55ccd4164d0cedf80819b0a Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 6 Oct 2025 14:48:18 -0400 Subject: [PATCH 2/9] Add documentation for environment variable compatibility Added detailed documentation to clarify that all attention-related environment variables are still respected for backward compatibility. This ensures users can continue using environment variables while also having the option to use the new --attention-backend CLI argument. 
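A rough, self-contained sketch of that equivalence (not part of this patch; the
model name and FLASH_ATTN value are placeholders, and the environment variable
must be set before the attention config is built):

    import os

    from vllm.engine.arg_utils import EngineArgs

    # Legacy path: environment variable only, still honored via
    # AttentionConfig.__post_init__.
    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"
    legacy = EngineArgs(model="facebook/opt-125m").create_attention_config()

    # New path: explicit engine argument, i.e. --attention-backend FLASH_ATTN
    # on the command line.
    os.environ.pop("VLLM_ATTENTION_BACKEND")
    new = EngineArgs(
        model="facebook/opt-125m", attention_backend="FLASH_ATTN"
    ).create_attention_config()

    assert legacy.backend == new.backend == "FLASH_ATTN"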
Signed-off-by: Matthew Bonanni --- vllm/config/attention.py | 7 +++++-- vllm/engine/arg_utils.py | 21 ++++++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/vllm/config/attention.py b/vllm/config/attention.py index 3285a9a36084..e21d1d97edc5 100644 --- a/vllm/config/attention.py +++ b/vllm/config/attention.py @@ -18,7 +18,7 @@ class AttentionConfig: backend: Optional[str] = None """Attention backend to use. If None, will be selected automatically. - Example options: TORCH_SDPA, FLASH_ATTN, XFORMERS, FLASHINFER, etc.""" + Example options: FLASH_ATTN, XFORMERS, FLASHINFER, etc.""" use_triton_flash_attn: bool = True """Whether to use triton flash attention.""" @@ -51,7 +51,10 @@ class AttentionConfig: """If set, when using fp8 kv, do not quantize Q to fp8.""" def __post_init__(self): - # If backend is not set, use environment variable + # Environment variable compatibility: If backend is not set, + # use VLLM_ATTENTION_BACKEND environment variable. + # This ensures backward compatibility with existing deployments + # that rely on environment variables. if self.backend is None and envs.VLLM_ATTENTION_BACKEND is not None: self.backend = envs.VLLM_ATTENTION_BACKEND diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index bff15972a4db..e3a5a0f845a0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -667,8 +667,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=str, default=EngineArgs.attention_backend, help="Attention backend to use. If not specified, will be selected " - "automatically. Example options: TORCH_SDPA, FLASH_ATTN, XFORMERS, " - "FLASHINFER, FLASHMLA, etc.", + "automatically. Example options: FLASH_ATTN, XFORMERS, FLASHINFER, " + "FLASHMLA, etc.", ) # Structured outputs arguments @@ -1218,7 +1218,22 @@ def create_speculative_config( return SpeculativeConfig(**self.speculative_config) def create_attention_config(self) -> AttentionConfig: - """Create attention configuration.""" + """Create attention configuration. + + This method reads from environment variables to maintain backward + compatibility with existing deployments. All attention-related + environment variables are respected: + - VLLM_ATTENTION_BACKEND (can also be set via --attention-backend CLI arg) + - VLLM_USE_TRITON_FLASH_ATTN + - VLLM_FLASH_ATTN_VERSION + - VLLM_V1_USE_PREFILL_DECODE_ATTENTION + - VLLM_USE_AITER_UNIFIED_ATTENTION + - VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH + - VLLM_USE_CUDNN_PREFILL + - VLLM_USE_TRTLLM_ATTENTION + - VLLM_DISABLE_FLASHINFER_PREFILL + - VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION + """ return AttentionConfig( backend=self.attention_backend, use_triton_flash_attn=envs.VLLM_USE_TRITON_FLASH_ATTN, From cc5d9422f7d4096d497a65bedd9468bc5996360d Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 6 Oct 2025 14:51:30 -0400 Subject: [PATCH 3/9] Add deprecation warnings for attention environment variables Added comprehensive deprecation warnings to guide users toward using CLI arguments instead of environment variables for attention configuration. 
Changes: - Added deprecation warning in create_attention_config() that lists all attention-related environment variables that are set - Warning directs users to use --attention-backend and other future CLI args - Consolidated warning logic to avoid duplicate warnings - Maintains full backward compatibility while encouraging migration The warning will only show if one or more attention environment variables are explicitly set, making it non-intrusive for users who don't use them. Signed-off-by: Matthew Bonanni --- vllm/config/attention.py | 2 ++ vllm/engine/arg_utils.py | 40 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/vllm/config/attention.py b/vllm/config/attention.py index e21d1d97edc5..407a68f7add2 100644 --- a/vllm/config/attention.py +++ b/vllm/config/attention.py @@ -55,6 +55,8 @@ def __post_init__(self): # use VLLM_ATTENTION_BACKEND environment variable. # This ensures backward compatibility with existing deployments # that rely on environment variables. + # Note: Deprecation warning is emitted in create_attention_config() + # to avoid duplicate warnings. if self.backend is None and envs.VLLM_ATTENTION_BACKEND is not None: self.backend = envs.VLLM_ATTENTION_BACKEND diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e3a5a0f845a0..2546330e6c60 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1222,7 +1222,7 @@ def create_attention_config(self) -> AttentionConfig: This method reads from environment variables to maintain backward compatibility with existing deployments. All attention-related - environment variables are respected: + environment variables are respected but will emit deprecation warnings: - VLLM_ATTENTION_BACKEND (can also be set via --attention-backend CLI arg) - VLLM_USE_TRITON_FLASH_ATTN - VLLM_FLASH_ATTN_VERSION @@ -1233,7 +1233,45 @@ def create_attention_config(self) -> AttentionConfig: - VLLM_USE_TRTLLM_ATTENTION - VLLM_DISABLE_FLASHINFER_PREFILL - VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION + + Note: In the future, these environment variables will be deprecated + in favor of CLI arguments in the --attention-config group. 
""" + + # Check if any attention env vars are set and warn users + attention_env_vars_in_use = [] + if envs.is_set("VLLM_ATTENTION_BACKEND"): + attention_env_vars_in_use.append("VLLM_ATTENTION_BACKEND") + if envs.is_set("VLLM_USE_TRITON_FLASH_ATTN"): + attention_env_vars_in_use.append("VLLM_USE_TRITON_FLASH_ATTN") + if envs.is_set("VLLM_FLASH_ATTN_VERSION"): + attention_env_vars_in_use.append("VLLM_FLASH_ATTN_VERSION") + if envs.is_set("VLLM_V1_USE_PREFILL_DECODE_ATTENTION"): + attention_env_vars_in_use.append("VLLM_V1_USE_PREFILL_DECODE_ATTENTION") + if envs.is_set("VLLM_USE_AITER_UNIFIED_ATTENTION"): + attention_env_vars_in_use.append("VLLM_USE_AITER_UNIFIED_ATTENTION") + if envs.is_set("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH"): + attention_env_vars_in_use.append( + "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH" + ) + if envs.is_set("VLLM_USE_CUDNN_PREFILL"): + attention_env_vars_in_use.append("VLLM_USE_CUDNN_PREFILL") + if envs.is_set("VLLM_USE_TRTLLM_ATTENTION"): + attention_env_vars_in_use.append("VLLM_USE_TRTLLM_ATTENTION") + if envs.is_set("VLLM_DISABLE_FLASHINFER_PREFILL"): + attention_env_vars_in_use.append("VLLM_DISABLE_FLASHINFER_PREFILL") + if envs.is_set("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION"): + attention_env_vars_in_use.append("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION") + + if attention_env_vars_in_use: + logger.warning( + "The following attention-related environment variables are set: %s. " + "These are deprecated and will be removed in a future release. " + "Please use CLI arguments in the AttentionConfig group instead " + "(e.g., --attention-backend).", + ", ".join(attention_env_vars_in_use), + ) + return AttentionConfig( backend=self.attention_backend, use_triton_flash_attn=envs.VLLM_USE_TRITON_FLASH_ATTN, From 08aa82b554353d327d86d0bd0b444a68bafa3dd0 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 6 Oct 2025 15:04:38 -0400 Subject: [PATCH 4/9] Limit deprecation warning to VLLM_ATTENTION_BACKEND only Updated the deprecation warning to only apply to VLLM_ATTENTION_BACKEND since that's the only attention env var with a CLI alternative (--attention-backend). The other attention-related environment variables remain fully supported without deprecation warnings, as they don't have CLI argument alternatives yet. This makes the warning more accurate and less alarming for users who rely on other attention env vars that aren't being deprecated. Also removed unused envs import from attention.py after removing __post_init__. Signed-off-by: Matthew Bonanni --- vllm/config/attention.py | 11 --------- vllm/engine/arg_utils.py | 50 +++++++++++----------------------------- 2 files changed, 13 insertions(+), 48 deletions(-) diff --git a/vllm/config/attention.py b/vllm/config/attention.py index 407a68f7add2..e860da983145 100644 --- a/vllm/config/attention.py +++ b/vllm/config/attention.py @@ -7,7 +7,6 @@ from pydantic import ConfigDict from pydantic.dataclasses import dataclass -import vllm.envs as envs from vllm.config.utils import config @@ -50,16 +49,6 @@ class AttentionConfig: flashinfer_disable_q_quantization: bool = False """If set, when using fp8 kv, do not quantize Q to fp8.""" - def __post_init__(self): - # Environment variable compatibility: If backend is not set, - # use VLLM_ATTENTION_BACKEND environment variable. - # This ensures backward compatibility with existing deployments - # that rely on environment variables. - # Note: Deprecation warning is emitted in create_attention_config() - # to avoid duplicate warnings. 
- if self.backend is None and envs.VLLM_ATTENTION_BACKEND is not None: - self.backend = envs.VLLM_ATTENTION_BACKEND - def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2546330e6c60..6286766624d3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1222,8 +1222,8 @@ def create_attention_config(self) -> AttentionConfig: This method reads from environment variables to maintain backward compatibility with existing deployments. All attention-related - environment variables are respected but will emit deprecation warnings: - - VLLM_ATTENTION_BACKEND (can also be set via --attention-backend CLI arg) + environment variables are respected: + - VLLM_ATTENTION_BACKEND (deprecated, use --attention-backend CLI arg) - VLLM_USE_TRITON_FLASH_ATTN - VLLM_FLASH_ATTN_VERSION - VLLM_V1_USE_PREFILL_DECODE_ATTENTION @@ -1233,47 +1233,23 @@ def create_attention_config(self) -> AttentionConfig: - VLLM_USE_TRTLLM_ATTENTION - VLLM_DISABLE_FLASHINFER_PREFILL - VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION - - Note: In the future, these environment variables will be deprecated - in favor of CLI arguments in the --attention-config group. """ - # Check if any attention env vars are set and warn users - attention_env_vars_in_use = [] - if envs.is_set("VLLM_ATTENTION_BACKEND"): - attention_env_vars_in_use.append("VLLM_ATTENTION_BACKEND") - if envs.is_set("VLLM_USE_TRITON_FLASH_ATTN"): - attention_env_vars_in_use.append("VLLM_USE_TRITON_FLASH_ATTN") - if envs.is_set("VLLM_FLASH_ATTN_VERSION"): - attention_env_vars_in_use.append("VLLM_FLASH_ATTN_VERSION") - if envs.is_set("VLLM_V1_USE_PREFILL_DECODE_ATTENTION"): - attention_env_vars_in_use.append("VLLM_V1_USE_PREFILL_DECODE_ATTENTION") - if envs.is_set("VLLM_USE_AITER_UNIFIED_ATTENTION"): - attention_env_vars_in_use.append("VLLM_USE_AITER_UNIFIED_ATTENTION") - if envs.is_set("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH"): - attention_env_vars_in_use.append( - "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH" - ) - if envs.is_set("VLLM_USE_CUDNN_PREFILL"): - attention_env_vars_in_use.append("VLLM_USE_CUDNN_PREFILL") - if envs.is_set("VLLM_USE_TRTLLM_ATTENTION"): - attention_env_vars_in_use.append("VLLM_USE_TRTLLM_ATTENTION") - if envs.is_set("VLLM_DISABLE_FLASHINFER_PREFILL"): - attention_env_vars_in_use.append("VLLM_DISABLE_FLASHINFER_PREFILL") - if envs.is_set("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION"): - attention_env_vars_in_use.append("VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION") - - if attention_env_vars_in_use: + # Warn if VLLM_ATTENTION_BACKEND env var is used instead of CLI arg + if envs.is_set("VLLM_ATTENTION_BACKEND") and self.attention_backend is None: logger.warning( - "The following attention-related environment variables are set: %s. " - "These are deprecated and will be removed in a future release. " - "Please use CLI arguments in the AttentionConfig group instead " - "(e.g., --attention-backend).", - ", ".join(attention_env_vars_in_use), + "Using VLLM_ATTENTION_BACKEND environment variable is deprecated " + "and will be removed in a future release. " + "Please use --attention-backend CLI argument instead." 
) + # Handle backend: prefer CLI arg, fall back to env var + backend = self.attention_backend + if backend is None: + backend = envs.VLLM_ATTENTION_BACKEND + return AttentionConfig( - backend=self.attention_backend, + backend=backend, use_triton_flash_attn=envs.VLLM_USE_TRITON_FLASH_ATTN, flash_attn_version=envs.VLLM_FLASH_ATTN_VERSION, v1_use_prefill_decode_attention=envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION, From 6933f2c4d27a2785e508227d6504e02059d3093e Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 6 Oct 2025 15:18:52 -0400 Subject: [PATCH 5/9] Replace attention env var references with AttentionConfig Updated code throughout the codebase to use AttentionConfig instead of directly reading from environment variables. This consolidates attention configuration and makes it easier to manage and test. Changes: - Updated V1 attention backends to read from vllm_config.attention_config - Used get_current_vllm_config() in impl classes that don't have VllmConfig - Replaced envs.VLLM_* references with config.attention_config.* where used Files modified: - vllm/v1/attention/backends/flash_attn.py - vllm/v1/attention/backends/mla/flashattn_mla.py - vllm/v1/attention/backends/mla/triton_mla.py - vllm/v1/attention/backends/mla/common.py - vllm/v1/attention/backends/rocm_attn.py - vllm/v1/attention/backends/utils.py - vllm/attention/utils/fa_utils.py - vllm/utils/flashinfer.py Note: Some cached functions still use env vars directly to avoid breaking the cache key. These can be refactored in a future PR. Signed-off-by: Matthew Bonanni --- vllm/attention/utils/fa_utils.py | 12 +++++++----- vllm/utils/flashinfer.py | 7 ++++++- vllm/v1/attention/backends/flash_attn.py | 6 ++++-- vllm/v1/attention/backends/mla/common.py | 12 +++++++++--- vllm/v1/attention/backends/mla/flashattn_mla.py | 5 +++-- vllm/v1/attention/backends/mla/triton_mla.py | 6 ++++-- vllm/v1/attention/backends/rocm_attn.py | 10 ++++++++-- vllm/v1/attention/backends/utils.py | 5 ++++- 8 files changed, 45 insertions(+), 18 deletions(-) diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py index e13afd46ee96..7158b10ee6f1 100644 --- a/vllm/attention/utils/fa_utils.py +++ b/vllm/attention/utils/fa_utils.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional -from vllm import envs from vllm.logger import init_logger from vllm.platforms import current_platform @@ -42,10 +41,13 @@ def get_flash_attn_version(requires_alibi: bool = False) -> Optional[int]: 3 if (device_capability.major == 9 and is_fa_version_supported(3)) else 2 ) - # 2. override if passed by environment - if envs.VLLM_FLASH_ATTN_VERSION is not None: - assert envs.VLLM_FLASH_ATTN_VERSION in [2, 3] - fa_version = envs.VLLM_FLASH_ATTN_VERSION + # 2. override if passed by environment or config + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() + if vllm_config.attention_config.flash_attn_version is not None: + assert vllm_config.attention_config.flash_attn_version in [2, 3] + fa_version = vllm_config.attention_config.flash_attn_version # 3. 
fallback for unsupported combinations if device_capability.major == 10 and fa_version == 3: diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 1d707d56daba..3bb913969a91 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -215,7 +215,12 @@ def force_use_trtllm_attention() -> bool | None: return ``True`` if TRTLLM attention is forced to be used, return ``False`` if TRTLLM attention is forced to be not used. """ - return _force_use_trtllm_attention(envs.VLLM_USE_TRTLLM_ATTENTION) + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() + return _force_use_trtllm_attention( + vllm_config.attention_config.use_trtllm_attention + ) def can_use_trtllm_attention(num_qo_heads: int, num_kv_heads: int) -> bool: diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index bb3dcddba3e9..1e683773d17f 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -8,7 +8,6 @@ import numpy as np import torch -from vllm import envs from vllm.attention.backends.abstract import ( AttentionBackend, AttentionImpl, @@ -200,6 +199,7 @@ def __init__( self.parallel_config = vllm_config.parallel_config self.cache_config = vllm_config.cache_config self.compilation_config = vllm_config.compilation_config + self.attention_config = vllm_config.attention_config self.num_heads_q = self.model_config.get_num_attention_heads( self.parallel_config @@ -233,7 +233,9 @@ def __init__( # When using cuda graph, we need to set the upper bound of the # number of splits so that large enough intermediate buffers are # pre-allocated during capture. - self.max_num_splits = envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH + self.max_num_splits = ( + self.attention_config.flash_attn_max_num_splits_for_cuda_graph + ) # Sliding window size to be used with the AOT scheduler will be # populated on first build() call. diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index f7ec18f5e9f6..82a467383f8d 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -425,18 +425,24 @@ def __post_init__(self): def use_flashinfer_prefill() -> bool: # For blackwell default to flashinfer prefill if it's available since # it is faster than FA2. 
+ from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() return ( - not envs.VLLM_DISABLE_FLASHINFER_PREFILL + not vllm_config.attention_config.disable_flashinfer_prefill and flashinfer_available - and not envs.VLLM_USE_CUDNN_PREFILL + and not vllm_config.attention_config.use_cudnn_prefill and current_platform.is_device_capability(100) ) def use_cudnn_prefill() -> bool: + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() return ( flashinfer_available - and envs.VLLM_USE_CUDNN_PREFILL + and vllm_config.attention_config.use_cudnn_prefill and current_platform.is_device_capability(100) and has_nvidia_artifactory() ) diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index c0c2dbe1f961..0e3b75ae4d1f 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -6,7 +6,6 @@ import torch -from vllm import envs from vllm.attention.backends.abstract import ( AttentionLayer, AttentionType, @@ -105,7 +104,9 @@ def __init__( # When using cuda graph, we need to set the upper bound of the # number of splits so that large enough intermediate buffers are # pre-allocated during capture. - self.max_num_splits = envs.VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH + self.max_num_splits = ( + vllm_config.attention_config.flash_attn_max_num_splits_for_cuda_graph + ) # TODO(lucas): Until we add support for the DCP custom masking we need # to restrict decodes to q_len == 1 when DCP is enabled. diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py index 3b6718c48d09..9a894458a792 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ b/vllm/v1/attention/backends/mla/triton_mla.py @@ -5,7 +5,6 @@ import torch -from vllm import envs from vllm.attention.backends.abstract import ( AttentionLayer, AttentionType, @@ -87,7 +86,10 @@ def __init__( "TritonMLA V1 with FP8 KV cache not yet supported" ) - self.use_triton_flash_attn = envs.VLLM_USE_TRITON_FLASH_ATTN + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() + self.use_triton_flash_attn = vllm_config.attention_config.use_triton_flash_attn self.triton_fa_func = triton_attention if HAS_TRITON else None def _flash_attn_varlen_diff_headdims_rocm( diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py index 4c24770aa22c..0dae070fa36c 100644 --- a/vllm/v1/attention/backends/rocm_attn.py +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -79,6 +79,7 @@ def __init__( super().__init__(kv_cache_spec, layer_names, vllm_config, device) self.block_size = kv_cache_spec.block_size + self.attention_config = vllm_config.attention_config model_config = vllm_config.model_config self.num_heads_q = model_config.get_num_attention_heads( @@ -96,7 +97,7 @@ def build_for_cudagraph_capture( # slow, so here we set it to 1. attn_metadata.seq_lens.fill_(1) - if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION: + if self.attention_config.v1_use_prefill_decode_attention: # Here we set the query start locs to 0. 
This is to # cover up an invalid memory access in the prefix_prefil kernel # that we run into during graph capture (#25985) @@ -267,8 +268,13 @@ def __init__( "RocmAttentionImpl" ) + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() self.fp8_dtype = current_platform.fp8_dtype() - self.force_prefill_decode_attn = envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION + self.force_prefill_decode_attn = ( + vllm_config.attention_config.v1_use_prefill_decode_attention + ) if not self.force_prefill_decode_attn: # If not using prefill decode attention, we use the Triton diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 003c7253e553..3a5477b0b315 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -446,7 +446,10 @@ def infer_global_hyperparameters( global_params = param_sets[0] # trtllm attention doesn't need global hyper params so disable the check - if not envs.VLLM_USE_TRTLLM_ATTENTION: + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() + if not vllm_config.attention_config.use_trtllm_attention: for params in param_sets: if params.window_left != global_params.window_left: raise ValueError( From ce87bd4881c9b5bff32da7c194ff98f1dd7ed558 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 6 Oct 2025 15:24:41 -0400 Subject: [PATCH 6/9] Replace remaining VLLM_ATTENTION_BACKEND env var usages Updated remaining files to use AttentionConfig.backend instead of reading directly from envs.VLLM_ATTENTION_BACKEND. This completes the migration to using AttentionConfig as the single source of truth. Changes: - vllm/platforms/cuda.py: Use vllm_config.attention_config.backend - vllm/platforms/xpu.py: Use get_current_vllm_config() - vllm/attention/selector.py: Read backend from AttentionConfig - vllm/config/model.py: Added comments explaining why envs is still used (ModelConfig is created before AttentionConfig, so can't use it yet) - vllm/engine/arg_utils.py: Define backend locally from CLI/env in _is_v1_supported_oracle Note: ModelConfig.__post_init__ still reads from envs because it's created before VllmConfig/AttentionConfig exists. This is only for early validation. 
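As a minimal sketch of the access pattern this patch standardizes on (the
helper name is made up; it assumes it runs where a VllmConfig has been set as
the current config):

    from typing import Optional

    from vllm.config import get_current_vllm_config

    def resolve_attention_backend() -> Optional[str]:
        # AttentionConfig is the single source of truth here; any
        # VLLM_ATTENTION_BACKEND value has already been folded into
        # attention_config.backend when the engine config was created.
        vllm_config = get_current_vllm_config()
        return vllm_config.attention_config.backend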
Signed-off-by: Matthew Bonanni --- vllm/attention/selector.py | 7 +++++-- vllm/config/model.py | 5 +++++ vllm/engine/arg_utils.py | 11 ++++++----- vllm/platforms/cuda.py | 16 ++++++++++------ vllm/platforms/xpu.py | 5 ++++- 5 files changed, 30 insertions(+), 14 deletions(-) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index effd35444d54..6596f6cec413 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -185,8 +185,11 @@ def _cached_get_attn_backend( if backend_by_global_setting is not None: selected_backend = backend_by_global_setting else: - # Check the environment variable and override if specified - backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND + # Check the config (which may come from env var) and override if specified + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() + backend_by_env_var: Optional[str] = vllm_config.attention_config.backend if backend_by_env_var is not None: if backend_by_env_var.endswith("_VLLM_V1"): logger.warning( diff --git a/vllm/config/model.py b/vllm/config/model.py index 146ace9782b9..652d2d889818 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -444,6 +444,9 @@ def __post_init__( self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) + # Note: We read from envs here because ModelConfig is created before + # AttentionConfig, so we can't use get_current_vllm_config() yet. + # This is just for early validation. if ( (backend := envs.VLLM_ATTENTION_BACKEND) and backend == "FLASHINFER" @@ -633,6 +636,8 @@ def _task_to_convert(task: TaskOption) -> ConvertType: ) # Interleaved attention is not supported by some backends in V0 + # Note: We read from envs here because ModelConfig is created before + # AttentionConfig, so we can't use get_current_vllm_config() yet. if ( not self.disable_sliding_window and is_interleaved(self.hf_text_config) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6286766624d3..2c5cdc6e89dd 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1687,11 +1687,12 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: "XFORMERS", "ROCM_ATTN", ] - if ( - envs.is_set("VLLM_ATTENTION_BACKEND") - and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS - ): - name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}" + # Get backend from CLI arg or env var + backend = self.attention_backend + if backend is None: + backend = envs.VLLM_ATTENTION_BACKEND + if backend is not None and backend not in V1_BACKENDS: + name = f"VLLM_ATTENTION_BACKEND={backend}" _raise_or_fallback(feature_name=name, recommend_to_remove=True) return False diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 20568e0d6c51..2bd26c4691e1 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -128,7 +128,8 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: use_cutlass_mla = False use_flashinfer_mla = False - if envs.VLLM_ATTENTION_BACKEND is None: + attention_backend = vllm_config.attention_config.backend + if attention_backend is None: # Default case if cls.is_device_capability(100): # Blackwell => Force CutlassMLA. @@ -136,15 +137,15 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: # TODO: This does not work, because the # global_force_attn_backend_context_manager is not set. 
# See vllm/attention/selector.py:_cached_get_attn_backend - envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA" + vllm_config.attention_config.backend = "CUTLASS_MLA" else: # Not Blackwell use_flashmla = True else: # Forced case - use_flashmla = envs.VLLM_ATTENTION_BACKEND == "FLASHMLA" - use_cutlass_mla = envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA" - use_flashinfer_mla = envs.VLLM_ATTENTION_BACKEND == "FLASHINFER_MLA" + use_flashmla = attention_backend == "FLASHMLA" + use_cutlass_mla = attention_backend == "CUTLASS_MLA" + use_flashinfer_mla = attention_backend == "FLASHINFER_MLA" from vllm.attention.ops.flashmla import is_flashmla_supported @@ -481,8 +482,11 @@ def device_count(cls) -> int: def is_kv_cache_dtype_supported( cls, kv_cache_dtype: str, model_config: "ModelConfig" ) -> bool: + from vllm.config import get_current_vllm_config + fp8_attention = kv_cache_dtype.startswith("fp8") - attention_backend = envs.VLLM_ATTENTION_BACKEND + vllm_config = get_current_vllm_config() + attention_backend = vllm_config.attention_config.backend supported = False if model_config is not None and model_config.use_mla: diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 2f2f3ab8b9d9..06e8f6db367c 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -83,9 +83,12 @@ def is_kv_cache_dtype_supported( Check if the kv_cache_dtype is supported. XPU only support fp8 kv cache with triton backend. """ + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() if ( envs.is_set("VLLM_ATTENTION_BACKEND") - and envs.VLLM_ATTENTION_BACKEND == "TRITON_ATTN" + and vllm_config.attention_config.backend == "TRITON_ATTN" ): return kv_cache_dtype in ["fp8_e4m3", "fp8_e5m2", "fp8"] From 6a250274df084f9ede2dfa7887feb50534d4de8a Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 6 Oct 2025 15:27:52 -0400 Subject: [PATCH 7/9] Create AttentionConfig before ModelConfig to eliminate env var usage Restructured the config creation order so that AttentionConfig is created first, then passed to ModelConfig. This allows ModelConfig to use attention_config.backend instead of reading directly from envs. Changes: - create_engine_config(): Create AttentionConfig before ModelConfig - create_model_config(): Accept optional attention_config parameter - ModelConfig: Added attention_config field - ModelConfig.__post_init__: Use self.attention_config.backend instead of envs.VLLM_ATTENTION_BACKEND Benefits: - Eliminated 2 more env var usages from ModelConfig - AttentionConfig is now truly the single source of truth for attention backend - Cleaner dependency flow: AttentionConfig -> ModelConfig -> VllmConfig This completes the migration away from reading VLLM_ATTENTION_BACKEND directly from environment variables in core config classes. 
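A hypothetical sketch of the resulting construction order for callers that
build the configs by hand (e.g. tests); the model name is a placeholder and it
assumes its Hugging Face config can be resolved locally or from the Hub:

    from vllm.engine.arg_utils import EngineArgs

    args = EngineArgs(model="facebook/opt-125m", attention_backend="FLASH_ATTN")
    attention_config = args.create_attention_config()
    model_config = args.create_model_config(attention_config=attention_config)

    # ModelConfig can now validate backend-specific requirements early,
    # without ever touching VLLM_ATTENTION_BACKEND itself.
    assert model_config.attention_config is attention_config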
Signed-off-by: Matthew Bonanni --- vllm/config/model.py | 24 ++++++++++++------------ vllm/engine/arg_utils.py | 13 ++++++++++--- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 652d2d889818..79733a9ef10c 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -29,6 +29,9 @@ from vllm.config.utils import assert_hashable, config, getattr_iter from vllm.logger import init_logger from vllm.platforms import current_platform + +if TYPE_CHECKING: + from vllm.config.attention import AttentionConfig from vllm.transformers_utils.config import ( ConfigFormat, get_config, @@ -280,6 +283,9 @@ class ModelConfig: """ override_attention_dtype: Optional[str] = None """Override dtype for attention""" + attention_config: Optional["AttentionConfig"] = None + """Attention configuration. If not specified, will be read from environment + variables.""" logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None """One or more logits processors' fully-qualified class names or class definitions""" @@ -444,16 +450,11 @@ def __post_init__( self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) - # Note: We read from envs here because ModelConfig is created before - # AttentionConfig, so we can't use get_current_vllm_config() yet. - # This is just for early validation. - if ( - (backend := envs.VLLM_ATTENTION_BACKEND) - and backend == "FLASHINFER" - and find_spec("flashinfer") is None - ): + # Early validation for FLASHINFER backend + backend = self.attention_config.backend if self.attention_config else None + if backend == "FLASHINFER" and find_spec("flashinfer") is None: raise ValueError( - "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer " + "attention_backend is set to FLASHINFER, but flashinfer " "module was not found. See " "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile " # noqa: E501 "for instructions on how to install it." @@ -636,13 +637,12 @@ def _task_to_convert(task: TaskOption) -> ConvertType: ) # Interleaved attention is not supported by some backends in V0 - # Note: We read from envs here because ModelConfig is created before - # AttentionConfig, so we can't use get_current_vllm_config() yet. 
if ( not self.disable_sliding_window and is_interleaved(self.hf_text_config) and not envs.VLLM_USE_V1 - and (backend := envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER") + and backend is not None + and backend in ("XFORMERS", "FLASHINFER") ): logger.warning_once( "%s has interleaved attention, which is currently not " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2c5cdc6e89dd..02d3fbd4ae06 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1057,7 +1057,9 @@ def from_cli_args(cls, args: argparse.Namespace): ) return engine_args - def create_model_config(self) -> ModelConfig: + def create_model_config( + self, attention_config: Optional[AttentionConfig] = None + ) -> ModelConfig: # gguf file needs a specific model loader and doesn't use hf_repo if check_gguf_file(self.model): self.quantization = self.load_format = "gguf" @@ -1149,6 +1151,7 @@ def create_model_config(self) -> ModelConfig: logits_processors=self.logits_processors, video_pruning_rate=self.video_pruning_rate, io_processor_plugin=self.io_processor_plugin, + attention_config=attention_config, ) def validate_tensorizer_args(self): @@ -1283,7 +1286,10 @@ def create_engine_config( device_config = DeviceConfig(device=cast(Device, current_platform.device_type)) - model_config = self.create_model_config() + # Create AttentionConfig first so ModelConfig can use it + attention_config = self.create_attention_config() + + model_config = self.create_model_config(attention_config=attention_config) self.model = model_config.model self.tokenizer = model_config.tokenizer @@ -1603,7 +1609,8 @@ def create_engine_config( collect_detailed_traces=self.collect_detailed_traces, ) - attention_config = self.create_attention_config() + # Note: attention_config was already created earlier in this method + # (before creating model_config) so that ModelConfig can use it config = VllmConfig( model_config=model_config, From e1d524d54b94110498afc8569bbfdf1ee031b9f1 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 6 Oct 2025 15:38:38 -0400 Subject: [PATCH 8/9] Remove forced_attn_backend global, use AttentionConfig directly Eliminated the separate forced_attn_backend global variable mechanism in favor of directly modifying vllm_config.attention_config.backend. This simplifies the code and makes AttentionConfig the true single source of truth. Changes: - Removed forced_attn_backend global variable - Removed global_force_attn_backend() and get_global_forced_attn_backend() - Updated global_force_attn_backend_context_manager() to modify vllm_config.attention_config.backend directly - Updated attention selector to only check AttentionConfig.backend - Updated cuda.py TODO comment to reflect the new approach Benefits: - Simpler architecture: one source of truth instead of two - No more global state to manage - Runtime overrides now just modify AttentionConfig.backend - Context manager still works for tests that need temporary overrides The context manager is preserved for backward compatibility with tests. 
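For code that prefers not to use the context manager, the direct override it
now performs looks roughly like this (sketch only; FLASH_ATTN is a placeholder
and a current VllmConfig is assumed to be in scope):

    from vllm.config import get_current_vllm_config

    vllm_config = get_current_vllm_config()
    saved = vllm_config.attention_config.backend
    vllm_config.attention_config.backend = "FLASH_ATTN"
    try:
        ...  # anything that triggers attention backend selection
    finally:
        # Restore the previous value, exactly as the context manager does.
        vllm_config.attention_config.backend = saved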
Signed-off-by: Matthew Bonanni --- vllm/attention/selector.py | 99 +++++++++++++------------------------- vllm/platforms/cuda.py | 5 +- 2 files changed, 36 insertions(+), 68 deletions(-) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 6596f6cec413..d2a73238d450 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -52,32 +52,9 @@ def get_env_variable_attn_backend() -> Optional[_Backend]: # a backend based on system & workload configuration # (default behavior if this variable is None) # -# THIS SELECTION TAKES PRECEDENCE OVER THE -# VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE -forced_attn_backend: Optional[_Backend] = None - - -def global_force_attn_backend(attn_backend: Optional[_Backend]) -> None: - """ - Force all attention operations to use a specified backend. - - Passing `None` for the argument re-enables automatic - backend selection., - - Arguments: - - * attn_backend: backend selection (None to revert to auto) - """ - global forced_attn_backend - forced_attn_backend = attn_backend - - -def get_global_forced_attn_backend() -> Optional[_Backend]: - """ - Get the currently-forced choice of attention backend, - or None if auto-selection is currently enabled. - """ - return forced_attn_backend +# NOTE: The global forced backend mechanism has been removed. +# To override the attention backend, modify vllm_config.attention_config.backend +# using get_current_vllm_config().attention_config.backend = "BACKEND_NAME" @dataclass(frozen=True) @@ -177,35 +154,27 @@ def _cached_get_attn_backend( ) -> type[AttentionBackend]: # Check whether a particular choice of backend was # previously forced. - # - # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND - # ENVIRONMENT VARIABLE. + # Check the config (which may come from CLI arg, env var, or runtime override) + from vllm.config import get_current_vllm_config + selected_backend = None - backend_by_global_setting: Optional[_Backend] = get_global_forced_attn_backend() - if backend_by_global_setting is not None: - selected_backend = backend_by_global_setting - else: - # Check the config (which may come from env var) and override if specified - from vllm.config import get_current_vllm_config - - vllm_config = get_current_vllm_config() - backend_by_env_var: Optional[str] = vllm_config.attention_config.backend - if backend_by_env_var is not None: - if backend_by_env_var.endswith("_VLLM_V1"): - logger.warning( - "The suffix '_VLLM_V1' in the environment variable " - "%s is no longer necessary as V0 backends have been " - "deprecated. Please remove this suffix from your " - "environment variable setting.", - STR_BACKEND_ENV_VAR, - ) - backend_by_env_var = backend_by_env_var.removesuffix("_VLLM_V1") - selected_backend = backend_name_to_enum(backend_by_env_var) - if selected_backend is None: - raise ValueError( - f"Invalid attention backend: '{backend_by_env_var}'. " - f"Valid backends are: {list(_Backend.__members__.keys())}" - ) + vllm_config = get_current_vllm_config() + backend_by_config: Optional[str] = vllm_config.attention_config.backend + if backend_by_config is not None: + if backend_by_config.endswith("_VLLM_V1"): + logger.warning( + "The suffix '_VLLM_V1' in the attention backend " + "is no longer necessary as V0 backends have been " + "deprecated. Please remove this suffix from your " + "backend setting." 
+ ) + backend_by_config = backend_by_config.removesuffix("_VLLM_V1") + selected_backend = _Backend.backend_name_to_enum(backend_by_config) + if selected_backend is None: + raise ValueError( + f"Invalid attention backend: '{backend_by_config}'. " + f"Valid backends are: {list(_Backend.__members__.keys())}" + ) # get device-specific attn_backend attention_cls = current_platform.get_attn_backend_cls( @@ -231,29 +200,29 @@ def global_force_attn_backend_context_manager( attn_backend: _Backend, ) -> Generator[None, None, None]: """ - Globally force a vLLM attention backend override within a - context manager, reverting the global attention backend - override to its prior state upon exiting the context - manager. + Temporarily override the attention backend within a context manager, + reverting to the original backend upon exiting. Arguments: - * attn_backend: attention backend to force + * attn_backend: attention backend to use Returns: * Generator """ + from vllm.config import get_current_vllm_config - # Save the current state of the global backend override (if any) - original_value = get_global_forced_attn_backend() + # Save the current backend from config + vllm_config = get_current_vllm_config() + original_value = vllm_config.attention_config.backend - # Globally force the new backend override - global_force_attn_backend(attn_backend) + # Override the backend in config + vllm_config.attention_config.backend = str(attn_backend.name) # Yield control back to the enclosed code block try: yield finally: - # Revert the original global backend override, if any - global_force_attn_backend(original_value) + # Revert the original backend + vllm_config.attention_config.backend = original_value diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 2bd26c4691e1..b0ae3a9815b1 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -134,9 +134,8 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: if cls.is_device_capability(100): # Blackwell => Force CutlassMLA. use_cutlass_mla = True - # TODO: This does not work, because the - # global_force_attn_backend_context_manager is not set. 
- # See vllm/attention/selector.py:_cached_get_attn_backend + # Set the backend in AttentionConfig so it's used during + # backend selection vllm_config.attention_config.backend = "CUTLASS_MLA" else: # Not Blackwell From 7e357562baa1448f07bea27372fd31deb3020a22 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 6 Oct 2025 16:29:24 -0400 Subject: [PATCH 9/9] Replace remaining attention env var references with AttentionConfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Updated flashinfer.py to use attention_config.flashinfer_disable_q_quantization - Updated platforms/rocm.py to use attention_config for v1_use_prefill_decode_attention and use_aiter_unified_attention - Updated rocm_attn.py to use attention_config.use_aiter_unified_attention - Only arg_utils.py now reads env vars (at initialization point) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Signed-off-by: Matthew Bonanni --- vllm/platforms/rocm.py | 10 ++++++++-- vllm/utils/flashinfer.py | 7 +++++-- vllm/v1/attention/backends/rocm_attn.py | 8 +++++++- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 80e7b849c0ed..46f8fabf8f25 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -276,6 +276,9 @@ def get_attn_backend_cls( ) if envs.VLLM_USE_V1: + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9(): logger.info("Using Flash Attention backend on V1 engine.") return ( @@ -283,8 +286,11 @@ def get_attn_backend_cls( "rocm_aiter_fa.AiterFlashAttentionBackend" ) elif ( - (envs.VLLM_ROCM_USE_AITER and envs.VLLM_USE_AITER_UNIFIED_ATTENTION) - or envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION + ( + envs.VLLM_ROCM_USE_AITER + and vllm_config.attention_config.use_aiter_unified_attention + ) + or vllm_config.attention_config.v1_use_prefill_decode_attention or selected_backend == _Backend.ROCM_ATTN ): # rocm specific backend, with aiter and/or diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 3bb913969a91..79122dd274c1 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -439,8 +439,11 @@ def flashinfer_scaled_fp8_mm( @functools.cache def flashinfer_disable_q_quantization() -> bool: - """Cache result which only depends on the environment""" - return envs.VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION + """Cache result which only depends on the attention config""" + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() + return vllm_config.attention_config.flashinfer_disable_q_quantization __all__ = [ diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py index 0dae070fa36c..11f7233e3c52 100644 --- a/vllm/v1/attention/backends/rocm_attn.py +++ b/vllm/v1/attention/backends/rocm_attn.py @@ -217,7 +217,13 @@ def use_aiter_unified_attention() -> bool: """Check if aiter unified attention should be used.""" # VLLM_ROCM_USE_AITER_MHA needs to set to 0 as well as it is set # to 1 as default - return envs.VLLM_ROCM_USE_AITER and envs.VLLM_USE_AITER_UNIFIED_ATTENTION + from vllm.config import get_current_vllm_config + + vllm_config = get_current_vllm_config() + return ( + envs.VLLM_ROCM_USE_AITER + and vllm_config.attention_config.use_aiter_unified_attention + ) class RocmAttentionImpl(AttentionImpl):