 import torch._dynamo
 
 from tests.compile.backend import LazyInitPass, TestBackend
-from tests.models.utils import check_outputs_equal
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
-from vllm import LLM, SamplingParams
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.attention import Attention, AttentionMetadata
 from vllm.attention.backends.registry import _Backend
@@ -31,7 +29,6 @@
 )
 from vllm.forward_context import get_forward_context, set_forward_context
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    QuantKey,
     kFp8StaticTensorSym,
     kNvfp4Quant,
 )
@@ -48,132 +45,6 @@
 backend_unfused: Optional[TestBackend] = None
 
 
-@pytest.mark.parametrize(
-    "model, quant_key", [("amd/Llama-3.1-8B-Instruct-FP8-KV", kFp8StaticTensorSym)]
-)
-@pytest.mark.parametrize("use_triton_fa", [True, False])
-@pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
-@pytest.mark.skipif(
-    not current_platform.is_rocm(), reason="V0 attn quant fusion only on ROCm"
-)
-def test_attention_fusion_v0(
-    example_prompts, monkeypatch, model: str, quant_key: QuantKey, use_triton_fa: bool
-):
-    # Clean Dynamo cache to avoid reusing other test cases
-    # (for some reason the reset at the end is not enough)
-    torch._dynamo.reset()
-
-    # Use global backends
-    global backend, backend_unfused
-
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-    monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", str(int(use_triton_fa)))
-
-    # Prompt 4 seems too open-ended, differs between fused and unfused
-    # (both outputs look reasonable though)
-    prompts = example_prompts[:4] + example_prompts[5:]
-
-    compile_config = CompilationConfig(
-        # DYNAMO_AS_IS triggers custom backend & does full Dynamo compilation
-        # DYNAMO_ONCE does not properly propagate shapes.
-        level=CompilationLevel.DYNAMO_AS_IS,
-        backend="tests.compile.test_fusion_attn.backend_unfused",
-        custom_ops=["+quant_fp8"],
-    )
-    vllm_config = VllmConfig(
-        compilation_config=compile_config,
-        model_config=ModelConfig(
-            model=model,
-            dtype=torch.bfloat16,
-        ),
-    )
-    backend_unfused = TestBackend(NoOpEliminationPass(vllm_config))
-
-    llm = LLM(
-        model,
-        enforce_eager=True,
-        compilation_config=compile_config,
-        gpu_memory_utilization=0.5,
-        max_model_len=2048,
-    )
-
-    sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_p=0.95)
-
-    unfused_output = llm.generate(prompts, sampling_params)
-    backend_unfused = None  # Reset backend to make sure llm gets released
-    del llm
-
-    compile_config = CompilationConfig(
-        # DYNAMO_AS_IS triggers custom backend & does full Dynamo compilation
-        # DYNAMO_ONCE does not properly propagate shapes.
-        level=CompilationLevel.DYNAMO_AS_IS,
-        backend="tests.compile.test_fusion_attn.backend",
-        custom_ops=["+quant_fp8"],
-    )
-    vllm_config = VllmConfig(
-        compilation_config=compile_config,
-        model_config=ModelConfig(
-            model=model,
-            dtype=torch.bfloat16,
-        ),
-    )
-
-    # AttnFusionPass needs attention layers to be registered in config upon init
-    # so we initialize it during compilation.
-    attn_pass = LazyInitPass(AttnFusionPass, vllm_config)
-    backend = TestBackend(NoOpEliminationPass(vllm_config), attn_pass)
-    llm2 = LLM(
-        model,
-        enforce_eager=True,
-        compilation_config=compile_config,
-        gpu_memory_utilization=0.5,
-        max_model_len=2048,
-    )
-
-    # check support
-    attn_fusion_supported = [
-        layer.impl.fused_output_quant_supported(quant_key)
-        for key, layer in compile_config.static_forward_context.items()
-    ]
-
-    print(f"{attn_fusion_supported=}")
-    if any(attn_fusion_supported):
-        # Check quant ops
-        backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=False)
-
-    # attention ops present in both, just output_scale param changes
-    attn_nodes_pre = list(find_op_nodes(ATTN_OP, backend.graph_pre_pass))
-    attn_nodes_post = list(find_op_nodes(ATTN_OP, backend.graph_post_pass))
-    assert len(attn_nodes_pre) == len(attn_nodes_post)
-
-    for i in range(len(attn_nodes_pre)):
-        assert attn_nodes_pre[i].kwargs["output_scale"] is None
-        fused = attn_nodes_post[i].kwargs["output_scale"] is not None
-        assert fused == attn_fusion_supported[i], (
-            f"Node {i} {'' if fused else 'not '}expected to have fused output quant"
-        )
-
-    # check outputs
-    fused_output = llm2.generate(prompts, sampling_params)
-
-    # transform outputs to format expected by check_outputs_equal
-    sample_outs = lambda s: (list(s.token_ids), s.text)
-    outs_lst = lambda ros: [sample_outs(ro.outputs[0]) for ro in ros]
-
-    check_outputs_equal(
-        outputs_0_lst=outs_lst(unfused_output),
-        outputs_1_lst=outs_lst(fused_output),
-        name_0="unfused",
-        name_1="fused",
-    )
-
-    # Clean Dynamo cache to avoid polluting other case(s)
-    torch._dynamo.reset()
-
-    # Reset backend to make sure llm2 gets released
-    backend = None
-
-
 class AttentionQuantPatternModel(torch.nn.Module):
     """Base model for AttentionQuantPattern fusion."""
 
@@ -221,7 +92,7 @@ def __init__(
             device=self.device,
         )
 
-    def build_attn_metadata(self, batch_size: int, use_hnd: bool) -> AttentionMetadata:
+    def build_attn_metadata(self, batch_size: int) -> AttentionMetadata:
         """Initialize attention metadata."""
 
         # Create common attn metadata
@@ -232,30 +103,57 @@ def build_attn_metadata(self, batch_size: int, use_hnd: bool) -> AttentionMetada
 
         max_blocks = (max(batch_spec.seq_lens) + self.block_size - 1) // self.block_size
         num_blocks = batch_size * max_blocks
+        backend = self.attn.backend
 
-        # Create dummy KV cache for FlashInfer TRTLLM
-        # - NHD: [num_blocks, block_size, num_kv_heads, head_size]
-        # - HND: [num_blocks, num_kv_heads, block_size, head_size]
-        kv_cache = torch.zeros(
-            num_blocks,
-            2,
-            self.num_kv_heads,
-            self.block_size,
-            self.head_size,
-            dtype=self.kv_cache_dtype,
-            device=self.device,
-        )
-        if current_platform.is_rocm():
+        # Create dummy KV cache for the selected backend
+        if backend == _Backend.ROCM_ATTN:
             # k/v as 1st dimension
-            if use_hnd:
-                kv_cache = kv_cache.permute(1, 0, 2, 3, 4)
-            else:
-                kv_cache = kv_cache.permute(1, 0, 3, 2, 4)
-        else:
+            # HND: [num_blocks, num_kv_heads, block_size, head_size]
+            kv_cache = torch.zeros(
+                2,
+                num_blocks,
+                self.num_kv_heads,
+                self.block_size,
+                self.head_size,
+                dtype=self.kv_cache_dtype,
+                device=self.device,
+            )
+        elif backend == _Backend.ROCM_AITER_UNIFIED_ATTN:
+            # k/v as 1st dimension
+            # NHD: [num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache = torch.zeros(
+                2,
+                num_blocks,
+                self.block_size,
+                self.num_kv_heads,
+                self.head_size,
+                dtype=self.kv_cache_dtype,
+                device=self.device,
+            )
+        elif backend == _Backend.TRITON_ATTN:
             # k/v as 2nd dimension
-            # Create kv_cache in HND layout and permute to NHD layout
-            # (later will be permuted back to HND layout in forward pass)
-            kv_cache = kv_cache.permute(0, 1, 3, 2, 4)
+            # NHD: [num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache = torch.zeros(
+                num_blocks,
+                2,
+                self.num_kv_heads,
+                self.block_size,
+                self.head_size,
+                dtype=self.kv_cache_dtype,
+                device=self.device,
+            )
+        elif backend == _Backend.FLASHINFER:
+            kv_cache = torch.zeros(
+                num_blocks,
+                2,
+                self.num_kv_heads,
+                self.block_size,
+                self.head_size,
+                dtype=self.kv_cache_dtype,
+                device=self.device,
+            ).permute(0, 1, 3, 2, 4)
+        else:
+            raise ValueError(f"Unsupported backend: {backend}")
         self.attn.kv_cache = [kv_cache]
 
         # Build attn metadata
@@ -375,10 +273,9 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
 @pytest.mark.parametrize("model_name, model_class", MODELS)
 @pytest.mark.parametrize(
     "backend",
-    [_Backend.FLASHINFER] if current_platform.is_cuda() else [_Backend.TRITON_ATTN],
-)
-@pytest.mark.parametrize(
-    "split_attention", [False, True] if current_platform.is_rocm() else [False]
+    [_Backend.FLASHINFER]
+    if current_platform.is_cuda()
+    else [_Backend.ROCM_AITER_UNIFIED_ATTN, _Backend.ROCM_ATTN, _Backend.TRITON_ATTN],
 )
 # TODO(boyuan): test inductor graph partition on rocm
 @pytest.mark.parametrize(
@@ -405,7 +302,6 @@ def test_attention_quant_pattern(
     model_name: str,
     model_class: type[AttentionQuantPatternModel],
     backend: _Backend,
-    split_attention: bool,
     use_inductor_graph_partition: bool,
     monkeypatch,
     dist_init,
@@ -417,8 +313,6 @@ def test_attention_quant_pattern(
         pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
 
     monkeypatch.setenv("VLLM_USE_V1", "1")
-    if split_attention:
-        monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1")
 
     device = torch.device("cuda:0")
     torch.manual_seed(42)
@@ -466,9 +360,7 @@ def test_attention_quant_pattern(
     model_unfused = model_unfused.to(device)
 
     forward_ctx = get_forward_context()
-    forward_ctx.attn_metadata = model_unfused.build_attn_metadata(
-        batch_size, use_hnd=split_attention
-    )
+    forward_ctx.attn_metadata = model_unfused.build_attn_metadata(batch_size)
 
     # Run model directly without compilation and fusion
     result_unfused = model_unfused(q, k, v)
@@ -494,9 +386,7 @@ def test_attention_quant_pattern(
     model_fused = model_fused.to(device)
 
     forward_ctx = get_forward_context()
-    forward_ctx.attn_metadata = model_fused.build_attn_metadata(
-        batch_size, use_hnd=split_attention
-    )
+    forward_ctx.attn_metadata = model_fused.build_attn_metadata(batch_size)
 
     # Create test backend with fusion passes enabled
     noop_pass = NoOpEliminationPass(vllm_config)
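For reference, here is a minimal standalone sketch of the per-backend KV-cache layout selection that build_attn_metadata() now performs in the hunks above. It uses made-up sizes and plain strings in place of vllm's _Backend enum, and the make_kv_cache helper is illustrative only (not part of the PR), so it runs without vllm installed.

import torch

num_blocks, num_kv_heads, block_size, head_size = 4, 8, 16, 64


def make_kv_cache(backend: str) -> torch.Tensor:
    # Mirrors the branch structure added in build_attn_metadata() above.
    if backend == "ROCM_ATTN":
        # k/v as 1st dimension, heads before block_size
        return torch.zeros(2, num_blocks, num_kv_heads, block_size, head_size)
    if backend == "ROCM_AITER_UNIFIED_ATTN":
        # k/v as 1st dimension, block_size before heads
        return torch.zeros(2, num_blocks, block_size, num_kv_heads, head_size)
    if backend == "TRITON_ATTN":
        # k/v as 2nd dimension
        return torch.zeros(num_blocks, 2, num_kv_heads, block_size, head_size)
    if backend == "FLASHINFER":
        # Same allocation as TRITON_ATTN, but exposed through a permuted
        # (non-contiguous) view that swaps the heads and block_size dims
        return torch.zeros(
            num_blocks, 2, num_kv_heads, block_size, head_size
        ).permute(0, 1, 3, 2, 4)
    raise ValueError(f"Unsupported backend: {backend}")


for name in ["ROCM_ATTN", "ROCM_AITER_UNIFIED_ATTN", "TRITON_ATTN", "FLASHINFER"]:
    cache = make_kv_cache(name)
    print(name, tuple(cache.shape), cache.is_contiguous())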