Commit a29eb07

Add limited Llama4 support for VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE
This path is enabled by setting VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE=1 (on ROCm, together with VLLM_ROCM_USE_AITER=1).
1 parent 2b4cb8a commit a29eb07
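
As a quick usage sketch (not part of the commit), the flags could be exported before starting vLLM on a ROCm build with AITER available; the model id below is only an example:

# Usage sketch, not part of this commit: enable the fused RoPE +
# zeroed KV cache path on a ROCm build of vLLM with AITER available.
# The env vars are set before constructing the engine so the
# module-level flag in llama4.py sees them when the model is loaded.
import os

os.environ["VLLM_ROCM_USE_AITER"] = "1"
os.environ["VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-4-Scout-17B-16E-Instruct")  # example model id
outputs = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(outputs[0].outputs[0].text)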

File tree

3 files changed: +45 -7 lines changed

vllm/attention/layer.py
vllm/attention/layers/chunked_local_attention.py
vllm/model_executor/models/llama4.py

vllm/attention/layer.py

Lines changed: 13 additions & 2 deletions
@@ -529,9 +529,20 @@ def unified_attention_with_output(
     from vllm.v1.attention.backends.triton_attn import TritonAttentionImpl
     from vllm.v1.attention.backends.rocm_aiter_fa import AiterFlashAttentionImpl
     from vllm.v1.attention.backends.mla.rocm_aiter_mla import AiterMLAImpl
-    if VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE and (isinstance(self.impl, TritonAttentionImpl) or isinstance(self.impl, AiterFlashAttentionImpl) or isinstance(self.impl, AiterMLAImpl)):
+    # Not all layers can use RoPE fusing, so check that they were given all
+    # needed inputs along with the environment variable to enable this.
+    if (
+        VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE
+        and hasattr(self.impl, "rotary_emb")
+        and self.impl.rotary_emb is not None
+        and positions is not None
+        and (
+            isinstance(self.impl, TritonAttentionImpl)
+            or isinstance(self.impl, AiterFlashAttentionImpl)
+            or isinstance(self.impl, AiterMLAImpl)
+        )
+    ):
         # fusing RoPE with flushing kv_cache operation
-        assert hasattr(self.impl, "rotary_emb") and self.impl.rotary_emb is not None and positions is not None, f"rotary_emb not found in {self.impl=} and positions cannot be None"
         self.impl.forward(self,
                           query,
                           key,
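
This change replaces the hard assertion with a precondition check, so layers without a rotary_emb (or calls without positions) fall back to the regular path instead of failing. A minimal stand-alone sketch of that guard-then-fallback dispatch; class and attribute names here are illustrative, not the vLLM API:

# Illustrative sketch of the dispatch pattern above; not vLLM code.
class GuardedAttention:
    def __init__(self, impl, fuse_enabled: bool):
        self.impl = impl                  # backend implementation object
        self.fuse_enabled = fuse_enabled  # env-flag-derived switch

    def forward(self, q, k, v, positions=None):
        can_fuse = (
            self.fuse_enabled
            and getattr(self.impl, "rotary_emb", None) is not None
            and positions is not None
        )
        if can_fuse:
            # Fused path: the backend applies RoPE while writing the KV cache.
            return self.impl.forward(q, k, v, positions=positions)
        # Fallback: RoPE is assumed to have been applied by the caller.
        return self.impl.forward(q, k, v)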

vllm/attention/layers/chunked_local_attention.py

Lines changed: 4 additions & 2 deletions
@@ -58,7 +58,8 @@ def __init__(self,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None,
                  kv_sharing_target_layer_name: Optional[str] = None,
-                 prefix: str = ""):
+                 prefix: str = "",
+                 **kwargs):
         dtype = torch.get_default_dtype()
         if cache_config is not None:
             kv_cache_dtype = cache_config.cache_dtype
@@ -88,4 +89,5 @@ def __init__(self,
             quant_config=quant_config,
             prefix=prefix,
             kv_sharing_target_layer_name=kv_sharing_target_layer_name,
-            attn_backend=attn_backend)
+            attn_backend=attn_backend,
+            **kwargs)
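
ChunkedLocalAttention now accepts and forwards arbitrary keyword arguments (here, rotary_emb) to the wrapped Attention layer, so Llama4 can build a single extra_args dict and pass it to either attention class. A tiny sketch of that pass-through pattern with made-up class names:

# Illustrative **kwargs pass-through; class names are made up.
class BaseAttention:
    def __init__(self, num_heads: int, rotary_emb=None):
        self.num_heads = num_heads
        self.rotary_emb = rotary_emb  # optional, only set on the fused path

class ChunkedWrapper(BaseAttention):
    def __init__(self, num_heads: int, attention_chunk_size: int = 0, **kwargs):
        # Unrecognized keyword arguments (e.g. rotary_emb) are forwarded
        # unchanged, so callers can treat both classes interchangeably.
        super().__init__(num_heads, **kwargs)
        self.attention_chunk_size = attention_chunk_size

attn = ChunkedWrapper(8, attention_chunk_size=512, rotary_emb=object())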

vllm/model_executor/models/llama4.py

Lines changed: 28 additions & 3 deletions
@@ -24,6 +24,7 @@
 from torch import nn
 from transformers import Llama4TextConfig
 
+import vllm.envs as envs
 from vllm.attention import Attention
 from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
 from vllm.compilation.decorators import support_torch_compile
@@ -38,12 +39,19 @@
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
+from vllm.platforms import current_platform
 
 from .llama import LlamaForCausalLM, LlamaMLP, LlamaModel
 from .utils import (AutoWeightsLoader, extract_layer_index, fast_topk,
                     is_pp_missing_parameter)
 
 
+VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE = (
+    current_platform.is_rocm()
+    and envs.VLLM_ROCM_USE_AITER
+    and envs.VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE
+)
+
 class Llama4MoE(nn.Module):
 
     @staticmethod
@@ -198,6 +206,16 @@ def __init__(self,
         use_chunked_local_attn = not self.nope and config.attention_chunk_size
         attn_cls = (ChunkedLocalAttention
                     if use_chunked_local_attn else Attention)
+        extra_args = {}
+        if use_chunked_local_attn:
+            extra_args["attention_chunk_size"] = config.attention_chunk_size
+        self.use_fused_rope = (
+            VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE
+            and self.rotary_emb is not None
+            and self.qk_norm is None
+        )
+        if self.use_fused_rope:
+            extra_args["rotary_emb"] = self.rotary_emb
         self.attn = attn_cls(
             self.num_heads,
             self.head_dim,
@@ -206,9 +224,7 @@ def __init__(self,
             cache_config=cache_config,
             quant_config=quant_config,
             prefix=f"{prefix}.attn",
-            **({
-                "attention_chunk_size": config.attention_chunk_size
-            } if use_chunked_local_attn else {}))
+            **extra_args)
 
     def _get_attn_scale(self, positions: torch.Tensor) -> torch.Tensor:
         floor = torch.floor((positions + 1.0) / self.floor_scale)
@@ -224,6 +240,15 @@ def forward(
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
 
+        # For limited cases that match Llama3's behavior, use fused RoPE
+        if self.use_fused_rope:
+            assert not (
+                self.attn_temperature_tuning and self.nope
+            ), f"{self.attn_temperature_tuning=} and {self.nope=} must be False with {VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE=}"
+            attn_output = self.attn(q, k, v, positions=positions)
+            output, _ = self.o_proj(attn_output)
+            return output
+
         if self.rotary_emb is not None:
             q, k = self.rotary_emb(positions, q, k)
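
Putting the three files together, the fused path only activates for a Llama4 attention layer under a narrow set of conditions. A hypothetical helper that restates them in one place (this function is not part of the commit):

# Hypothetical summary of the checks spread across llama4.py and layer.py;
# not part of the commit, shown only to collect the conditions in one place.
def llama4_layer_uses_fused_rope(layer, is_rocm: bool, use_aiter: bool,
                                 fused_rope_flag: bool) -> bool:
    return (
        is_rocm and use_aiter and fused_rope_flag               # platform + env flags
        and layer.rotary_emb is not None                        # layer applies RoPE
        and layer.qk_norm is None                               # QK-norm not supported
        and not (layer.attn_temperature_tuning and layer.nope)  # asserted in forward()
    )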
