
Commit 506f798

Add some comments and remove unneeded condition
1 parent a29eb07 commit 506f798

2 files changed: +6 -3 lines changed

2 files changed

+6
-3
lines changed

vllm/attention/layer.py

Lines changed: 2 additions & 1 deletion
@@ -532,7 +532,7 @@ def unified_attention_with_output(
     # Not all layers can use RoPE fusing, so check that they were given all
     # needed inputs along with the environment variable to enable this.
     if (
-        VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE
+        [replacement line not captured in the extracted diff]
         and hasattr(self.impl, "rotary_emb")
         and self.impl.rotary_emb is not None
         and positions is not None
@@ -542,6 +542,7 @@ def unified_attention_with_output(
             or isinstance(self.impl, AiterMLAImpl)
         )
     ):
+        assert VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE, f"Only expecting rotary_emb and positions when VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE is True."
         # fusing RoPE with flushing kv_cache operation
         self.impl.forward(self,
                           query,
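The layer.py change moves the environment-flag test out of the hot-path condition and into an assert inside the branch: the model only wires up rotary_emb and passes positions when the flag is enabled (see the llama4.py change below), so a fused-RoPE request with the flag off now fails loudly instead of silently falling through to the unfused path. Below is a minimal sketch of that guard pattern, assuming a hypothetical impl object, flag parsing, and forward_fused/forward calls; it is not the vLLM implementation.

import os

# Hypothetical flag parsing; only the environment variable name is taken from
# the diff, the parsing itself is illustrative.
FUSED_ROPE_FLAG = os.environ.get(
    "VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE", "0") == "1"


def run_attention(impl, query, positions=None):
    # The fused path is selected from the inputs this layer was given,
    # not from the flag itself.
    if getattr(impl, "rotary_emb", None) is not None and positions is not None:
        # Receiving rotary_emb and positions with the flag off would mean the
        # model wired up fused RoPE unexpectedly, so fail loudly here.
        assert FUSED_ROPE_FLAG, (
            "rotary_emb and positions are only expected when "
            "VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE is True")
        return impl.forward_fused(query, positions)  # hypothetical fused call
    return impl.forward(query)  # hypothetical unfused call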

vllm/model_executor/models/llama4.py

Lines changed: 4 additions & 2 deletions
@@ -209,10 +209,12 @@ def __init__(self,
         extra_args = {}
         if use_chunked_local_attn:
             extra_args["attention_chunk_size"] = config.attention_chunk_size
+        # Use the rotary_emb in attention only when it's supported
         self.use_fused_rope = (
             VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE
             and self.rotary_emb is not None
             and self.qk_norm is None
+            and not self.attn_temperature_tuning
         )
         if self.use_fused_rope:
             extra_args["rotary_emb"] = self.rotary_emb
@@ -240,10 +242,10 @@ def forward(
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
 
-        # For limited cases that match Llama3's behavior, use fused RoPE
+        # rotary_emb is fused into self.attn in this case
         if self.use_fused_rope:
             assert not (
-                self.attn_temperature_tuning and self.nope
+                self.attn_temperature_tuning
             ), f"{self.attn_temperature_tuning=} and {self.nope=} must be False with {VLLM_ROCM_USE_AITER_TRITON_FUSED_ROPE_ZEROS_KV_CACHE=}"
             attn_output = self.attn(q, k, v, positions=positions)
             output, _ = self.o_proj(attn_output)
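The llama4.py change tightens when rotary_emb is handed to the attention layer at all: fused RoPE is requested only when the environment flag is on, a rotary embedding exists, there is no QK-norm, and, newly, attention temperature tuning is disabled. Below is a small self-contained sketch of that gating, using hypothetical placeholder values in place of the real module attributes.

def should_use_fused_rope(flag_enabled, rotary_emb, qk_norm,
                          attn_temperature_tuning):
    # Mirrors the condition built in __init__: env flag on, rotary embedding
    # present, no QK-norm, and temperature tuning off (the newly added clause).
    return (
        flag_enabled
        and rotary_emb is not None
        and qk_norm is None
        and not attn_temperature_tuning
    )


extra_args = {}
rotary_emb = object()  # placeholder for a real rotary embedding module
if should_use_fused_rope(True, rotary_emb, None, False):
    # The attention backend applies RoPE itself, so it needs the module.
    extra_args["rotary_emb"] = rotary_emb
print(extra_args)

With not self.attn_temperature_tuning folded into use_fused_rope, the forward() path only has to assert that attn_temperature_tuning is off before calling self.attn(q, k, v, positions=positions).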
