Skip to content

Commit 28464b5

Browse files
mypy pass
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
1 parent 3a0ae51 commit 28464b5

File tree

2 files changed

+18
-19
lines changed

2 files changed

+18
-19
lines changed

vllm/attention/backends/mla/common.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@
199199
from dataclasses import dataclass
200200
from itertools import accumulate
201201
from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Tuple,
202-
Type)
202+
Type, TypeVar)
203203

204204
import torch
205205
from compressed_tensors.quantization import QuantizationStrategy
@@ -209,8 +209,7 @@
209209
from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
210210
AttentionMetadata,
211211
AttentionMetadataBuilder,
212-
AttentionState, MLAAttentionImpl,
213-
T)
212+
AttentionState, MLAAttentionImpl)
214213
from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
215214
compute_slot_mapping_start_idx,
216215
get_flash_attn_version,
@@ -723,6 +722,9 @@ def advance_step(self,
723722
block_tables=self.block_tables)
724723

725724

725+
T = TypeVar("T", bound=MLACommonMetadata)
726+
727+
726728
class MLACommonMetadataBuilder(AttentionMetadataBuilder[MLACommonMetadata]):
727729
"""
728730
NOTE: Please read the comment at the top of the file before trying to
@@ -1268,12 +1270,15 @@ def _compute_prefill_context(
12681270
assert prefill_metadata.context_chunk_cu_seq_lens is not None
12691271
assert prefill_metadata.context_chunk_starts is not None
12701272
assert prefill_metadata.context_chunk_max_seq_lens is not None
1271-
# assert prefill_metadata.block_tables is not None
12721273
assert prefill_metadata.context_lens_tensor is not None
12731274

12741275
output = None
12751276
iters = len(prefill_metadata.context_chunk_seq_tot)
1276-
assert hasattr(attn_metadata, "chunked_prefill_workspace")
1277+
1278+
# Fetch from attn_metadata directly, since it is late-bound by
1279+
# MLAAttentionState; grabbing it directly from `attn_metadata` can avoid
1280+
# any weirdness around prefill_metadata caching
1281+
assert attn_metadata.chunked_prefill_workspace is not None
12771282
workspace = attn_metadata.chunked_prefill_workspace
12781283

12791284
for i in range(iters):
@@ -1345,9 +1350,8 @@ def _forward_prefill(
13451350
kv_c_normed: torch.Tensor,
13461351
k_pe: torch.Tensor,
13471352
kv_c_and_k_pe_cache: torch.Tensor,
1348-
attn_metadata: T,
1353+
attn_metadata: MLACommonMetadata,
13491354
) -> torch.Tensor:
1350-
assert isinstance(attn_metadata, MLACommonMetadata)
13511355

13521356
prefill_metadata = attn_metadata.prefill_metadata
13531357
assert prefill_metadata is not None

vllm/attention/backends/triton_mla.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -80,12 +80,14 @@ def _forward_decode(
8080
dtype=q.dtype,
8181
device=q.device)
8282

83+
num_kv_splits = 4 # TODO: heuristic
84+
8385
# TODO(lucas) Allocate ahead of time
8486
attn_logits = torch.empty(
8587
(
8688
B,
8789
self.num_heads,
88-
4, #attn_metadata.num_kv_splits,
90+
num_kv_splits,
8991
# NOTE(lucas) idk why the +1 is here but sglang has it so we
9092
# just mirror that
9193
self.kv_lora_rank + 1,
@@ -100,16 +102,9 @@ def _forward_decode(
100102
PAGE_SIZE = kv_c_and_k_pe_cache.size(1)
101103

102104
# Run MQA
103-
decode_attention_fwd(
104-
q,
105-
kv_c_and_k_pe_cache,
106-
kv_c_cache,
107-
o,
108-
decode_meta.block_tables,
109-
decode_meta.seq_lens_tensor,
110-
attn_logits,
111-
4,
112-
self.scale, #attn_metadata.num_kv_splits
113-
PAGE_SIZE)
105+
decode_attention_fwd(q, kv_c_and_k_pe_cache, kv_c_cache, o,
106+
decode_meta.block_tables,
107+
decode_meta.seq_lens_tensor, attn_logits,
108+
num_kv_splits, self.scale, PAGE_SIZE)
114109

115110
return self._v_up_proj_and_o_proj(o)

0 commit comments

Comments
 (0)