Skip to content

Commit 2c8b44c

Browse files
liji-nv (Lance Liao)
authored and committed
[None][refactor] Move _update_k_cache into sparse_attn_indexer
Move _update_k_cache call to the top of sparse_attn_indexer so the k cache is populated right before prefill chunks gather from it. Remove pre_indexer (now redundant); forward() and forward_dsa_proj both call pre_indexer_proj directly. Signed-off-by: Jin Li <59594262+liji-nv@users.noreply.github.com>
1 parent 165d4b9 commit 2c8b44c

File tree

2 files changed

+5
-23
lines changed

2 files changed

+5
-23
lines changed

tensorrt_llm/_torch/attention_backend/sparse/dsa.py

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1395,6 +1395,9 @@ def sparse_attn_indexer(
13951395
weights: torch.Tensor,
13961396
use_custom_topk: bool = True,
13971397
) -> torch.Tensor:
1398+
# Update the indexer k cache before prefill chunks gather from it.
1399+
self._update_k_cache(k_fp8, k_scale, metadata)
1400+
13981401
num_contexts = metadata.num_contexts
13991402
num_generations = metadata.num_generations
14001403
num_ctx_tokens = metadata.num_ctx_tokens
@@ -1669,24 +1672,6 @@ def _prep_q_or_k(self, qk_pe: torch.Tensor, qk_nope: torch.Tensor):
16691672
qk_pe, qk_nope, self.scale_fmt == "ue8m0")
16701673
return fp8_out, scale
16711674

1672-
@torch.inference_mode()
1673-
def pre_indexer(
1674-
self, qr: torch.Tensor, hidden_states: torch.Tensor,
1675-
metadata: DSAtrtllmAttentionMetadata, position_ids: torch.Tensor
1676-
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
1677-
"""Token-wise projections, FP8 quantize, weight scaling, and k cache update.
1678-
1679-
Runs the full indexer pre-computation including k cache update.
1680-
Used by the eager path (Indexer.forward) where everything runs
1681-
outside CUDA graph capture.
1682-
1683-
Returns (q_fp8, k_fp8, k_scale, weights).
1684-
"""
1685-
q_fp8, k_fp8, k_scale, weights = self.pre_indexer_proj(
1686-
qr, hidden_states, position_ids)
1687-
self._update_k_cache(k_fp8, k_scale, metadata)
1688-
return q_fp8, k_fp8, k_scale, weights
1689-
16901675
def pre_indexer_proj(
16911676
self, qr: torch.Tensor, hidden_states: torch.Tensor,
16921677
position_ids: torch.Tensor
@@ -1733,8 +1718,8 @@ def pre_indexer_proj(
17331718
def forward(self, qr: torch.Tensor, hidden_states: torch.Tensor,
17341719
metadata: DSAtrtllmAttentionMetadata,
17351720
position_ids: torch.Tensor):
1736-
q_fp8, k_fp8, k_scale, weights = self.pre_indexer(
1737-
qr, hidden_states, metadata, position_ids)
1721+
q_fp8, k_fp8, k_scale, weights = self.pre_indexer_proj(
1722+
qr, hidden_states, position_ids)
17381723

17391724
# Return topk indices buffer for sparse attention [num_tokens, index_topk]
17401725
return self.sparse_attn_indexer(metadata, hidden_states, q_fp8, k_fp8,

tensorrt_llm/_torch/modules/attention.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1786,9 +1786,6 @@ def forward_dsa_attn(
17861786
k_fp8 = k_fp8[:num_tokens, ...]
17871787
k_scale = k_scale[:num_tokens, ...]
17881788
weights = weights[:num_tokens, ...]
1789-
# Update the indexer k cache here (outside CUDA graph) because
1790-
# it accesses batch-specific metadata (slot_mapping_fp8/scale).
1791-
self.mqa.indexer._update_k_cache(k_fp8, k_scale, attn_metadata)
17921789
topk_indices = self.mqa.indexer.sparse_attn_indexer(
17931790
attn_metadata,
17941791
q, # only used for shape/device in buffer allocation

0 commit comments

Comments (0)