Skip to content

Commit b41a20b

Browse files
committed
Add support for KVCache reuse for DSv32
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
1 parent cc5a058 commit b41a20b

File tree

2 files changed

+11
-15
lines changed

2 files changed

+11
-15
lines changed

tensorrt_llm/_torch/attention_backend/sparse/dsa.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -930,22 +930,24 @@ def prepare(metadata: DSAtrtllmAttentionMetadata):
930930
start_idx=0,
931931
)
932932

933-
if len(chunk_groups) > 1:
933+
if len(chunk_groups
934+
) > 1 or metadata.enable_context_mla_with_cached_kv:
934935
metadata.indexer_prefill_chunks = [
935936
Indexer.prepare_one_prefill_chunk(
936937
metadata,
937938
chunk_specs,
938939
) for chunk_specs in chunk_groups
939940
]
940941
else:
941-
# Single chunk - use non-chunked fallback path
942942
metadata.indexer_prefill_chunks = None
943943

944-
host_cu_seqlen_ks, _ = compute_cu_seqlen_kv_bounds_with_cache(
944+
host_cu_seqlen_ks, host_cu_seqlen_ke = compute_cu_seqlen_kv_bounds_with_cache(
945945
host_seq_lens, num_contexts, num_ctx_tokens, host_cached_tokens)
946946

947947
metadata.cu_seqlen_ks[:num_ctx_tokens].copy_(host_cu_seqlen_ks,
948948
non_blocking=True)
949+
metadata.cu_seqlen_ke[:num_ctx_tokens].copy_(host_cu_seqlen_ke,
950+
non_blocking=True)
949951

950952
# Prepare for decode phase if there are generation requests
951953
if num_generations > 0:
@@ -1016,9 +1018,9 @@ def prepare(metadata: DSAtrtllmAttentionMetadata):
10161018
metadata.slot_mapping_scale[:total_tokens].copy_(
10171019
metadata.host_slot_mapping_scale[:total_tokens], non_blocking=True)
10181020

1019-
# Only when MLA chunked prefill is enabled, we need to gather the full KV for indexer's logit computation.
1021+
# When chunked prefill or KVCache reuse is enabled, we need to gather the full KV for indexer's logit computation.
10201022
# Indexer's own chunking does not need full KV gathering, instead it gathers only the current chunk with loop-based gathering.
1021-
_need_full_kv_gathering = num_contexts > 0 and has_mla_chunked_prefill
1023+
_need_full_kv_gathering = num_contexts > 0 and metadata.enable_context_mla_with_cached_kv
10221024
if _need_full_kv_gathering:
10231025
total_kv_len = metadata.host_ctx_kv_indptr[num_contexts].item()
10241026
total_kv_per_request = seq_lens[:

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 4 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -2417,17 +2417,13 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
24172417
if get_sm_version() == 100 or get_sm_version() == 103:
24182418
moe_backend = "DEEPGEMM" if moe_backend == "_DEFAULT" else moe_backend
24192419
moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
2420-
# TODO: Support block reuse for DeepSeek-V3.2
2421-
kv_cache_config = KvCacheConfig(enable_block_reuse=False,
2422-
free_gpu_memory_fraction=0.6,
2420+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
24232421
tokens_per_block=64)
24242422
else:
24252423
if moe_backend != "_DEFAULT":
24262424
pytest.skip("Not supported MoE backend!")
24272425
moe_config = MoeConfig()
2428-
# TODO: Support block reuse for DeepSeek-V3.2
2429-
kv_cache_config = KvCacheConfig(enable_block_reuse=False,
2430-
free_gpu_memory_fraction=0.7,
2426+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
24312427
tokens_per_block=64)
24322428

24332429
pytorch_config = dict(
@@ -2490,8 +2486,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
24902486
"MOE TRTLLM backend does not support SM version 120 or 121")
24912487

24922488
moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
2493-
kv_cache_config = KvCacheConfig(enable_block_reuse=True,
2494-
free_gpu_memory_fraction=0.7,
2489+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
24952490
tokens_per_block=64)
24962491
cuda_graph_config = CudaGraphConfig(
24972492
enable_padding=True,
@@ -2550,8 +2545,7 @@ def test_nvfp4_multi_gpus_chunked_prefill(self, tp_size, pp_size, ep_size,
25502545
"MOE TRTLLM backend does not support SM version 120 or 121")
25512546

25522547
moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
2553-
kv_cache_config = KvCacheConfig(enable_block_reuse=False,
2554-
free_gpu_memory_fraction=0.7,
2548+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
25552549
tokens_per_block=64)
25562550
cuda_graph_config = CudaGraphConfig(
25572551
enable_padding=True,

0 commit comments

Comments (0)