[TRTLLM-5532][feat] store the block of context request into kv cache (NVIDIA#6683)

byshiue · web-flow · commit 83dbc6c75dd1 · 2025-08-11T16:14:52.000+08:00
Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -2043,10 +2043,13 @@ void KVCacheManager::addSequence(
 void KVCacheManager::storeContextBlocks(LlmRequest const& llmRequest)
 {
     auto const requestId = llmRequest.mRequestId;
-    auto& sequence = getSequence(requestId);
-    if (mEnableBlockReuse && !sequence.isCyclic() && !llmRequest.isDummyRequest())
+    if (mSequences.find(requestId) != mSequences.end())
     {
-        mBlockManager.storeContextBlocks(sequence, llmRequest);
+        auto& sequence = getSequence(requestId);
+        if (mEnableBlockReuse && !sequence.isCyclic() && !llmRequest.isDummyRequest())
+        {
+            mBlockManager.storeContextBlocks(sequence, llmRequest);
+        }
     }
 }
 
diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -492,6 +492,10 @@ def update_resources(self, scheduled_batch: ScheduledRequests):
                 if request.py_rewind_len > 0:
                     self.rewind_kv_cache(request, request.py_rewind_len)
 
+        # For context requests, we store the blocks for reuse.
+        for request in scheduled_batch.context_requests:
+            self.impl.store_context_blocks(request)
+
     def free_resources(self, request: LlmRequest):
         self.impl.remove_sequence(request.py_request_id, request)
 

Original file line number	Diff line number	Diff line change
`@@ -2043,10 +2043,13 @@ void KVCacheManager::addSequence(`
`2043`	`2043`	`void KVCacheManager::storeContextBlocks(LlmRequest const& llmRequest)`
`2044`	`2044`	`{`
`2045`	`2045`	`auto const requestId = llmRequest.mRequestId;`
`2046`		`- auto& sequence = getSequence(requestId);`
`2047`		`- if (mEnableBlockReuse && !sequence.isCyclic() && !llmRequest.isDummyRequest())`
	`2046`	`+ if (mSequences.find(requestId) != mSequences.end())`
`2048`	`2047`	`{`
`2049`		`- mBlockManager.storeContextBlocks(sequence, llmRequest);`
	`2048`	`+ auto& sequence = getSequence(requestId);`
	`2049`	`+ if (mEnableBlockReuse && !sequence.isCyclic() && !llmRequest.isDummyRequest())`
	`2050`	`+ {`
	`2051`	`+ mBlockManager.storeContextBlocks(sequence, llmRequest);`
	`2052`	`+ }`
`2050`	`2053`	`}`
`2051`	`2054`	`}`
`2052`	`2055`