NVIDIA
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h‎
Lines changed: 17 additions & 0 deletions b/‎cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp‎
Lines changed: 16 additions & 2 deletions b/‎cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp‎
Lines changed: 16 additions & 2 deletions
@@ -380,6 +380,7 @@ class GenerationRequest
         , mBeamWidth(beamWidth)
         , mKvCacheRetentionConfig(std::move(kvCacheRetentionConfig))
         , mNumFrontBlocksRemoved(0)
+        , mCurrentPrepopulatedPromptLen(std::numeric_limits<SizeType32>::max())
     {
         auto const numWindowSizes = windowSizeToMetadata.size();
         mCacheBlockIds.reserve(numWindowSizes);
@@ -500,6 +501,20 @@ class GenerationRequest
         return mKvCacheRetentionConfig.getDirectory();
     }
 
+    [[nodiscard]] SizeType32 getCurrentPrepopulatedPromptLen() const
+    {
+        return mCurrentPrepopulatedPromptLen; // SizeType32 max until the setter first lowers it
+    }
+
+    void setCurrentPrepopulatedPromptLen(SizeType32 currentPrepopulatedPromptLen)
+    {
+        TLLM_CHECK_WITH_INFO(currentPrepopulatedPromptLen <= mCurrentPrepopulatedPromptLen,
+            "currentPrepopulatedPromptLen must be updated non-increasingly due to the "
+            "assumption that smaller window sizes have shorter or equal "
+            "currentPrepopulatedPromptLen in WindowBlockManager::loadOrAllocateBlocks.");
+        mCurrentPrepopulatedPromptLen = currentPrepopulatedPromptLen; // checked above: the stored value never increases
+    }
+
 private:
     // Request id of the sequence
     LlmRequest::RequestIdType mRequestId;
@@ -517,6 +532,8 @@ class GenerationRequest
     SizeType32 mNumFrontBlocksRemoved;
     // Set of used blocks by the sequence
     std::set<KVCacheBlock::IdType> mUsedBlocks;
+    // Prepopulated prompt length recorded for this sequence; starts at SizeType32 max and may only stay equal or decrease
+    SizeType32 mCurrentPrepopulatedPromptLen;
 };
 
 // attach metadata to a pool pointer
 
@@ -1224,7 +1224,7 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
         auto [partialMatch, numMatched, matchingBlock] = searchRoot != nullptr && blockItr != blockKeys.end()
             ? searchRoot->findMatchingBlock(*blockItr, mEnablePartialReuse, mCopyOnPartialReuse)
             : std::make_tuple(false, 0, nullptr);
-        if (matchingBlock != nullptr)
+        if (matchingBlock != nullptr && numMatchedTokens + numMatched <= sequence.getCurrentPrepopulatedPromptLen())
         {
             KVCacheBlock::IdType matchingBlockId = matchingBlock->getBlockId();
 
@@ -1338,6 +1338,7 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
         }
     }
 
+    sequence.setCurrentPrepopulatedPromptLen(numMatchedTokens);
     return numMatchedTokens;
 }
 
@@ -1731,9 +1732,22 @@ std::optional<KVCacheBlock::IdType> BlockManager::releaseBlocks(
     // Released block will be stored when reuse is enabled.
     // Reuse is implied to be enabled if llmRequest is provided.
     std::optional<KVCacheBlock::IdType> lastStoredId = std::nullopt;
+
+    // For now, the attention kernel only accepts a single
+    // "prepopulatedPromptLen", that is, all window sizes will use the same
+    // prepopulated prompt length, so it is meaningless right now to save
+    // blocks only for a certain window size while blocks in the other
+    // window size are not valid for saving for reuse.
+    bool isAllWindowSizesValidForStoreForReuse = true;
+    for (auto& [windowSize, manager] : mWindowBlockManagers)
+    {
+        isAllWindowSizesValidForStoreForReuse &= manager.isSequenceValidForStoreForReuse(sequence.getRequestId());
+    }
+
     for (auto& [_, manager] : mWindowBlockManagers)
     {
-        if (!llmRequest.has_value() || llmRequest->isDummyRequest() || sequence.getBeamWidth() > 1)
+        if (!llmRequest.has_value() || llmRequest->isDummyRequest() || sequence.getBeamWidth() > 1
+            || !isAllWindowSizesValidForStoreForReuse)
         {
             lastStoredId = manager.releaseBlocks(sequence, std::nullopt);
         }
Original file line number	Diff line number	Diff line change
`@@ -1224,7 +1224,7 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&`
`1224`	`1224`	`auto [partialMatch, numMatched, matchingBlock] = searchRoot != nullptr && blockItr != blockKeys.end()`
`1225`	`1225`	`? searchRoot->findMatchingBlock(*blockItr, mEnablePartialReuse, mCopyOnPartialReuse)`
`1226`	`1226`	`: std::make_tuple(false, 0, nullptr);`
`1227`		`- if (matchingBlock != nullptr)`
	`1227`	`+ if (matchingBlock != nullptr && numMatchedTokens + numMatched <= sequence.getCurrentPrepopulatedPromptLen())`
`1228`	`1228`	`{`
`1229`	`1229`	`KVCacheBlock::IdType matchingBlockId = matchingBlock->getBlockId();`
`1230`	`1230`
`@@ -1338,6 +1338,7 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&`
`1338`	`1338`	`}`
`1339`	`1339`	`}`
`1340`	`1340`
	`1341`	`+ sequence.setCurrentPrepopulatedPromptLen(numMatchedTokens);`
`1341`	`1342`	`return numMatchedTokens;`
`1342`	`1343`	`}`
`1343`	`1344`
`@@ -1731,9 +1732,22 @@ std::optional<KVCacheBlock::IdType> BlockManager::releaseBlocks(`
`1731`	`1732`	`// Released block will be stored when reuse is enabled.`
`1732`	`1733`	`// Reuse is implied to be enabled if llmRequest is provided.`
`1733`	`1734`	`std::optional<KVCacheBlock::IdType> lastStoredId = std::nullopt;`
	`1735`	`+`
	`1736`	`+ // For now, the attention kernel only accepts a single`
	`1737`	`+ // "prepopulatedPromptLen", that is, all window sizes will use the same`
	`1738`	`+ // prepopulated prompt length, so it is meaningless right now to save`
	`1739`	`+ // blocks only for a certain window size while blocks in the other`
	`1740`	`+ // window size are not valid for saving for reuse.`
	`1741`	`+ bool isAllWindowSizesValidForStoreForReuse = true;`
	`1742`	`+ for (auto& [windowSize, manager] : mWindowBlockManagers)`
	`1743`	`+ {`
	`1744`	`+ isAllWindowSizesValidForStoreForReuse &= manager.isSequenceValidForStoreForReuse(sequence.getRequestId());`
	`1745`	`+ }`
	`1746`	`+`
`1734`	`1747`	`for (auto& [_, manager] : mWindowBlockManagers)`
`1735`	`1748`	`{`
`1736`		`- if (!llmRequest.has_value() \|\| llmRequest->isDummyRequest() \|\| sequence.getBeamWidth() > 1)`
	`1749`	`+ if (!llmRequest.has_value() \|\| llmRequest->isDummyRequest() \|\| sequence.getBeamWidth() > 1`
	`1750`	`+ \|\| !isAllWindowSizesValidForStoreForReuse)`
`1737`	`1751`	`{`
`1738`	`1752`	`lastStoredId = manager.releaseBlocks(sequence, std::nullopt);`
`1739`	`1753`	`}`