diff --git a/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h b/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h index affa83279b7..abab168bb14 100644 --- a/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h +++ b/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h @@ -91,11 +91,20 @@ class LRUEvictionPolicy : public BaseEvictionPolicy bool verifyQueueIntegrity() override; +private: + //! \brief Add block to free block queue. Records all info needed to remove block from queue + void addToFreeBlockQueue(BlockPtr block, bool toFront); + + //! \brief Remove block from free block queue, using info stored when block was added. It is always safe to call + //! this method \param block The block to be removed from free blocks queue. NOOP if block is not currently in queue + //! \return True if block was removed from free queue. + [[nodiscard]] bool removeFromFreeBlockQueue(BlockPtr block); + private: // Queues of available leaf blocks, split by cache level and priority level std::vector> mFreeQueues; - // Iterators to block entries in mFreeQueues - std::vector> mFreeBlockIterators; + // Iterators to block entries in mFreeQueues. Holds ALL arguments needed to remove block from free queue + std::vector>> mFreeBlockIterators; // Amount of free blocks at each cache level std::vector mNumFreeBlocksPerLevel; // Secondary offload threshold. Blocks below this priority won't be evicted. diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index 64ff4c0d3fc..5b8f6f2a9e7 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -337,9 +337,6 @@ class KVCacheBlock : public std::enable_shared_from_this // Distinct from getPrevBlock() (which navigates the radix lookup tree) BlockPtr mPrevBlockInSeq; - // Iterator pointing to this block in mFreeBlocks. - std::optional mFreeBlockIterator; - // Flag indicating if block is full bool mIsFull; diff --git a/cpp/tensorrt_llm/batch_manager/evictionPolicy.cpp b/cpp/tensorrt_llm/batch_manager/evictionPolicy.cpp index 97108823f4d..6b843b6c346 100644 --- a/cpp/tensorrt_llm/batch_manager/evictionPolicy.cpp +++ b/cpp/tensorrt_llm/batch_manager/evictionPolicy.cpp @@ -65,7 +65,8 @@ void LRUEvictionPolicy::initialize(std::vector& mAllBlocksById, std::v for (SizeType32 blockId = 0; blockId < sizes[cacheLevel]; blockId++) { // Initialize all blocks to be the default priority level - mFreeBlockIterators.emplace_back(freeQueue.insert(freeQueue.end(), mAllBlocksById[startIdx + blockId])); + mFreeBlockIterators.emplace_back(std::make_tuple( + cacheLevel, defaultPriorityIdx, freeQueue.insert(freeQueue.end(), mAllBlocksById[startIdx + blockId]))); } startIdx += sizes[cacheLevel]; @@ -122,6 +123,37 @@ std::tuple LRUEvictionPolicy::getFreeBlock(SizeType32 cacheLevel TLLM_THROW("No free block found. This shouldn't happen!"); } +void LRUEvictionPolicy::addToFreeBlockQueue(BlockPtr block, bool toFront) +{ + // Info needed to add (and later remove) block to queue. + SizeType32 const cacheLevel = getCacheLevel(block); + SizeType32 const id = block->getBlockId(); + SizeType32 const priority = getPriorityIdx(block->getPriority()); + // Add block to queue along with all info required to later remove it + auto& q = mFreeQueues[cacheLevel][priority]; + auto insertItr = toFront ? q.begin() : q.end(); + mFreeBlockIterators[id] = std::make_tuple(cacheLevel, priority, q.insert(insertItr, block)); + mNumFreeBlocksPerLevel[cacheLevel]++; +} + +bool LRUEvictionPolicy::removeFromFreeBlockQueue(BlockPtr block) +{ + SizeType32 const id = block->getBlockId(); + if (mFreeBlockIterators[id].has_value()) + { + // Remove block using stored values, not current values + auto [cacheLevel, priority, it] = mFreeBlockIterators[id].value(); + mFreeQueues[cacheLevel][priority].erase(it); + mNumFreeBlocksPerLevel[cacheLevel] -= 1; + mFreeBlockIterators[id] = std::nullopt; + return true; + } + else + { + return false; + } +} + void LRUEvictionPolicy::releaseBlock(BlockPtr block) { releaseBlock(block, false); @@ -136,21 +168,8 @@ void LRUEvictionPolicy::releaseBlock(BlockPtr block, bool toFront) "Attempted to release the cached-blocks root into the eviction queue"); // Placeholder blocks have no physical GPU memory and must never enter the eviction queue. TLLM_CHECK_WITH_INFO(!block->isPlaceholder(), "Attempted to release a placeholder block into the eviction queue"); - SizeType32 const cacheLevel = getCacheLevel(block); - SizeType32 const id = block->getBlockId(); - - // If there are no children, this is a leaf block. Insert into a queue. - auto& q = mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())]; - if (toFront) - { - mFreeBlockIterators[id] = q.insert(q.begin(), block); - } - else - { - mFreeBlockIterators[id] = q.insert(q.end(), block); - } - mNumFreeBlocksPerLevel[cacheLevel]++; + addToFreeBlockQueue(block, toFront); if (block->getDurationMs().has_value() && block->getPriority() != executor::KvCacheRetentionConfig::kDefaultRetentionPriority) @@ -174,23 +193,16 @@ void LRUEvictionPolicy::claimBlock(BlockPtr block) void LRUEvictionPolicy::claimBlock(BlockPtr block, std::optional priority, std::optional durationMs) { - SizeType32 const id = block->getBlockId(); - SizeType32 const cacheLevel = getCacheLevel(block); - - if (mFreeBlockIterators[id] != std::nullopt) + if (removeFromFreeBlockQueue(block)) { - mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())].erase(*mFreeBlockIterators[id]); - mNumFreeBlocksPerLevel[cacheLevel] -= 1; + // Only need to remove block from expiring block heap if block was removed from free blocks queue + mExpiringBlockHeap.erase(block); } - mFreeBlockIterators[id] = std::nullopt; - if (priority.has_value()) { block->setPriority(*priority); } - - mExpiringBlockHeap.erase(block); block->setDurationMs(durationMs); } @@ -209,19 +221,15 @@ void LRUEvictionPolicy::refresh() break; } - auto const id = block->getBlockId(); - auto const level = getCacheLevel(block); - mExpiringBlockHeap.erase(mExpiringBlockHeap.begin()); - if (mFreeBlockIterators[id] != std::nullopt) + // Add block to free blocks queue with default priority if it was removed from another priority free blocks + // queue + if (removeFromFreeBlockQueue(block)) { - // This is already in another queue. Delete it, and bring it down to the default queue - mFreeQueues[level][getPriorityIdx(block->getPriority())].erase(*mFreeBlockIterators[id]); - auto& q = mFreeQueues[level][getPriorityIdx(kDefaultPriority)]; - mFreeBlockIterators[id] = q.insert(q.end(), block); + block->setPriority(kDefaultPriority); + addToFreeBlockQueue(block, /*toFront*/ false); } - block->setPriority(kDefaultPriority); } } diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp index 107c703b0b7..df82ccc4ac3 100755 --- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp @@ -99,7 +99,6 @@ KVCacheBlock::KVCacheBlock(IdType blockId, tk::KVCacheIndex blockIdx) , mWindowSize{std::numeric_limits::max()} // sentinel: unattached; valid sizes are >= 1 or kRecurrentStates (-1) , mIsPlaceholder{false} - , mFreeBlockIterator(std::nullopt) , mIsFull{false} , mPriority{executor::KvCacheRetentionConfig::kDefaultRetentionPriority} , mDurationMs{std::nullopt} @@ -990,6 +989,13 @@ BlockPtr WindowBlockManager::getFreeBlock(GenerationRequest& sequence, executor: { // Offload block in primary memory before repurposing auto offloadBlock = std::get<0>(mEvictionPolicy->getFreeBlock(kSecondaryLevel)); + // Claim both blocks BEFORE the swap so getCacheLevel() still reflects the + // actual free-queue each iterator belongs to. After swapMemoryPoolBlockOffset() + // isPrimary() is inverted for both blocks, so calling claimBlock() post-swap + // would make it erase from the wrong std::list -- undefined behaviour. + // This ordering matches WindowBlockManager::offloadBlock(). + mEvictionPolicy->claimBlock(block); // block is PRIMARY -> erases from primary queue + mEvictionPolicy->claimBlock(offloadBlock); // offloadBlock is SECONDARY -> erases from secondary queue mTransferManager->offload(block, offloadBlock, mPools, 0, mode, directory); // swap linear block offsets (i.e. make block the offload block) block->swapMemoryPoolBlockOffset(offloadBlock); @@ -1000,18 +1006,19 @@ BlockPtr WindowBlockManager::getFreeBlock(GenerationRequest& sequence, executor: tle::KVCacheUpdatedData(block->getHash()).cacheLevelUpdated(kPrimaryLevel, kSecondaryLevel), mWindowSize); } - // Update the block as a secondary block (maintaining its priority) - mEvictionPolicy->claimBlock(block); - // Release the block into secondary block queue + // Release block (now SECONDARY after the swap) into the secondary queue mEvictionPolicy->releaseBlock(block); // We have the offloaded block as the block to use now. block = offloadBlock; } + else + { + // Claim the block in primary block queue + mEvictionPolicy->claimBlock(block, priority, durationMs); + } // Removes children of the block from the search tree freeChildren(block); - // Claim the block in primary block queue - mEvictionPolicy->claimBlock(block, priority, durationMs); // Deal with invalidating block save for reuse for the sequence if (mBlockToSequence.count(block->getBlockId()) > 0) diff --git a/cpp/tests/unit_tests/batch_manager/evictionPolicyTest.cpp b/cpp/tests/unit_tests/batch_manager/evictionPolicyTest.cpp index 96b6011eb56..bf5a6405299 100644 --- a/cpp/tests/unit_tests/batch_manager/evictionPolicyTest.cpp +++ b/cpp/tests/unit_tests/batch_manager/evictionPolicyTest.cpp @@ -223,6 +223,61 @@ TEST_F(LRUPolicyTest, TimedBlockTest) EXPECT_EQ(std::get<0>(policy->getFreeBlock(0)), block1); } +// Regression test for PR #12297: claimBlock() used getCacheLevel() (which reads isPrimary()) +// to locate the right free queue. If swapMemoryPoolBlockOffset() ran first — flipping +// isPrimary() on both blocks — claimBlock() would erase from the wrong std::list (UB). +// The fix stores (cacheLevel, priorityIdx, iterator) at enqueue time so the removal path +// is independent of the block's current isPrimary() value. +TEST_F(LRUPolicyTest, ClaimAfterSwapDoesNotCorruptQueues) +{ + // getFreeBlock is a peek — it does not claim. Grab one block from each level. + auto [primaryBlock, primaryShouldOffload] = policy->getFreeBlock(0); + auto [secondaryBlock, secondaryShouldOffload] = policy->getFreeBlock(1); + + ASSERT_NE(primaryBlock, nullptr); + ASSERT_NE(secondaryBlock, nullptr); + ASSERT_TRUE(primaryBlock->isPrimary()); + ASSERT_FALSE(secondaryBlock->isPrimary()); + + // Remove them from their queues so we can re-insert with a clean baseline. + policy->claimBlock(primaryBlock); + policy->claimBlock(secondaryBlock); + + // Re-insert: primaryBlock is enqueued into mFreeQueues[kPrimaryLevel=0], + // secondaryBlock is enqueued into mFreeQueues[kSecondaryLevel=1]. + // The stored tuple inside mFreeBlockIterators records (cacheLevel=0, ...) and + // (cacheLevel=1, ...) respectively at this point. + policy->releaseBlock(primaryBlock); + policy->releaseBlock(secondaryBlock); + + ASSERT_EQ(policy->getNumFreeBlocks(0), NUM_PRIMARY_BLOCKS); + ASSERT_EQ(policy->getNumFreeBlocks(1), NUM_SECONDARY_BLOCKS); + + // Simulate swapMemoryPoolBlockOffset: flip isPrimary() on both blocks. + // After this: primaryBlock->isPrimary() == false, secondaryBlock->isPrimary() == true. + // This is exactly what WindowBlockManager::getFreeBlock() did before calling claimBlock() + // in the pre-fix code — the ordering bug in PR #12297. + primaryBlock->swapMemoryPoolBlockOffset(secondaryBlock); + + ASSERT_FALSE(primaryBlock->isPrimary()); // confirm the swap happened + ASSERT_TRUE(secondaryBlock->isPrimary()); // confirm the swap happened + + // With old code (bare iterator + getCacheLevel()): getCacheLevel(primaryBlock) == 1 now, + // but primaryBlock's iterator lives in mFreeQueues[0] — erasing it from mFreeQueues[1] + // is undefined behavior and silently corrupts mNumFreeBlocksPerLevel counters. + // With the fix (stored tuple): claimBlock uses the recorded cacheLevel=0 and erases + // correctly from mFreeQueues[0], regardless of what isPrimary() currently returns. + policy->claimBlock(primaryBlock); + policy->claimBlock(secondaryBlock); + + // Each block must have been removed from its ORIGINAL queue, not the post-swap one. + EXPECT_EQ(policy->getNumFreeBlocks(0), NUM_PRIMARY_BLOCKS - 1); + EXPECT_EQ(policy->getNumFreeBlocks(1), NUM_SECONDARY_BLOCKS - 1); + + // No dangling iterators, double-erases, or corrupted list linkage. + EXPECT_TRUE(policy->verifyQueueIntegrity()); +} + TEST_F(KvCacheRetentionConfigTest, InitializeTest) { // Invalid EOS