NVIDIA · thorjohnsen · Mar 17, 2026 · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
@@ -91,11 +91,20 @@ class LRUEvictionPolicy : public BaseEvictionPolicy
 
     bool verifyQueueIntegrity() override;
 
+private:
+    //! \brief Add block to free block queue. Records all info needed to remove block from queue
+    void addToFreeBlockQueue(BlockPtr block, bool toFront);
+
+    //! \brief Remove block from free block queue, using info stored when block was added. It is always safe to call
+    //! this method \param block The block to be removed from free blocks queue. NOOP if block is not currently in queue
+    //! \return True if block was removed from free queue.
+    [[nodiscard]] bool removeFromFreeBlockQueue(BlockPtr block);
+
 private:
     // Queues of available leaf blocks, split by cache level and priority level
     std::vector<std::vector<FreeBlocksQueue>> mFreeQueues;
-    // Iterators to block entries in mFreeQueues
-    std::vector<std::optional<FreeBlocksQueue::iterator>> mFreeBlockIterators;
+    // Iterators to block entries in mFreeQueues. Holds ALL arguments needed to remove block from free queue
+    std::vector<std::optional<std::tuple<SizeType32, SizeType32, FreeBlocksQueue::iterator>>> mFreeBlockIterators;
     // Amount of free blocks at each cache level
     std::vector<SizeType32> mNumFreeBlocksPerLevel;
     // Secondary offload threshold. Blocks below this priority won't be evicted.

@@ -337,9 +337,6 @@ class KVCacheBlock : public std::enable_shared_from_this<KVCacheBlock>
     // Distinct from getPrevBlock() (which navigates the radix lookup tree)
     BlockPtr mPrevBlockInSeq;
 
-    // Iterator pointing to this block in mFreeBlocks.
-    std::optional<FreeBlocksQueue::iterator> mFreeBlockIterator;
-
     // Flag indicating if block is full
     bool mIsFull;
 

@@ -65,7 +65,8 @@ void LRUEvictionPolicy::initialize(std::vector<BlockPtr>& mAllBlocksById, std::v
         for (SizeType32 blockId = 0; blockId < sizes[cacheLevel]; blockId++)
         {
             // Initialize all blocks to be the default priority level
-            mFreeBlockIterators.emplace_back(freeQueue.insert(freeQueue.end(), mAllBlocksById[startIdx + blockId]));
+            mFreeBlockIterators.emplace_back(std::make_tuple(
+                cacheLevel, defaultPriorityIdx, freeQueue.insert(freeQueue.end(), mAllBlocksById[startIdx + blockId])));
         }
 
         startIdx += sizes[cacheLevel];
@@ -122,6 +123,37 @@ std::tuple<BlockPtr, bool> LRUEvictionPolicy::getFreeBlock(SizeType32 cacheLevel
     TLLM_THROW("No free block found. This shouldn't happen!");
 }
 
+void LRUEvictionPolicy::addToFreeBlockQueue(BlockPtr block, bool toFront)
+{
+    // Info needed to add (and later remove) block to queue.
+    SizeType32 const cacheLevel = getCacheLevel(block);
+    SizeType32 const id = block->getBlockId();
+    SizeType32 const priority = getPriorityIdx(block->getPriority());
+    // Add block to queue along with all info required to later remove it
+    auto& q = mFreeQueues[cacheLevel][priority];
+    auto insertItr = toFront ? q.begin() : q.end();
+    mFreeBlockIterators[id] = std::make_tuple(cacheLevel, priority, q.insert(insertItr, block));
+    mNumFreeBlocksPerLevel[cacheLevel]++;
+}
+
+bool LRUEvictionPolicy::removeFromFreeBlockQueue(BlockPtr block)
+{
+    SizeType32 const id = block->getBlockId();
+    if (mFreeBlockIterators[id].has_value())
+    {
+        // Remove block using stored values, not current values
+        auto [cacheLevel, priority, it] = mFreeBlockIterators[id].value();
+        mFreeQueues[cacheLevel][priority].erase(it);
+        mNumFreeBlocksPerLevel[cacheLevel] -= 1;
+        mFreeBlockIterators[id] = std::nullopt;
+        return true;
+    }
+    else
+    {
+        return false;
+    }
+}
+
 void LRUEvictionPolicy::releaseBlock(BlockPtr block)
 {
     releaseBlock(block, false);
@@ -136,21 +168,8 @@ void LRUEvictionPolicy::releaseBlock(BlockPtr block, bool toFront)
         "Attempted to release the cached-blocks root into the eviction queue");
     // Placeholder blocks have no physical GPU memory and must never enter the eviction queue.
     TLLM_CHECK_WITH_INFO(!block->isPlaceholder(), "Attempted to release a placeholder block into the eviction queue");
-    SizeType32 const cacheLevel = getCacheLevel(block);
-    SizeType32 const id = block->getBlockId();
-
-    // If there are no children, this is a leaf block. Insert into a queue.
-    auto& q = mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())];
-    if (toFront)
-    {
-        mFreeBlockIterators[id] = q.insert(q.begin(), block);
-    }
-    else
-    {
-        mFreeBlockIterators[id] = q.insert(q.end(), block);
-    }
 
-    mNumFreeBlocksPerLevel[cacheLevel]++;
+    addToFreeBlockQueue(block, toFront);
 
     if (block->getDurationMs().has_value()
         && block->getPriority() != executor::KvCacheRetentionConfig::kDefaultRetentionPriority)
@@ -174,23 +193,16 @@ void LRUEvictionPolicy::claimBlock(BlockPtr block)
 void LRUEvictionPolicy::claimBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority,
     std::optional<std::chrono::milliseconds> durationMs)
 {
-    SizeType32 const id = block->getBlockId();
-    SizeType32 const cacheLevel = getCacheLevel(block);
-
-    if (mFreeBlockIterators[id] != std::nullopt)
+    if (removeFromFreeBlockQueue(block))
     {
-        mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())].erase(*mFreeBlockIterators[id]);
-        mNumFreeBlocksPerLevel[cacheLevel] -= 1;
+        // Only need to remove block from expiring block heap if block was removed from free blocks queue
+        mExpiringBlockHeap.erase(block);
     }
 
-    mFreeBlockIterators[id] = std::nullopt;
-
     if (priority.has_value())
     {
         block->setPriority(*priority);
     }
-
-    mExpiringBlockHeap.erase(block);
     block->setDurationMs(durationMs);
 }
 
@@ -209,19 +221,15 @@ void LRUEvictionPolicy::refresh()
             break;
         }
 
-        auto const id = block->getBlockId();
-        auto const level = getCacheLevel(block);
-
         mExpiringBlockHeap.erase(mExpiringBlockHeap.begin());
 
-        if (mFreeBlockIterators[id] != std::nullopt)
+        // Add block to free blocks queue with default priority if it was removed from another priority free blocks
+        // queue
+        if (removeFromFreeBlockQueue(block))
         {
-            // This is already in another queue. Delete it, and bring it down to the default queue
-            mFreeQueues[level][getPriorityIdx(block->getPriority())].erase(*mFreeBlockIterators[id]);
-            auto& q = mFreeQueues[level][getPriorityIdx(kDefaultPriority)];
-            mFreeBlockIterators[id] = q.insert(q.end(), block);
+            block->setPriority(kDefaultPriority);
+            addToFreeBlockQueue(block, /*toFront*/ false);
         }
-        block->setPriority(kDefaultPriority);
     }
 }
 

@@ -99,7 +99,6 @@ KVCacheBlock::KVCacheBlock(IdType blockId, tk::KVCacheIndex blockIdx)
     , mWindowSize{std::numeric_limits<int>::max()}
     // sentinel: unattached; valid sizes are >= 1 or kRecurrentStates (-1)
     , mIsPlaceholder{false}
-    , mFreeBlockIterator(std::nullopt)
     , mIsFull{false}
     , mPriority{executor::KvCacheRetentionConfig::kDefaultRetentionPriority}
     , mDurationMs{std::nullopt}
@@ -990,6 +989,13 @@ BlockPtr WindowBlockManager::getFreeBlock(GenerationRequest& sequence, executor:
     {
         // Offload block in primary memory before repurposing
         auto offloadBlock = std::get<0>(mEvictionPolicy->getFreeBlock(kSecondaryLevel));
+        // Claim both blocks BEFORE the swap so getCacheLevel() still reflects the
+        // actual free-queue each iterator belongs to.  After swapMemoryPoolBlockOffset()
+        // isPrimary() is inverted for both blocks, so calling claimBlock() post-swap
+        // would make it erase from the wrong std::list -- undefined behaviour.
+        // This ordering matches WindowBlockManager::offloadBlock().
+        mEvictionPolicy->claimBlock(block);        // block is PRIMARY  -> erases from primary queue
+        mEvictionPolicy->claimBlock(offloadBlock); // offloadBlock is SECONDARY -> erases from secondary queue
         mTransferManager->offload(block, offloadBlock, mPools, 0, mode, directory);
         // swap linear block offsets (i.e. make block the offload block)
         block->swapMemoryPoolBlockOffset(offloadBlock);
@@ -1000,18 +1006,19 @@ BlockPtr WindowBlockManager::getFreeBlock(GenerationRequest& sequence, executor:
                 tle::KVCacheUpdatedData(block->getHash()).cacheLevelUpdated(kPrimaryLevel, kSecondaryLevel),
                 mWindowSize);
         }
-        // Update the block as a secondary block (maintaining its priority)
-        mEvictionPolicy->claimBlock(block);
-        // Release the block into secondary block queue
+        // Release block (now SECONDARY after the swap) into the secondary queue
         mEvictionPolicy->releaseBlock(block);
         // We have the offloaded block as the block to use now.
         block = offloadBlock;
     }
+    else
+    {
+        // Claim the block in primary block queue
+        mEvictionPolicy->claimBlock(block, priority, durationMs);
+    }
 
     // Removes children of the block from the search tree
     freeChildren(block);
-    // Claim the block in primary block queue
-    mEvictionPolicy->claimBlock(block, priority, durationMs);
 
     // Deal with invalidating block save for reuse for the sequence
     if (mBlockToSequence.count(block->getBlockId()) > 0)