Skip to content
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,20 @@ class LRUEvictionPolicy : public BaseEvictionPolicy

bool verifyQueueIntegrity() override;

private:
//! \brief Add block to free block queue. Records all info needed to remove block from queue
void addToFreeBlockQueue(BlockPtr block, bool toFront);

//! \brief Remove block from free block queue, using info stored when block was added. It is always safe to call
//! this method \param block The block to be removed from free blocks queue. NOOP if block is not currently in queue
//! \return True if block was removed from free queue.
[[nodiscard]] bool removeFromFreeBlockQueue(BlockPtr block);

private:
// Queues of available leaf blocks, split by cache level and priority level
std::vector<std::vector<FreeBlocksQueue>> mFreeQueues;
// Iterators to block entries in mFreeQueues
std::vector<std::optional<FreeBlocksQueue::iterator>> mFreeBlockIterators;
// Iterators to block entries in mFreeQueues. Holds ALL arguments needed to remove block from free queue
std::vector<std::optional<std::tuple<SizeType32, SizeType32, FreeBlocksQueue::iterator>>> mFreeBlockIterators;
// Amount of free blocks at each cache level
std::vector<SizeType32> mNumFreeBlocksPerLevel;
// Secondary offload threshold. Blocks below this priority won't be evicted.
Expand Down
3 changes: 0 additions & 3 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -337,9 +337,6 @@ class KVCacheBlock : public std::enable_shared_from_this<KVCacheBlock>
// Distinct from getPrevBlock() (which navigates the radix lookup tree)
BlockPtr mPrevBlockInSeq;

// Iterator pointing to this block in mFreeBlocks.
std::optional<FreeBlocksQueue::iterator> mFreeBlockIterator;

// Flag indicating if block is full
bool mIsFull;

Expand Down
76 changes: 42 additions & 34 deletions cpp/tensorrt_llm/batch_manager/evictionPolicy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ void LRUEvictionPolicy::initialize(std::vector<BlockPtr>& mAllBlocksById, std::v
for (SizeType32 blockId = 0; blockId < sizes[cacheLevel]; blockId++)
{
// Initialize all blocks to be the default priority level
mFreeBlockIterators.emplace_back(freeQueue.insert(freeQueue.end(), mAllBlocksById[startIdx + blockId]));
mFreeBlockIterators.emplace_back(std::make_tuple(
cacheLevel, defaultPriorityIdx, freeQueue.insert(freeQueue.end(), mAllBlocksById[startIdx + blockId])));
}

startIdx += sizes[cacheLevel];
Expand Down Expand Up @@ -122,6 +123,37 @@ std::tuple<BlockPtr, bool> LRUEvictionPolicy::getFreeBlock(SizeType32 cacheLevel
TLLM_THROW("No free block found. This shouldn't happen!");
}

void LRUEvictionPolicy::addToFreeBlockQueue(BlockPtr block, bool toFront)
{
// Info needed to add (and later remove) block to queue.
SizeType32 const cacheLevel = getCacheLevel(block);
SizeType32 const id = block->getBlockId();
SizeType32 const priority = getPriorityIdx(block->getPriority());
// Add block to queue along with all info required to later remove it
auto& q = mFreeQueues[cacheLevel][priority];
auto insertItr = toFront ? q.begin() : q.end();
mFreeBlockIterators[id] = std::make_tuple(cacheLevel, priority, q.insert(insertItr, block));
mNumFreeBlocksPerLevel[cacheLevel]++;
}

bool LRUEvictionPolicy::removeFromFreeBlockQueue(BlockPtr block)
{
SizeType32 const id = block->getBlockId();
if (mFreeBlockIterators[id].has_value())
{
// Remove block using stored values, not current values
auto [cacheLevel, priority, it] = mFreeBlockIterators[id].value();
mFreeQueues[cacheLevel][priority].erase(it);
mNumFreeBlocksPerLevel[cacheLevel] -= 1;
mFreeBlockIterators[id] = std::nullopt;
return true;
}
else
{
return false;
}
}

void LRUEvictionPolicy::releaseBlock(BlockPtr block)
{
releaseBlock(block, false);
Expand All @@ -136,21 +168,8 @@ void LRUEvictionPolicy::releaseBlock(BlockPtr block, bool toFront)
"Attempted to release the cached-blocks root into the eviction queue");
// Placeholder blocks have no physical GPU memory and must never enter the eviction queue.
TLLM_CHECK_WITH_INFO(!block->isPlaceholder(), "Attempted to release a placeholder block into the eviction queue");
SizeType32 const cacheLevel = getCacheLevel(block);
SizeType32 const id = block->getBlockId();

// If there are no children, this is a leaf block. Insert into a queue.
auto& q = mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())];
if (toFront)
{
mFreeBlockIterators[id] = q.insert(q.begin(), block);
}
else
{
mFreeBlockIterators[id] = q.insert(q.end(), block);
}

mNumFreeBlocksPerLevel[cacheLevel]++;
addToFreeBlockQueue(block, toFront);

if (block->getDurationMs().has_value()
&& block->getPriority() != executor::KvCacheRetentionConfig::kDefaultRetentionPriority)
Expand All @@ -174,23 +193,16 @@ void LRUEvictionPolicy::claimBlock(BlockPtr block)
void LRUEvictionPolicy::claimBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority,
std::optional<std::chrono::milliseconds> durationMs)
{
SizeType32 const id = block->getBlockId();
SizeType32 const cacheLevel = getCacheLevel(block);

if (mFreeBlockIterators[id] != std::nullopt)
if (removeFromFreeBlockQueue(block))
{
mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())].erase(*mFreeBlockIterators[id]);
mNumFreeBlocksPerLevel[cacheLevel] -= 1;
// Only need to remove block from expiring block heap if block was removed from free blocks queue
mExpiringBlockHeap.erase(block);
}

mFreeBlockIterators[id] = std::nullopt;

if (priority.has_value())
{
block->setPriority(*priority);
}

mExpiringBlockHeap.erase(block);
block->setDurationMs(durationMs);
}

Expand All @@ -209,19 +221,15 @@ void LRUEvictionPolicy::refresh()
break;
}

auto const id = block->getBlockId();
auto const level = getCacheLevel(block);

mExpiringBlockHeap.erase(mExpiringBlockHeap.begin());

if (mFreeBlockIterators[id] != std::nullopt)
// Add block to free blocks queue with default priority if it was removed from another priority free blocks
// queue
if (removeFromFreeBlockQueue(block))
{
// This is already in another queue. Delete it, and bring it down to the default queue
mFreeQueues[level][getPriorityIdx(block->getPriority())].erase(*mFreeBlockIterators[id]);
auto& q = mFreeQueues[level][getPriorityIdx(kDefaultPriority)];
mFreeBlockIterators[id] = q.insert(q.end(), block);
block->setPriority(kDefaultPriority);
addToFreeBlockQueue(block, /*toFront*/ false);
}
block->setPriority(kDefaultPriority);
}
}

Expand Down
19 changes: 13 additions & 6 deletions cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ KVCacheBlock::KVCacheBlock(IdType blockId, tk::KVCacheIndex blockIdx)
, mWindowSize{std::numeric_limits<int>::max()}
// sentinel: unattached; valid sizes are >= 1 or kRecurrentStates (-1)
, mIsPlaceholder{false}
, mFreeBlockIterator(std::nullopt)
, mIsFull{false}
, mPriority{executor::KvCacheRetentionConfig::kDefaultRetentionPriority}
, mDurationMs{std::nullopt}
Expand Down Expand Up @@ -990,6 +989,13 @@ BlockPtr WindowBlockManager::getFreeBlock(GenerationRequest& sequence, executor:
{
// Offload block in primary memory before repurposing
auto offloadBlock = std::get<0>(mEvictionPolicy->getFreeBlock(kSecondaryLevel));
// Claim both blocks BEFORE the swap so getCacheLevel() still reflects the
// actual free-queue each iterator belongs to. After swapMemoryPoolBlockOffset()
// isPrimary() is inverted for both blocks, so calling claimBlock() post-swap
// would make it erase from the wrong std::list -- undefined behaviour.
// This ordering matches WindowBlockManager::offloadBlock().
mEvictionPolicy->claimBlock(block); // block is PRIMARY -> erases from primary queue
mEvictionPolicy->claimBlock(offloadBlock); // offloadBlock is SECONDARY -> erases from secondary queue
mTransferManager->offload(block, offloadBlock, mPools, 0, mode, directory);
// swap linear block offsets (i.e. make block the offload block)
block->swapMemoryPoolBlockOffset(offloadBlock);
Expand All @@ -1000,18 +1006,19 @@ BlockPtr WindowBlockManager::getFreeBlock(GenerationRequest& sequence, executor:
tle::KVCacheUpdatedData(block->getHash()).cacheLevelUpdated(kPrimaryLevel, kSecondaryLevel),
mWindowSize);
}
// Update the block as a secondary block (maintaining its priority)
mEvictionPolicy->claimBlock(block);
// Release the block into secondary block queue
// Release block (now SECONDARY after the swap) into the secondary queue
mEvictionPolicy->releaseBlock(block);
// We have the offloaded block as the block to use now.
block = offloadBlock;
}
else
{
// Claim the block in primary block queue
mEvictionPolicy->claimBlock(block, priority, durationMs);
}

// Removes children of the block from the search tree
freeChildren(block);
// Claim the block in primary block queue
mEvictionPolicy->claimBlock(block, priority, durationMs);

// Deal with invalidating block save for reuse for the sequence
if (mBlockToSequence.count(block->getBlockId()) > 0)
Expand Down
Loading