Skip to content

Commit 5bf57de

Browse files
authored
Merge branch 'main' into clean_cuda_graph
2 parents a396d06 + 493da02 commit 5bf57de

File tree

1,664 files changed

+15551
-6070
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,664 files changed

+15551
-6070
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ TensorRT LLM
1818
<div align="left">
1919

2020
## Tech Blogs
21+
22+
* [10/13] Scaling Expert Parallelism in TensorRT LLM (Part 3: Pushing the Performance Boundary)
23+
[➡️ link](./docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md)
24+
2125
* [09/26] Inference Time Compute Implementation in TensorRT LLM
2226
[➡️ link](./docs/source/blogs/tech_blog/blog13_Inference_Time_Compute_Implementation_in_TensorRT-LLM.md)
2327

cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,8 @@ class LRUEvictionPolicy : public BaseEvictionPolicy
9292
bool verifyQueueIntegrity() override;
9393

9494
private:
95-
// Check if the block should be added to mFreeQueues.
96-
bool isReleasedLeafBlock(BlockPtr const& block);
97-
9895
// Queues of available leaf blocks, split by cache level and priority level
9996
std::vector<std::vector<FreeBlocksQueue>> mFreeQueues;
100-
// All blocks that have been released, along with the amount of released children
101-
std::vector<std::unordered_set<SizeType32>> mReleasedBlocks;
10297
// Iterators to block entries in mFreeQueues
10398
std::vector<std::optional<FreeBlocksQueue::iterator>> mFreeBlockIterators;
10499
// Amount of free blocks at each cache level

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 92 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -130,14 +130,17 @@ struct WindowSizeMetadata
130130
SizeType32 temporaryAttentionWindow; // Temporary kv cache length per sequence.
131131
// Only needed when chunked context + sliding window attention are used
132132
// together. And it should only be considered when allocating blocks.
133+
SizeType32 windowSize;
134+
bool isSWA;
133135

134136
std::string toString()
135137
{
136138
return tensorrt_llm::common::fmtstr(
137139
"WindowSizeMetadata{ .allottedPrimaryBlocks=%d, .allottedSecondaryBlocks=%d, .absolutePoolsOffset=%d, "
138-
".numPools=%d, .maxTokenNum=%d, .maxBlocksPerSeq=%d, .maxNumBlocks=%d, .temporaryAttentionWindow=%d }",
140+
".numPools=%d, .maxTokenNum=%d, .maxBlocksPerSeq=%d, .maxNumBlocks=%d, .temporaryAttentionWindow=%d, "
141+
".windowSize=%d, .isSWA=%d }",
139142
allottedPrimaryBlocks, allottedSecondaryBlocks, absolutePoolsOffset, numPools, maxTokenNum, maxBlocksPerSeq,
140-
maxNumBlocks, temporaryAttentionWindow);
143+
maxNumBlocks, temporaryAttentionWindow, windowSize, isSWA);
141144
}
142145
};
143146

@@ -512,6 +515,8 @@ class GenerationRequest
512515
executor::KvCacheRetentionConfig mKvCacheRetentionConfig;
513516
// Number of front blocks removed from the sequence
514517
SizeType32 mNumFrontBlocksRemoved;
518+
// Set of used blocks by the sequence
519+
std::set<KVCacheBlock::IdType> mUsedBlocks;
515520
};
516521

517522
// attach metadata to a pool pointer
@@ -628,15 +633,15 @@ class WindowBlockManager
628633
void releaseLastBlock(GenerationRequest& sequence);
629634

630635
//! \brief Detach front block from the sequence
631-
void detachFrontBlock(GenerationRequest& sequence, bool isEnableBlockReuse);
636+
void detachFrontBlock(GenerationRequest& sequence);
632637

633638
//! \brief Add/detach block(s) to/from the sequence if needed
634639
//! \details When we need a new block, we add it. For sliding window
635640
//! attention (SWA), when a block goes out-of-window (OOW), we detach it
636-
//! and store it if reuse is enabled. If this called in the first step of
637-
//! the generation phase, we may detach more than a single block since
638-
//! there may be more than one context block that goes OOW.
639-
void adjustBlocksIfNeeded(GenerationRequest& sequence, bool isEnableBlockReuse);
641+
//! If this called in the first step of the generation phase, we may detach
642+
//! more than a single block since there may be more than one context block
643+
//! that goes OOW.
644+
void adjustBlocksIfNeeded(GenerationRequest& sequence);
640645

641646
[[nodiscard]] SizeType32 getWindowSize() const noexcept
642647
{
@@ -763,7 +768,7 @@ class WindowBlockManager
763768

764769
//! \brief Bring offloaded block from secondary to primary memory.
765770
//! \details Does nothing if block is already in primary memory.
766-
void onboardBlock(BlockPtr const& offloadBlock,
771+
void onboardBlock(GenerationRequest& sequence, BlockPtr const& offloadBlock,
767772
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
768773

769774
//! \brief Bring block from primary to secondary memory.
@@ -826,6 +831,23 @@ class WindowBlockManager
826831
//! \brief Unpin blocks by starting from a block id and walking prev pointers.
827832
void unpinBlocksById(KVCacheBlock::IdType blockId);
828833

834+
void initializeSequenceStorageValidity(LlmRequest::RequestIdType requestId)
835+
{
836+
mIsValidStoreForReuseSequence[requestId] = true;
837+
}
838+
839+
void releaseSequenceStorageValidity(LlmRequest::RequestIdType requestId)
840+
{
841+
mIsValidStoreForReuseSequence.erase(requestId);
842+
}
843+
844+
//! \brief Return whether this sequence is valid for store for reuse
845+
[[nodiscard]] bool isSequenceValidForStoreForReuse(LlmRequest::RequestIdType requestId) const
846+
{
847+
TLLM_CHECK_WITH_INFO(mIsValidStoreForReuseSequence.count(requestId) > 0, "Sequence should be bookkeeped");
848+
return mIsValidStoreForReuseSequence.at(requestId);
849+
}
850+
829851
private:
830852
//! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
831853
void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);
@@ -842,18 +864,17 @@ class WindowBlockManager
842864
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
843865

844866
//! \brief Free block and all it's descendants. This makes block a claimed leaf block.
845-
void freeChildren(BlockPtr const& block, executor::RetentionPriority priority,
846-
std::optional<std::chrono::milliseconds> durationMs);
867+
void freeChildren(BlockPtr const& block);
847868

848869
//! \brief Find block least likely to be reused, free it if necessary and return.
849-
[[nodiscard]] BlockPtr getFreeBlock(
870+
//! \param sequence Sequence which the free block is allocated for
871+
[[nodiscard]] BlockPtr getFreeBlock(GenerationRequest& sequence,
850872
executor::RetentionPriority = executor::KvCacheRetentionConfig::kDefaultRetentionPriority,
851873
std::optional<std::chrono::milliseconds> durationMs = std::nullopt,
852874
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
853875

854-
//! \brief Free block from previous block and claim it from free blocks list.
855-
void claimLeafBlock(BlockPtr const& block, std::optional<executor::RetentionPriority> priority = std::nullopt,
856-
std::optional<std::chrono::milliseconds> durationMs = std::nullopt);
876+
//! \brief Calls KVCacheBlock::freeLeafBlock to remove block from search tree.
877+
void freeLeafBlock(BlockPtr const& block);
857878

858879
//! \brief For FP4 quantization. Creates pool objects for FP4 block scalars.
859880
void createBlockScalePools(SizeType32 blockSize);
@@ -933,6 +954,14 @@ class WindowBlockManager
933954

934955
// Mutex for the cached blocks root
935956
std::mutex mCachedBlocksRootMutex;
957+
958+
// Record which sequence is using the block
959+
std::map<KVCacheBlock::IdType, LlmRequest::RequestIdType> mBlockToSequence;
960+
// Record whether a sequence has all blocks held valid.
961+
// The boolean value is set to true upon first encounter of a new sequence.
962+
// It may be invalidated to false when other sequence acquires a block that
963+
// is used by another sequence.
964+
std::map<LlmRequest::RequestIdType, bool> mIsValidStoreForReuseSequence;
936965
};
937966

938967
class BlockManager
@@ -1008,7 +1037,7 @@ class BlockManager
10081037

10091038
//! \brief Bring block from primary to secondary memory for window size.
10101039
//! \details Does nothing if block is already in primary memory.
1011-
void onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowSize,
1040+
void onboardBlock(GenerationRequest& sequence, BlockPtr const& offloadBlock, SizeType32 windowSize,
10121041
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
10131042

10141043
//! \brief Bring block from primary to secondary memory for window size.
@@ -1239,10 +1268,52 @@ class BlockManager
12391268
//! \brief Add/detach block(s) to/from the sequence if needed
12401269
//! \details When we need a new block, we add it. For sliding window
12411270
//! attention (SWA), when a block goes out-of-window (OOW), we detach it
1242-
//! and store it if reuse is enabled. If this called in the first step of
1243-
//! the generation phase, we may detach more than a single block since
1244-
//! there may be more than one context block that goes OOW.
1245-
void adjustBlocksIfNeeded(GenerationRequest& sequence, bool isEnableBlockReuse);
1271+
//! If this called in the first step of the generation phase, we may
1272+
//! detach more than a single block since there may be more than one
1273+
//! context block that goes OOW.
1274+
void adjustBlocksIfNeeded(GenerationRequest& sequence);
1275+
1276+
//! \brief Return whether the sequence is already managed by the block manager
1277+
[[nodiscard]] bool isSequenceHeld(LlmRequest::RequestIdType requestId) const
1278+
{
1279+
return mManagedSequences.count(requestId) > 0;
1280+
}
1281+
1282+
//! \brief Add a sequence to the managed sequences
1283+
//! \details Take the sequence into account for the manager. Initialize
1284+
//! sequence storage validity under all window sizes.
1285+
void holdSequence(LlmRequest::RequestIdType requestId)
1286+
{
1287+
mManagedSequences.insert(requestId);
1288+
for (auto const& [windowSize, metadata] : mWindowSizeToMetadata)
1289+
{
1290+
mWindowBlockManagers.at(windowSize).initializeSequenceStorageValidity(requestId);
1291+
}
1292+
}
1293+
1294+
//! \brief Remove a sequence from the managed sequences.
1295+
//! \details Remove sequence from the managed sequences and remove sequence
1296+
//! storage
1297+
void releaseSequence(LlmRequest::RequestIdType requestId)
1298+
{
1299+
mManagedSequences.erase(requestId);
1300+
for (auto const& [windowSize, metadata] : mWindowSizeToMetadata)
1301+
{
1302+
mWindowBlockManagers.at(windowSize).releaseSequenceStorageValidity(requestId);
1303+
}
1304+
}
1305+
1306+
//! \brief Return whether the sequence is still valid for store-for-reuse
1307+
//! regarding the specific window size.
1308+
//! \details Currently this utility function is only used under
1309+
//! kvCacheManagerTest.cpp. Checking for store-for-reuse for each window
1310+
//! size is done in an iterating fashion under BlockManager::releaseBlocks.
1311+
bool isSequenceValidForStoreForReuse(LlmRequest::RequestIdType requestId, SizeType32 windowSize) const
1312+
{
1313+
TLLM_CHECK_WITH_INFO(
1314+
mWindowBlockManagers.count(windowSize) > 0, "Querying window size is not found under mWindowBlockManager");
1315+
return mWindowBlockManagers.at(windowSize).isSequenceValidForStoreForReuse(requestId);
1316+
}
12461317

12471318
private:
12481319
[[nodiscard]] WindowBlockManager const& windowManagerByLayer(SizeType32 layerIdx) const
@@ -1278,6 +1349,8 @@ class BlockManager
12781349
std::vector<SizeType32> mLayerToWindowSize;
12791350
std::vector<SizeType32> mAbsolutePoolToWindowSize;
12801351
std::vector<SizeType32> mAbsolutePoolToRelativePoolIndex;
1352+
// Record what sequences are currently managed by the block manager
1353+
std::set<LlmRequest::RequestIdType> mManagedSequences;
12811354
};
12821355

12831356
struct OffsetTableDimensions

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1828,9 +1828,10 @@ class GenericLlmRequest
18281828

18291829
void updatePerfMetrics(executor::IterationType iter)
18301830
{
1831+
auto const currentTokenTime = getSteadyClockNow();
1832+
18311833
if (!mPerfMetrics.firstIter)
18321834
{
1833-
auto const currentTokenTime = getSteadyClockNow();
18341835
mPerfMetrics.firstIter = iter;
18351836
mPerfMetrics.timingMetrics.firstTokenTime = currentTokenTime;
18361837
}
@@ -1839,7 +1840,6 @@ class GenericLlmRequest
18391840

18401841
if (isFinished())
18411842
{
1842-
auto const currentTokenTime = getSteadyClockNow();
18431843
mPerfMetrics.lastIter = iter;
18441844
mPerfMetrics.timingMetrics.lastTokenTime = currentTokenTime;
18451845
}

cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,9 @@ class CacheSender::Impl
291291
mSelfState.setCommState(std::move(commState));
292292
}
293293

294-
[[nodiscard]] size_t getCounterpartsCount(LlmRequest::RequestIdType requestId) const
294+
[[nodiscard]] size_t getCounterpartsCount(LlmRequest::RequestIdType requestId)
295295
{
296+
std::unique_lock<std::mutex> lock(mMtxForMap);
296297
auto it = mRequestToSession.find(requestId);
297298
TLLM_CHECK(it != mRequestToSession.end());
298299
return it->second.getConnections().size();
@@ -400,10 +401,14 @@ class CacheSender::Impl
400401

401402
void sendReadySignal(LlmRequest::RequestIdType requestId, bool isReady)
402403
{
403-
auto it = mRequestToSession.find(requestId);
404-
TLLM_CHECK(it != mRequestToSession.end());
405-
auto& session = it->second;
406-
auto const& connections = session.getConnections();
404+
TransferSession* session = nullptr;
405+
{
406+
std::unique_lock<std::mutex> lock(mMtxForMap);
407+
auto it = mRequestToSession.find(requestId);
408+
TLLM_CHECK(it != mRequestToSession.end());
409+
session = std::addressof(it->second);
410+
}
411+
auto const& connections = session->getConnections();
407412
for (size_t i = 0; i < connections.size(); i++)
408413
{
409414
auto* agentConnectionManager = dynamic_cast<executor::kv_cache::AgentConnectionManager*>(mManager);

cpp/tensorrt_llm/batch_manager/evictionPolicy.cpp

Lines changed: 7 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -59,15 +59,13 @@ void LRUEvictionPolicy::initialize(std::vector<BlockPtr>& mAllBlocksById, std::v
5959
{
6060
mFreeBlockIterators.reserve(mFreeBlockIterators.size() + sizes[cacheLevel]);
6161
mFreeQueues.emplace_back(std::vector<FreeBlocksQueue>(kMaxPriority - kMinPriority + 1));
62-
mReleasedBlocks.emplace_back(std::unordered_set<SizeType32>());
6362

6463
auto& freeQueue = mFreeQueues[cacheLevel][defaultPriorityIdx];
6564

6665
for (SizeType32 blockId = 0; blockId < sizes[cacheLevel]; blockId++)
6766
{
6867
// Initialize all blocks to be the default priority level
6968
mFreeBlockIterators.emplace_back(freeQueue.insert(freeQueue.end(), mAllBlocksById[startIdx + blockId]));
70-
mReleasedBlocks[cacheLevel].insert(startIdx + blockId);
7169
}
7270

7371
startIdx += sizes[cacheLevel];
@@ -134,35 +132,15 @@ void LRUEvictionPolicy::releaseBlock(BlockPtr block, bool toFront)
134132
SizeType32 const cacheLevel = getCacheLevel(block);
135133
SizeType32 const id = block->getBlockId();
136134

137-
mReleasedBlocks[cacheLevel].insert(id);
138-
139-
// It's possible that this block is the child of a matched block that's in mFreeQueues. If this happens, we need to
140-
// remove the parent from mFreeQueues, since it's no longer a released leaf block.
141-
auto parent = block->getPrevBlock();
142-
if (parent != nullptr)
135+
// If there are no children, this is a leaf block. Insert into a queue.
136+
auto& q = mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())];
137+
if (toFront)
143138
{
144-
auto const parentId = parent->getBlockId();
145-
if (parentId != KVCacheBlock::kCachedBlocksRootId && mFreeBlockIterators[parent->getBlockId()] != std::nullopt
146-
&& !isReleasedLeafBlock(parent))
147-
{
148-
mFreeQueues[getCacheLevel(parent)][getPriorityIdx(parent->getPriority())].erase(
149-
*mFreeBlockIterators[parentId]);
150-
mFreeBlockIterators[parentId] = std::nullopt;
151-
}
139+
mFreeBlockIterators[id] = q.insert(q.begin(), block);
152140
}
153-
154-
if (mFreeBlockIterators[block->getBlockId()] == std::nullopt && isReleasedLeafBlock(block))
141+
else
155142
{
156-
// If there are no children, this is a leaf block. Insert into a queue.
157-
auto& q = mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())];
158-
if (toFront)
159-
{
160-
mFreeBlockIterators[id] = q.insert(q.begin(), block);
161-
}
162-
else
163-
{
164-
mFreeBlockIterators[id] = q.insert(q.end(), block);
165-
}
143+
mFreeBlockIterators[id] = q.insert(q.end(), block);
166144
}
167145

168146
mNumFreeBlocksPerLevel[cacheLevel]++;
@@ -192,24 +170,10 @@ void LRUEvictionPolicy::claimBlock(BlockPtr block, std::optional<executor::Reten
192170
SizeType32 const id = block->getBlockId();
193171
SizeType32 const cacheLevel = getCacheLevel(block);
194172

195-
if (mReleasedBlocks[cacheLevel].find(id) != mReleasedBlocks[cacheLevel].end())
196-
{
197-
mNumFreeBlocksPerLevel[cacheLevel] -= 1;
198-
mReleasedBlocks[cacheLevel].erase(id);
199-
}
200-
201173
if (mFreeBlockIterators[id] != std::nullopt)
202174
{
203175
mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())].erase(*mFreeBlockIterators[id]);
204-
205-
BlockPtr const parent = block->getPrevBlock();
206-
207-
if (parent.get() != nullptr && parent->getBlockId() != KVCacheBlock::kCachedBlocksRootId
208-
&& mFreeBlockIterators[parent->getBlockId()] == std::nullopt && isReleasedLeafBlock(parent))
209-
{
210-
auto& q = mFreeQueues[getCacheLevel(parent)][getPriorityIdx(parent->getPriority())];
211-
mFreeBlockIterators[parent->getBlockId()] = q.insert(q.end(), parent);
212-
}
176+
mNumFreeBlocksPerLevel[cacheLevel] -= 1;
213177
}
214178

215179
mFreeBlockIterators[id] = std::nullopt;
@@ -223,28 +187,6 @@ void LRUEvictionPolicy::claimBlock(BlockPtr block, std::optional<executor::Reten
223187
block->setDurationMs(durationMs);
224188
}
225189

226-
bool LRUEvictionPolicy::isReleasedLeafBlock(BlockPtr const& block)
227-
{
228-
SizeType32 const blockCacheLevel = getCacheLevel(block);
229-
230-
if (mReleasedBlocks[blockCacheLevel].find(block->getBlockId()) == mReleasedBlocks[blockCacheLevel].end())
231-
{
232-
return false;
233-
}
234-
235-
for (auto const& p : block->getNextBlocks())
236-
{
237-
SizeType32 const childCacheLevel = getCacheLevel(p.second);
238-
if (mReleasedBlocks[childCacheLevel].find(p.second->getBlockId()) != mReleasedBlocks[childCacheLevel].end()
239-
&& childCacheLevel <= blockCacheLevel)
240-
{
241-
return false;
242-
}
243-
}
244-
245-
return true;
246-
}
247-
248190
std::chrono::steady_clock::time_point::duration LRUEvictionPolicy::getTime() const
249191
{
250192
return std::chrono::steady_clock::now().time_since_epoch();

0 commit comments

Comments
 (0)