QiJune
diff --git a/‎README.md‎
Lines changed: 4 additions & 0 deletions b/‎README.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h‎
Lines changed: 0 additions & 5 deletions b/‎cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h‎
Lines changed: 92 additions & 19 deletions b/‎cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h‎
Lines changed: 92 additions & 19 deletions
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/llmRequest.h‎
Lines changed: 2 additions & 2 deletions b/‎cpp/include/tensorrt_llm/batch_manager/llmRequest.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp‎
Lines changed: 10 additions & 5 deletions b/‎cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp‎
Lines changed: 10 additions & 5 deletions
diff --git a/‎cpp/tensorrt_llm/batch_manager/evictionPolicy.cpp‎
Lines changed: 7 additions & 65 deletions b/‎cpp/tensorrt_llm/batch_manager/evictionPolicy.cpp‎
Lines changed: 7 additions & 65 deletions
@@ -18,6 +18,10 @@ TensorRT LLM
 <div align="left">
 
 ## Tech Blogs
+
+* [10/13] Scaling Expert Parallelism in TensorRT LLM (Part 3: Pushing the Performance Boundary)
+✨ [➡️ link](./docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md)
+
 * [09/26] Inference Time Compute Implementation in TensorRT LLM
 ✨ [➡️ link](./docs/source/blogs/tech_blog/blog13_Inference_Time_Compute_Implementation_in_TensorRT-LLM.md)
 
 
@@ -92,13 +92,8 @@ class LRUEvictionPolicy : public BaseEvictionPolicy
     bool verifyQueueIntegrity() override;
 
 private:
-    // Check if the block should be added to mFreeQueues.
-    bool isReleasedLeafBlock(BlockPtr const& block);
-
     // Queues of available leaf blocks, split by cache level and priority level
     std::vector<std::vector<FreeBlocksQueue>> mFreeQueues;
-    // All blocks that have been released, along with the amount of released children
-    std::vector<std::unordered_set<SizeType32>> mReleasedBlocks;
     // Iterators to block entries in mFreeQueues
     std::vector<std::optional<FreeBlocksQueue::iterator>> mFreeBlockIterators;
     // Amount of free blocks at each cache level
 
@@ -130,14 +130,17 @@ struct WindowSizeMetadata
     SizeType32 temporaryAttentionWindow; // Temporary kv cache length per sequence.
                                          // Only needed when chunked context + sliding window attention are used
                                          // together. And it should only be considered when allocating blocks.
+    SizeType32 windowSize;
+    bool isSWA;
 
     std::string toString()
     {
         return tensorrt_llm::common::fmtstr(
             "WindowSizeMetadata{ .allottedPrimaryBlocks=%d, .allottedSecondaryBlocks=%d, .absolutePoolsOffset=%d, "
-            ".numPools=%d, .maxTokenNum=%d, .maxBlocksPerSeq=%d, .maxNumBlocks=%d, .temporaryAttentionWindow=%d }",
+            ".numPools=%d, .maxTokenNum=%d, .maxBlocksPerSeq=%d, .maxNumBlocks=%d, .temporaryAttentionWindow=%d, "
+            ".windowSize=%d, .isSWA=%d }",
             allottedPrimaryBlocks, allottedSecondaryBlocks, absolutePoolsOffset, numPools, maxTokenNum, maxBlocksPerSeq,
-            maxNumBlocks, temporaryAttentionWindow);
+            maxNumBlocks, temporaryAttentionWindow, windowSize, isSWA);
     }
 };
 
@@ -512,6 +515,8 @@ class GenerationRequest
     executor::KvCacheRetentionConfig mKvCacheRetentionConfig;
     // Number of front blocks removed from the sequence
     SizeType32 mNumFrontBlocksRemoved;
+    // Set of blocks used by the sequence
+    std::set<KVCacheBlock::IdType> mUsedBlocks;
 };
 
 // attach metadata to a pool pointer
@@ -628,15 +633,15 @@ class WindowBlockManager
     void releaseLastBlock(GenerationRequest& sequence);
 
     //! \brief Detach front block from the sequence
-    void detachFrontBlock(GenerationRequest& sequence, bool isEnableBlockReuse);
+    void detachFrontBlock(GenerationRequest& sequence);
 
     //! \brief Add/detach block(s) to/from the sequence if needed
     //! \details When we need a new block, we add it. For sliding window
     //! attention (SWA), when a block goes out-of-window (OOW), we detach it
-    //! and store it if reuse is enabled. If this called in the first step of
-    //! the generation phase, we may detach more than a single block since
-    //! there may be more than one context block that goes OOW.
-    void adjustBlocksIfNeeded(GenerationRequest& sequence, bool isEnableBlockReuse);
+    //! If this is called in the first step of the generation phase, we may detach
+    //! more than a single block since there may be more than one context block
+    //! that goes OOW.
+    void adjustBlocksIfNeeded(GenerationRequest& sequence);
 
     [[nodiscard]] SizeType32 getWindowSize() const noexcept
     {
@@ -763,7 +768,7 @@ class WindowBlockManager
 
     //! \brief Bring offloaded block from secondary to primary memory.
     //! \details Does nothing if block is already in primary memory.
-    void onboardBlock(BlockPtr const& offloadBlock,
+    void onboardBlock(GenerationRequest& sequence, BlockPtr const& offloadBlock,
         executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
 
     //! \brief Bring block from primary to secondary memory.
@@ -826,6 +831,23 @@ class WindowBlockManager
     //! \brief Unpin blocks by starting from a block id and walking prev pointers.
     void unpinBlocksById(KVCacheBlock::IdType blockId);
 
+    void initializeSequenceStorageValidity(LlmRequest::RequestIdType requestId)
+    {
+        mIsValidStoreForReuseSequence[requestId] = true;
+    }
+
+    void releaseSequenceStorageValidity(LlmRequest::RequestIdType requestId)
+    {
+        mIsValidStoreForReuseSequence.erase(requestId);
+    }
+
+    //! \brief Return whether this sequence is valid for store for reuse
+    [[nodiscard]] bool isSequenceValidForStoreForReuse(LlmRequest::RequestIdType requestId) const
+    {
+        TLLM_CHECK_WITH_INFO(mIsValidStoreForReuseSequence.count(requestId) > 0, "Sequence should be bookkeeped");
+        return mIsValidStoreForReuseSequence.at(requestId);
+    }
+
 private:
     //! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
     void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);
@@ -842,18 +864,17 @@ class WindowBlockManager
         executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
 
     //! \brief Free block and all its descendants. This makes block a claimed leaf block.
-    void freeChildren(BlockPtr const& block, executor::RetentionPriority priority,
-        std::optional<std::chrono::milliseconds> durationMs);
+    void freeChildren(BlockPtr const& block);
 
     //! \brief Find block least likely to be reused, free it if necessary and return.
-    [[nodiscard]] BlockPtr getFreeBlock(
+    //! \param sequence Sequence which the free block is allocated for
+    [[nodiscard]] BlockPtr getFreeBlock(GenerationRequest& sequence,
         executor::RetentionPriority = executor::KvCacheRetentionConfig::kDefaultRetentionPriority,
         std::optional<std::chrono::milliseconds> durationMs = std::nullopt,
         executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
 
-    //! \brief Free block from previous block and claim it from free blocks list.
-    void claimLeafBlock(BlockPtr const& block, std::optional<executor::RetentionPriority> priority = std::nullopt,
-        std::optional<std::chrono::milliseconds> durationMs = std::nullopt);
+    //! \brief Calls KVCacheBlock::freeLeafBlock to remove block from search tree.
+    void freeLeafBlock(BlockPtr const& block);
 
     //! \brief For FP4 quantization. Creates pool objects for FP4 block scalars.
     void createBlockScalePools(SizeType32 blockSize);
@@ -933,6 +954,14 @@ class WindowBlockManager
 
     // Mutex for the cached blocks root
     std::mutex mCachedBlocksRootMutex;
+
+    // Record which sequence is using the block
+    std::map<KVCacheBlock::IdType, LlmRequest::RequestIdType> mBlockToSequence;
+    // Record whether all blocks held by a sequence are still valid.
+    // The flag is set to true when a new sequence is first encountered.
+    // It may be invalidated (set to false) when another sequence acquires a
+    // block that is still used by this sequence.
+    std::map<LlmRequest::RequestIdType, bool> mIsValidStoreForReuseSequence;
 };
 
 class BlockManager
@@ -1008,7 +1037,7 @@ class BlockManager
 
     //! \brief Bring offloaded block from secondary to primary memory for window size.
     //! \details Does nothing if block is already in primary memory.
-    void onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowSize,
+    void onboardBlock(GenerationRequest& sequence, BlockPtr const& offloadBlock, SizeType32 windowSize,
         executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
 
     //! \brief Bring block from primary to secondary memory for window size.
@@ -1239,10 +1268,52 @@ class BlockManager
     //! \brief Add/detach block(s) to/from the sequence if needed
     //! \details When we need a new block, we add it. For sliding window
     //! attention (SWA), when a block goes out-of-window (OOW), we detach it
-    //! and store it if reuse is enabled. If this called in the first step of
-    //! the generation phase, we may detach more than a single block since
-    //! there may be more than one context block that goes OOW.
-    void adjustBlocksIfNeeded(GenerationRequest& sequence, bool isEnableBlockReuse);
+    //! If this is called in the first step of the generation phase, we may
+    //! detach more than a single block since there may be more than one
+    //! context block that goes OOW.
+    void adjustBlocksIfNeeded(GenerationRequest& sequence);
+
+    //! \brief Return whether the sequence is already managed by the block manager
+    [[nodiscard]] bool isSequenceHeld(LlmRequest::RequestIdType requestId) const
+    {
+        return mManagedSequences.count(requestId) > 0;
+    }
+
+    //! \brief Add a sequence to the managed sequences
+    //! \details Take the sequence into account for the manager. Initialize
+    //! sequence storage validity under all window sizes.
+    void holdSequence(LlmRequest::RequestIdType requestId)
+    {
+        mManagedSequences.insert(requestId);
+        for (auto const& [windowSize, metadata] : mWindowSizeToMetadata)
+        {
+            mWindowBlockManagers.at(windowSize).initializeSequenceStorageValidity(requestId);
+        }
+    }
+
+    //! \brief Remove a sequence from the managed sequences.
+    //! \details Remove sequence from the managed sequences and remove its
+    //! storage-validity record under all window sizes.
+    void releaseSequence(LlmRequest::RequestIdType requestId)
+    {
+        mManagedSequences.erase(requestId);
+        for (auto const& [windowSize, metadata] : mWindowSizeToMetadata)
+        {
+            mWindowBlockManagers.at(windowSize).releaseSequenceStorageValidity(requestId);
+        }
+    }
+
+    //! \brief Return whether the sequence is still valid for store-for-reuse
+    //! regarding the specific window size.
+    //! \details Currently this utility function is only used in
+    //! kvCacheManagerTest.cpp. BlockManager::releaseBlocks performs the
+    //! store-for-reuse check itself by iterating over each window size.
+    bool isSequenceValidForStoreForReuse(LlmRequest::RequestIdType requestId, SizeType32 windowSize) const
+    {
+        TLLM_CHECK_WITH_INFO(
+            mWindowBlockManagers.count(windowSize) > 0, "Querying window size is not found under mWindowBlockManager");
+        return mWindowBlockManagers.at(windowSize).isSequenceValidForStoreForReuse(requestId);
+    }
 
 private:
     [[nodiscard]] WindowBlockManager const& windowManagerByLayer(SizeType32 layerIdx) const
@@ -1278,6 +1349,8 @@ class BlockManager
     std::vector<SizeType32> mLayerToWindowSize;
     std::vector<SizeType32> mAbsolutePoolToWindowSize;
     std::vector<SizeType32> mAbsolutePoolToRelativePoolIndex;
+    // Record what sequences are currently managed by the block manager
+    std::set<LlmRequest::RequestIdType> mManagedSequences;
 };
 
 struct OffsetTableDimensions
 
@@ -1828,9 +1828,10 @@ class GenericLlmRequest
 
     void updatePerfMetrics(executor::IterationType iter)
     {
+        auto const currentTokenTime = getSteadyClockNow();
+
         if (!mPerfMetrics.firstIter)
         {
-            auto const currentTokenTime = getSteadyClockNow();
             mPerfMetrics.firstIter = iter;
             mPerfMetrics.timingMetrics.firstTokenTime = currentTokenTime;
         }
@@ -1839,7 +1840,6 @@ class GenericLlmRequest
 
         if (isFinished())
         {
-            auto const currentTokenTime = getSteadyClockNow();
             mPerfMetrics.lastIter = iter;
             mPerfMetrics.timingMetrics.lastTokenTime = currentTokenTime;
         }
 
@@ -291,8 +291,9 @@ class CacheSender::Impl
         mSelfState.setCommState(std::move(commState));
     }
 
-    [[nodiscard]] size_t getCounterpartsCount(LlmRequest::RequestIdType requestId) const
+    [[nodiscard]] size_t getCounterpartsCount(LlmRequest::RequestIdType requestId)
     {
+        std::unique_lock<std::mutex> lock(mMtxForMap);
         auto it = mRequestToSession.find(requestId);
         TLLM_CHECK(it != mRequestToSession.end());
         return it->second.getConnections().size();
@@ -400,10 +401,14 @@ class CacheSender::Impl
 
     void sendReadySignal(LlmRequest::RequestIdType requestId, bool isReady)
     {
-        auto it = mRequestToSession.find(requestId);
-        TLLM_CHECK(it != mRequestToSession.end());
-        auto& session = it->second;
-        auto const& connections = session.getConnections();
+        TransferSession* session = nullptr;
+        {
+            std::unique_lock<std::mutex> lock(mMtxForMap);
+            auto it = mRequestToSession.find(requestId);
+            TLLM_CHECK(it != mRequestToSession.end());
+            session = std::addressof(it->second);
+        }
+        auto const& connections = session->getConnections();
         for (size_t i = 0; i < connections.size(); i++)
         {
             auto* agentConnectionManager = dynamic_cast<executor::kv_cache::AgentConnectionManager*>(mManager);
 
@@ -59,15 +59,13 @@ void LRUEvictionPolicy::initialize(std::vector<BlockPtr>& mAllBlocksById, std::v
     {
         mFreeBlockIterators.reserve(mFreeBlockIterators.size() + sizes[cacheLevel]);
         mFreeQueues.emplace_back(std::vector<FreeBlocksQueue>(kMaxPriority - kMinPriority + 1));
-        mReleasedBlocks.emplace_back(std::unordered_set<SizeType32>());
 
         auto& freeQueue = mFreeQueues[cacheLevel][defaultPriorityIdx];
 
         for (SizeType32 blockId = 0; blockId < sizes[cacheLevel]; blockId++)
         {
             // Initialize all blocks to be the default priority level
             mFreeBlockIterators.emplace_back(freeQueue.insert(freeQueue.end(), mAllBlocksById[startIdx + blockId]));
-            mReleasedBlocks[cacheLevel].insert(startIdx + blockId);
         }
 
         startIdx += sizes[cacheLevel];
@@ -134,35 +132,15 @@ void LRUEvictionPolicy::releaseBlock(BlockPtr block, bool toFront)
     SizeType32 const cacheLevel = getCacheLevel(block);
     SizeType32 const id = block->getBlockId();
 
-    mReleasedBlocks[cacheLevel].insert(id);
-
-    // It's possible that this block is the child of a matched block that's in mFreeQueues. If this happens, we need to
-    // remove the parent from mFreeQueues, since it's no longer a released leaf block.
-    auto parent = block->getPrevBlock();
-    if (parent != nullptr)
+    // If there are no children, this is a leaf block. Insert into a queue.
+    auto& q = mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())];
+    if (toFront)
     {
-        auto const parentId = parent->getBlockId();
-        if (parentId != KVCacheBlock::kCachedBlocksRootId && mFreeBlockIterators[parent->getBlockId()] != std::nullopt
-            && !isReleasedLeafBlock(parent))
-        {
-            mFreeQueues[getCacheLevel(parent)][getPriorityIdx(parent->getPriority())].erase(
-                *mFreeBlockIterators[parentId]);
-            mFreeBlockIterators[parentId] = std::nullopt;
-        }
+        mFreeBlockIterators[id] = q.insert(q.begin(), block);
     }
-
-    if (mFreeBlockIterators[block->getBlockId()] == std::nullopt && isReleasedLeafBlock(block))
+    else
     {
-        // If there are no children, this is a leaf block. Insert into a queue.
-        auto& q = mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())];
-        if (toFront)
-        {
-            mFreeBlockIterators[id] = q.insert(q.begin(), block);
-        }
-        else
-        {
-            mFreeBlockIterators[id] = q.insert(q.end(), block);
-        }
+        mFreeBlockIterators[id] = q.insert(q.end(), block);
     }
 
     mNumFreeBlocksPerLevel[cacheLevel]++;
@@ -192,24 +170,10 @@ void LRUEvictionPolicy::claimBlock(BlockPtr block, std::optional<executor::Reten
     SizeType32 const id = block->getBlockId();
     SizeType32 const cacheLevel = getCacheLevel(block);
 
-    if (mReleasedBlocks[cacheLevel].find(id) != mReleasedBlocks[cacheLevel].end())
-    {
-        mNumFreeBlocksPerLevel[cacheLevel] -= 1;
-        mReleasedBlocks[cacheLevel].erase(id);
-    }
-
     if (mFreeBlockIterators[id] != std::nullopt)
     {
         mFreeQueues[cacheLevel][getPriorityIdx(block->getPriority())].erase(*mFreeBlockIterators[id]);
-
-        BlockPtr const parent = block->getPrevBlock();
-
-        if (parent.get() != nullptr && parent->getBlockId() != KVCacheBlock::kCachedBlocksRootId
-            && mFreeBlockIterators[parent->getBlockId()] == std::nullopt && isReleasedLeafBlock(parent))
-        {
-            auto& q = mFreeQueues[getCacheLevel(parent)][getPriorityIdx(parent->getPriority())];
-            mFreeBlockIterators[parent->getBlockId()] = q.insert(q.end(), parent);
-        }
+        mNumFreeBlocksPerLevel[cacheLevel] -= 1;
     }
 
     mFreeBlockIterators[id] = std::nullopt;
@@ -223,28 +187,6 @@ void LRUEvictionPolicy::claimBlock(BlockPtr block, std::optional<executor::Reten
     block->setDurationMs(durationMs);
 }
 
-bool LRUEvictionPolicy::isReleasedLeafBlock(BlockPtr const& block)
-{
-    SizeType32 const blockCacheLevel = getCacheLevel(block);
-
-    if (mReleasedBlocks[blockCacheLevel].find(block->getBlockId()) == mReleasedBlocks[blockCacheLevel].end())
-    {
-        return false;
-    }
-
-    for (auto const& p : block->getNextBlocks())
-    {
-        SizeType32 const childCacheLevel = getCacheLevel(p.second);
-        if (mReleasedBlocks[childCacheLevel].find(p.second->getBlockId()) != mReleasedBlocks[childCacheLevel].end()
-            && childCacheLevel <= blockCacheLevel)
-        {
-            return false;
-        }
-    }
-
-    return true;
-}
-
 std::chrono::steady_clock::time_point::duration LRUEvictionPolicy::getTime() const
 {
     return std::chrono::steady_clock::now().time_since_epoch();
Original file line number	Diff line number	Diff line change
`@@ -1828,9 +1828,10 @@ class GenericLlmRequest`
`1828`	`1828`
`1829`	`1829`	`void updatePerfMetrics(executor::IterationType iter)`
`1830`	`1830`	`{`
	`1831`	`+ auto const currentTokenTime = getSteadyClockNow();`
	`1832`	`+`
`1831`	`1833`	`if (!mPerfMetrics.firstIter)`
`1832`	`1834`	`{`
`1833`		`- auto const currentTokenTime = getSteadyClockNow();`
`1834`	`1835`	`mPerfMetrics.firstIter = iter;`
`1835`	`1836`	`mPerfMetrics.timingMetrics.firstTokenTime = currentTokenTime;`
`1836`	`1837`	`}`
`@@ -1839,7 +1840,6 @@ class GenericLlmRequest`
`1839`	`1840`
`1840`	`1841`	`if (isFinished())`
`1841`	`1842`	`{`
`1842`		`- auto const currentTokenTime = getSteadyClockNow();`
`1843`	`1843`	`mPerfMetrics.lastIter = iter;`
`1844`	`1844`	`mPerfMetrics.timingMetrics.lastTokenTime = currentTokenTime;`
`1845`	`1845`	`}`