[https://nvbugs/5689235][fix] Fix cancellation+chunked prefill+disagg

Tabrizian · Tabrizian · commit 2d8a6db1300d · 2026-01-01T19:24:26.000Z
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -648,7 +648,7 @@ class WindowBlockManager
 
     void replaceSharedBlock(GenerationRequest& sequence, SizeType32 blockIdx);
 
-    [[nodiscard]] std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
+    [[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
         GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false);
 
     void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
@@ -853,8 +853,8 @@ class WindowBlockManager
     //! \param blockKeys Key of each block.
     //! \param blockIds Id of each block.
     //! \param pinBlocks If true, increment ref count for blocks while storing (pin on store).
-    //! \return Pair of (num blocks stored for reuse, id of the last block stored if any).
-    [[nodiscard]] std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> storeBlocks(
+    //! \return Pair of (num blocks stored for reuse, vector of pinned block IDs).
+    [[nodiscard]] std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> storeBlocks(
         std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
         bool pinBlocks = false);
 
@@ -886,8 +886,8 @@ class WindowBlockManager
 
     [[nodiscard]] std::shared_ptr<KVCacheBlock> findBlocksInReuseTreeByBlockKey(BlockKey const& blockKey);
 
-    //! \brief Unpin blocks by starting from a block id and walking prev pointers.
-    void unpinBlocksById(KVCacheBlock::IdType blockId);
+    //! \brief Unpin blocks by block ids directly
+    void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds);
 
     void initializeSequenceStorageValidity(LlmRequest::RequestIdType requestId)
     {
@@ -1103,7 +1103,7 @@ class BlockManager
     std::optional<KVCacheBlock::IdType> releaseBlocks(
         GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinBlocks = false);
 
-    [[nodiscard]] std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
+    [[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
         GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinBlocks = false);
 
     void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId);
@@ -1112,7 +1112,7 @@ class BlockManager
     /// @param sequence The generation request whose blocks should be pinned.
     void pinBlocks(GenerationRequest& sequence);
 
-    void unpinBlocksById(KVCacheBlock::IdType blockId);
+    void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds);
 
     void releaseLastBlock(GenerationRequest& sequence, SizeType32 windowSize);
 
@@ -1133,7 +1133,7 @@ class BlockManager
     void offloadBlock(BlockPtr const& block, SizeType32 windowSize,
         executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
 
-    [[nodiscard]] std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> storeBlocks(
+    [[nodiscard]] std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> storeBlocks(
         std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
         SizeType32 windowSize, bool pinBlocks = false)
     {
@@ -1584,7 +1584,7 @@ class BaseKVCacheManager
     virtual void storeNewBlock(LlmRequest const& llmRequest) = 0;
 
     /// \brief Store blocks for reuse for a given request id
-    [[nodiscard]] virtual std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
+    [[nodiscard]] virtual std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
         LlmRequest::RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false)
         = 0;
 
@@ -1678,7 +1678,7 @@ class BaseKVCacheManager
         BlockKey const& blockKey, SizeType32 windowSize)
         = 0;
 
-    virtual void unpinBlocksById(KVCacheBlock::IdType blockId) = 0;
+    virtual void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds) = 0;
 };
 
 class KVCacheManager : public BaseKVCacheManager
@@ -1939,7 +1939,7 @@ class KVCacheManager : public BaseKVCacheManager
     //! \brief Store newest blocks for reuse
     void storeNewBlock(LlmRequest const& llmRequest) override;
 
-    [[nodiscard]] std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
+    [[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
         LlmRequest::RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false) override;
 
     [[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);
@@ -1960,7 +1960,7 @@ class KVCacheManager : public BaseKVCacheManager
 
     void pinBlocks(LlmRequest::RequestIdType requestId) override;
 
-    void unpinBlocksById(KVCacheBlock::IdType blockId) override;
+    void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds) override;
 
     std::optional<KVCacheBlock::IdType> getLastBlockId(LlmRequest::RequestIdType requestId) const override;
 
diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -1667,6 +1667,12 @@ class GenericLlmRequest
             [](auto reason) { return reason == executor::FinishReason::kLENGTH; });
     }
 
+    [[nodiscard]] bool isFinishedDueToCancellation() const noexcept
+    {
+        return std::all_of(mFinishReasons.begin(), mFinishReasons.end(),
+            [](auto reason) { return reason == executor::FinishReason::kCANCELLED; });
+    }
+
     [[nodiscard]] bool isTimedOut() const
     {
         if (!mAllottedTimeMs.has_value())
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -1556,7 +1556,7 @@ void WindowBlockManager::allocateBlock(GenerationRequest& sequence, bool shareAm
     }
 }
 
-std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> WindowBlockManager::storeBlocks(
+std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> WindowBlockManager::storeBlocks(
     std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds, bool pinBlocks)
 {
     SizeType32 numBlocksStoredForReuse = 0;
@@ -1569,7 +1569,7 @@ std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> WindowBlockManager::s
 
     auto numBlocks = blockKeys.size();
     std::vector<BlockPtr> storedBlocks;
-    std::optional<KVCacheBlock::IdType> lastStoredId = std::nullopt;
+    std::vector<KVCacheBlock::IdType> pinnedBlockIds;
     for (std::size_t blockCnt = 0; blockCnt < numBlocks; ++blockCnt)
     {
         auto const bid = blockIds[blockCnt];
@@ -1620,14 +1620,14 @@ std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> WindowBlockManager::s
         if (pinBlocks)
         {
             searchRoot->incRefCount();
+            pinnedBlockIds.push_back(searchRoot->getBlockId());
         }
-        lastStoredId = searchRoot->getBlockId();
     }
     if (mEventManager)
     {
         mEventManager->enqueueStoredEvent(storedBlocks, mWindowSize);
     }
-    return {numBlocksStoredForReuse, lastStoredId};
+    return {numBlocksStoredForReuse, pinnedBlockIds};
 }
 
 void BlockManager::replaceSharedBlock(GenerationRequest& sequence, SizeType32 windowSize, SizeType32 blockIdx)
@@ -1715,15 +1715,15 @@ std::deque<tle::KVCacheEvent> BlockManager::getLatestEvents(std::optional<std::c
     return mEventManager ? mEventManager->getEvents(timeout) : std::deque<tle::KVCacheEvent>{};
 }
 
-std::optional<KVCacheBlock::IdType> BlockManager::storeBlocksForReuse(
+std::vector<KVCacheBlock::IdType> BlockManager::storeBlocksForReuse(
     GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks)
 {
-    std::optional<KVCacheBlock::IdType> lastStoredId = std::nullopt;
+    std::vector<KVCacheBlock::IdType> pinnedBlockIds;
     for (auto& [_, manager] : mWindowBlockManagers)
     {
-        lastStoredId = manager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);
+        pinnedBlockIds = manager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);
     }
-    return lastStoredId;
+    return pinnedBlockIds;
 }
 
 std::optional<KVCacheBlock::IdType> BlockManager::releaseBlocks(
@@ -1767,15 +1767,15 @@ void BlockManager::pinBlocks(GenerationRequest& sequence)
     }
 }
 
-void BlockManager::unpinBlocksById(KVCacheBlock::IdType blockId)
+void BlockManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds)
 {
     // Use the first window size
     if (mWindowBlockManagers.empty())
     {
         return;
     }
     auto& firstManager = mWindowBlockManagers.begin()->second;
-    firstManager.unpinBlocksById(blockId);
+    firstManager.unpinBlocksById(blockIds);
 }
 
 void WindowBlockManager::pinBlocks(GenerationRequest& sequence)
@@ -1788,21 +1788,28 @@ void WindowBlockManager::pinBlocks(GenerationRequest& sequence)
     }
 }
 
-void WindowBlockManager::unpinBlocksById(KVCacheBlock::IdType blockId)
+void WindowBlockManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds)
 {
-    if (blockId < 0 || static_cast<size_t>(blockId) >= mAllBlocksById.size())
+    if (blockIds.empty())
     {
         return;
     }
-    auto block = mAllBlocksById[blockId];
-    while (block && block->getBlockId() != KVCacheBlock::kCachedBlocksRootId)
+
+    for (auto const& blockId : blockIds)
     {
-        block->decRefCount();
-        if (!block->hasRefs())
+        if (blockId < 0 || static_cast<size_t>(blockId) >= mAllBlocksById.size())
         {
-            mEvictionPolicy->releaseBlock(block);
+            continue;
+        }
+        auto block = mAllBlocksById[blockId];
+        if (block && block->getBlockId() != KVCacheBlock::kCachedBlocksRootId)
+        {
+            block->decRefCount();
+            if (!block->hasRefs())
+            {
+                mEvictionPolicy->releaseBlock(block);
+            }
         }
-        block = std::move(block->getPrevBlock());
     }
 }
 
@@ -1870,7 +1877,7 @@ void WindowBlockManager::storeNewBlock(GenerationRequest& sequence, OptionalRef<
     (void) storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx]);
 }
 
-std::optional<KVCacheBlock::IdType> WindowBlockManager::storeBlocksForReuse(
+std::vector<KVCacheBlock::IdType> WindowBlockManager::storeBlocksForReuse(
     GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks)
 {
     auto constexpr beamIdx = 0;
@@ -1883,7 +1890,10 @@ std::optional<KVCacheBlock::IdType> WindowBlockManager::storeBlocksForReuse(
     auto const usableSize = static_cast<runtime::SizeType32>(uniqueTokens.size()) - 1;
     auto blockedUniqueTokens = chopVectorIntoBlocks<UniqueToken>(uniqueTokens, usableSize, mTokensPerBlock, true);
     auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest);
-    return storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx], pinBlocks).second;
+
+    auto [numStored, pinnedBlockIds] = storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx], pinBlocks);
+
+    return pinnedBlockIds;
 }
 
 std::optional<KVCacheBlock::IdType> WindowBlockManager::releaseBlocks(
@@ -1922,7 +1932,7 @@ std::optional<KVCacheBlock::IdType> WindowBlockManager::releaseBlocks(
             std::transform(allocatedBlocks.begin(), allocatedBlocks.end(), cacheBlockIds.begin(),
                 [](BlockPtr const& block) { return block->getBlockId(); });
 
-            auto [numBlocksStoredForReuse, lastStoredId] = storeBlocks(std::move(blockKeys), cacheBlockIds);
+            auto [numBlocksStoredForReuse, pinnedBlockIds] = storeBlocks(std::move(blockKeys), cacheBlockIds);
             TLLM_LOG_DEBUG("%s::releaseBlocks Request %lu, %d blocks stored for reuse", mLogPrefix.c_str(),
                 sequence.getRequestId(), numBlocksStoredForReuse);
         }
@@ -2499,15 +2509,14 @@ std::optional<KVCacheBlock::IdType> KVCacheManager::removeSequence(
     return lastStoredId;
 }
 
-std::optional<KVCacheBlock::IdType> KVCacheManager::storeBlocksForReuse(
+std::vector<KVCacheBlock::IdType> KVCacheManager::storeBlocksForReuse(
     RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks)
 {
     TLLM_LOG_TRACE("[%s]::%s start", isCrossKv() ? "CROSS" : "SELF", __PRETTY_FUNCTION__);
     auto& sequence = getSequence(requestId);
-    std::optional<KVCacheBlock::IdType> lastStoredId
-        = mBlockManager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);
+    auto pinnedBlockIds = mBlockManager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);
     TLLM_LOG_TRACE("[%s]::%s stop", isCrossKv() ? "CROSS" : "SELF", __PRETTY_FUNCTION__);
-    return lastStoredId;
+    return pinnedBlockIds;
 }
 
 void KVCacheManager::schedulingRemoveSequence(RequestIdType requestId)
@@ -2522,9 +2531,9 @@ void KVCacheManager::pinBlocks(RequestIdType requestId)
     mBlockManager.pinBlocks(sequence);
 }
 
-void KVCacheManager::unpinBlocksById(KVCacheBlock::IdType blockId)
+void KVCacheManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds)
 {
-    mBlockManager.unpinBlocksById(blockId);
+    mBlockManager.unpinBlocksById(blockIds);
 }
 
 SizeType32 KVCacheManager::copyBlockOffsets(ITensor& output, SizeType32 outputSlotOffset, RequestIdType requestId) const
diff --git a/cpp/tensorrt_llm/executor/executorImpl.cpp b/cpp/tensorrt_llm/executor/executorImpl.cpp
@@ -2179,11 +2179,11 @@ void Executor::Impl::terminateContextFinishedRequests(InTransList& inTransmissio
         auto req = item.request;
         if (req->isDisaggContextCompleteState())
         {
-            // If lastBlockId was tracked, unpin it. Otherwise, just terminate.
+            // If pinnedBlockIds were tracked, unpin them. Otherwise, just terminate.
             auto kvMgr = mModel->getKVCacheManager();
-            if (kvMgr && item.lastBlockId.has_value())
+            if (kvMgr && !item.pinnedBlockIds.empty())
             {
-                kvMgr->unpinBlocksById(item.lastBlockId.value());
+                kvMgr->unpinBlocksById(item.pinnedBlockIds);
             }
             else
             {
@@ -2234,14 +2234,14 @@ Executor::Impl::RequestList Executor::Impl::populateNewResponses(
             // move the in transmission requests to another tracker
             if (llmReq->isDisaggContextTransmissionState())
             {
-                std::optional<SizeType32> lastBlockId{};
+                std::vector<SizeType32> pinnedBlockIds{};
                 auto kvMgr = mModel->getKVCacheManager();
                 if (kvMgr && kvMgr->isEnableBlockReuse() && !kvMgr->getBlockManager().isVariableWindow())
                 {
-                    lastBlockId = kvMgr->storeBlocksForReuse(llmReq->mRequestId, llmReq, /*pinBlocks=*/true);
+                    pinnedBlockIds = kvMgr->storeBlocksForReuse(llmReq->mRequestId, llmReq, /*pinBlocks=*/true);
                     mModel->terminateRequest(llmReq);
                 }
-                inTransmissionRequests.push_back(InTransmissionItem{*it, lastBlockId});
+                inTransmissionRequests.push_back(InTransmissionItem{*it, pinnedBlockIds});
             }
             finishedRequests.push_back(*it);
             it = activeRequests.erase(it);
diff --git a/cpp/tensorrt_llm/executor/executorImpl.h b/cpp/tensorrt_llm/executor/executorImpl.h
@@ -80,12 +80,12 @@ class Executor::Impl
     using RequestList = std::list<LlmRequestPtr>;
 
     // When block reuse is enabled for context worker for disaggregated serving,
-    // we need to store the last block id so that we can unpin the block when
+    // we need to store the pinned block ids so that we can unpin them when
     // the request is finished.
     struct InTransmissionItem
     {
         LlmRequestPtr request;
-        std::optional<SizeType32> lastBlockId;
+        std::vector<SizeType32> pinnedBlockIds;
     };
 
     using InTransList = std::list<InTransmissionItem>;
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
@@ -161,6 +161,7 @@ void initBindings(nb::module_& m)
         .def("set_finished_reason", &GenLlmReq::setFinishedReason, nb::arg("finish_reason"), nb::arg("beam"))
         .def_prop_ro("is_finished", &GenLlmReq::isFinished)
         .def_prop_ro("is_finished_due_to_length", &GenLlmReq::isFinishedDueToLength)
+        .def_prop_ro("is_finished_due_to_cancellation", &GenLlmReq::isFinishedDueToCancellation)
         .def_prop_rw(
             "context_current_position", &GenLlmReq::getContextCurrentPosition, &GenLlmReq::setContextCurrentPosition)
         .def_prop_ro("prepopulated_prompt_len", &GenLlmReq::getPrepopulatedPromptLen)
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
@@ -123,7 +123,7 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager
         NB_OVERRIDE_PURE(removeSequence, requestId, llmRequest, pinOnRelease);
     }
 
-    std::optional<tbk::KVCacheBlock::IdType> storeBlocksForReuse(tb::LlmRequest::RequestIdType requestId,
+    std::vector<tbk::KVCacheBlock::IdType> storeBlocksForReuse(tb::LlmRequest::RequestIdType requestId,
         tensorrt_llm::common::OptionalRef<tb::LlmRequest const> llmRequest, bool pinBlocks) override
     {
         NB_OVERRIDE_PURE(storeBlocksForReuse, requestId, llmRequest, pinBlocks);
@@ -363,7 +363,22 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(nb::module_& m)
             nb::call_guard<nb::gil_scoped_release>())
         .def("add_token", &BaseKVCacheManager::addToken, nb::call_guard<nb::gil_scoped_release>())
         .def("add_sequence", &BaseKVCacheManager::addSequence, nb::call_guard<nb::gil_scoped_release>())
-        .def("remove_sequence", &BaseKVCacheManager::removeSequence, nb::call_guard<nb::gil_scoped_release>())
+        .def(
+            "remove_sequence",
+            [](tbk::BaseKVCacheManager& self, tb::LlmRequest::RequestIdType requestId, tb::LlmRequest const* llmRequest,
+                bool pinOnRelease)
+            {
+                if (llmRequest != nullptr)
+                {
+                    return self.removeSequence(requestId, *llmRequest, pinOnRelease);
+                }
+                else
+                {
+                    return self.removeSequence(requestId, std::nullopt, pinOnRelease);
+                }
+            },
+            nb::arg("request_id"), nb::arg("llm_request") = nullptr, nb::arg("pin_on_release") = false,
+            nb::call_guard<nb::gil_scoped_release>())
         .def("pin_blocks", &BaseKVCacheManager::pinBlocks, nb::call_guard<nb::gil_scoped_release>())
         .def("scheduling_remove_sequence", &BaseKVCacheManager::schedulingRemoveSequence,
             nb::call_guard<nb::gil_scoped_release>())
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
@@ -165,6 +165,7 @@ void initBindings(pybind11::module_& m)
         .def("set_finished_reason", &GenLlmReq::setFinishedReason, py::arg("finish_reason"), py::arg("beam"))
         .def_property_readonly("is_finished", &GenLlmReq::isFinished)
         .def_property_readonly("is_finished_due_to_length", &GenLlmReq::isFinishedDueToLength)
+        .def_property_readonly("is_finished_due_to_cancellation", &GenLlmReq::isFinishedDueToCancellation)
         .def_property(
             "context_current_position", &GenLlmReq::getContextCurrentPosition, &GenLlmReq::setContextCurrentPosition)
         .def_property_readonly("prepopulated_prompt_len", &GenLlmReq::getPrepopulatedPromptLen)
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp
diff --git a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py

Original file line number	Diff line number	Diff line change
`@@ -1667,6 +1667,12 @@ class GenericLlmRequest`
`1667`	`1667`	`[](auto reason) { return reason == executor::FinishReason::kLENGTH; });`
`1668`	`1668`	`}`
`1669`	`1669`
	`1670`	`+ [[nodiscard]] bool isFinishedDueToCancellation() const noexcept`
	`1671`	`+ {`
	`1672`	`+ return std::all_of(mFinishReasons.begin(), mFinishReasons.end(),`
	`1673`	`+ [](auto reason) { return reason == executor::FinishReason::kCANCELLED; });`
	`1674`	`+ }`
	`1675`	`+`
`1670`	`1676`	`[[nodiscard]] bool isTimedOut() const`
`1671`	`1677`	`{`
`1672`	`1678`	`if (!mAllottedTimeMs.has_value())`
Original file line number	Diff line number	Diff line change
`@@ -1556,7 +1556,7 @@ void WindowBlockManager::allocateBlock(GenerationRequest& sequence, bool shareAm`
`1556`	`1556`	`}`
`1557`	`1557`	`}`
`1558`	`1558`
`1559`		`-std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> WindowBlockManager::storeBlocks(`
	`1559`	`+std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> WindowBlockManager::storeBlocks(`
`1560`	`1560`	`std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds, bool pinBlocks)`
`1561`	`1561`	`{`
`1562`	`1562`	`SizeType32 numBlocksStoredForReuse = 0;`
`@@ -1569,7 +1569,7 @@ std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> WindowBlockManager::s`
`1569`	`1569`
`1570`	`1570`	`auto numBlocks = blockKeys.size();`
`1571`	`1571`	`std::vector<BlockPtr> storedBlocks;`
`1572`		`- std::optional<KVCacheBlock::IdType> lastStoredId = std::nullopt;`
	`1572`	`+ std::vector<KVCacheBlock::IdType> pinnedBlockIds;`
`1573`	`1573`	`for (std::size_t blockCnt = 0; blockCnt < numBlocks; ++blockCnt)`
`1574`	`1574`	`{`
`1575`	`1575`	`auto const bid = blockIds[blockCnt];`
`@@ -1620,14 +1620,14 @@ std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> WindowBlockManager::s`
`1620`	`1620`	`if (pinBlocks)`
`1621`	`1621`	`{`
`1622`	`1622`	`searchRoot->incRefCount();`
	`1623`	`+ pinnedBlockIds.push_back(searchRoot->getBlockId());`
`1623`	`1624`	`}`
`1624`		`- lastStoredId = searchRoot->getBlockId();`
`1625`	`1625`	`}`
`1626`	`1626`	`if (mEventManager)`
`1627`	`1627`	`{`
`1628`	`1628`	`mEventManager->enqueueStoredEvent(storedBlocks, mWindowSize);`
`1629`	`1629`	`}`
`1630`		`- return {numBlocksStoredForReuse, lastStoredId};`
	`1630`	`+ return {numBlocksStoredForReuse, pinnedBlockIds};`
`1631`	`1631`	`}`
`1632`	`1632`
`1633`	`1633`	`void BlockManager::replaceSharedBlock(GenerationRequest& sequence, SizeType32 windowSize, SizeType32 blockIdx)`
`@@ -1715,15 +1715,15 @@ std::deque<tle::KVCacheEvent> BlockManager::getLatestEvents(std::optional<std::c`
`1715`	`1715`	`return mEventManager ? mEventManager->getEvents(timeout) : std::deque<tle::KVCacheEvent>{};`
`1716`	`1716`	`}`
`1717`	`1717`
`1718`		`-std::optional<KVCacheBlock::IdType> BlockManager::storeBlocksForReuse(`
	`1718`	`+std::vector<KVCacheBlock::IdType> BlockManager::storeBlocksForReuse(`
`1719`	`1719`	`GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks)`
`1720`	`1720`	`{`
`1721`		`- std::optional<KVCacheBlock::IdType> lastStoredId = std::nullopt;`
	`1721`	`+ std::vector<KVCacheBlock::IdType> pinnedBlockIds;`
`1722`	`1722`	`for (auto& [_, manager] : mWindowBlockManagers)`
`1723`	`1723`	`{`
`1724`		`- lastStoredId = manager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);`
	`1724`	`+ pinnedBlockIds = manager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);`
`1725`	`1725`	`}`
`1726`		`- return lastStoredId;`
	`1726`	`+ return pinnedBlockIds;`
`1727`	`1727`	`}`
`1728`	`1728`
`1729`	`1729`	`std::optional<KVCacheBlock::IdType> BlockManager::releaseBlocks(`
`@@ -1767,15 +1767,15 @@ void BlockManager::pinBlocks(GenerationRequest& sequence)`
`1767`	`1767`	`}`
`1768`	`1768`	`}`
`1769`	`1769`
`1770`		`-void BlockManager::unpinBlocksById(KVCacheBlock::IdType blockId)`
	`1770`	`+void BlockManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds)`
`1771`	`1771`	`{`
`1772`	`1772`	`// Use the first window size`
`1773`	`1773`	`if (mWindowBlockManagers.empty())`
`1774`	`1774`	`{`
`1775`	`1775`	`return;`
`1776`	`1776`	`}`
`1777`	`1777`	`auto& firstManager = mWindowBlockManagers.begin()->second;`
`1778`		`- firstManager.unpinBlocksById(blockId);`
	`1778`	`+ firstManager.unpinBlocksById(blockIds);`
`1779`	`1779`	`}`
`1780`	`1780`
`1781`	`1781`	`void WindowBlockManager::pinBlocks(GenerationRequest& sequence)`
`@@ -1788,21 +1788,28 @@ void WindowBlockManager::pinBlocks(GenerationRequest& sequence)`
`1788`	`1788`	`}`
`1789`	`1789`	`}`
`1790`	`1790`
`1791`		`-void WindowBlockManager::unpinBlocksById(KVCacheBlock::IdType blockId)`
	`1791`	`+void WindowBlockManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds)`
`1792`	`1792`	`{`
`1793`		`- if (blockId < 0 \|\| static_cast<size_t>(blockId) >= mAllBlocksById.size())`
	`1793`	`+ if (blockIds.empty())`
`1794`	`1794`	`{`
`1795`	`1795`	`return;`
`1796`	`1796`	`}`
`1797`		`- auto block = mAllBlocksById[blockId];`
`1798`		`- while (block && block->getBlockId() != KVCacheBlock::kCachedBlocksRootId)`
	`1797`	`+`
	`1798`	`+ for (auto const& blockId : blockIds)`
`1799`	`1799`	`{`
`1800`		`- block->decRefCount();`
`1801`		`- if (!block->hasRefs())`
	`1800`	`+ if (blockId < 0 \|\| static_cast<size_t>(blockId) >= mAllBlocksById.size())`
`1802`	`1801`	`{`
`1803`		`- mEvictionPolicy->releaseBlock(block);`
	`1802`	`+ continue;`
	`1803`	`+ }`
	`1804`	`+ auto block = mAllBlocksById[blockId];`
	`1805`	`+ if (block && block->getBlockId() != KVCacheBlock::kCachedBlocksRootId)`
	`1806`	`+ {`
	`1807`	`+ block->decRefCount();`
	`1808`	`+ if (!block->hasRefs())`
	`1809`	`+ {`
	`1810`	`+ mEvictionPolicy->releaseBlock(block);`
	`1811`	`+ }`
`1804`	`1812`	`}`
`1805`		`- block = std::move(block->getPrevBlock());`
`1806`	`1813`	`}`
`1807`	`1814`	`}`
`1808`	`1815`
`@@ -1870,7 +1877,7 @@ void WindowBlockManager::storeNewBlock(GenerationRequest& sequence, OptionalRef<`
`1870`	`1877`	`(void) storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx]);`
`1871`	`1878`	`}`
`1872`	`1879`
`1873`		`-std::optional<KVCacheBlock::IdType> WindowBlockManager::storeBlocksForReuse(`
	`1880`	`+std::vector<KVCacheBlock::IdType> WindowBlockManager::storeBlocksForReuse(`
`1874`	`1881`	`GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks)`
`1875`	`1882`	`{`
`1876`	`1883`	`auto constexpr beamIdx = 0;`
`@@ -1883,7 +1890,10 @@ std::optional<KVCacheBlock::IdType> WindowBlockManager::storeBlocksForReuse(`
`1883`	`1890`	`auto const usableSize = static_cast<runtime::SizeType32>(uniqueTokens.size()) - 1;`
`1884`	`1891`	`auto blockedUniqueTokens = chopVectorIntoBlocks<UniqueToken>(uniqueTokens, usableSize, mTokensPerBlock, true);`
`1885`	`1892`	`auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest);`
`1886`		`- return storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx], pinBlocks).second;`
	`1893`	`+`
	`1894`	`+ auto [numStored, pinnedBlockIds] = storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx], pinBlocks);`
	`1895`	`+`
	`1896`	`+ return pinnedBlockIds;`
`1887`	`1897`	`}`
`1888`	`1898`
`1889`	`1899`	`std::optional<KVCacheBlock::IdType> WindowBlockManager::releaseBlocks(`
`@@ -1922,7 +1932,7 @@ std::optional<KVCacheBlock::IdType> WindowBlockManager::releaseBlocks(`
`1922`	`1932`	`std::transform(allocatedBlocks.begin(), allocatedBlocks.end(), cacheBlockIds.begin(),`
`1923`	`1933`	`[](BlockPtr const& block) { return block->getBlockId(); });`
`1924`	`1934`
`1925`		`- auto [numBlocksStoredForReuse, lastStoredId] = storeBlocks(std::move(blockKeys), cacheBlockIds);`
	`1935`	`+ auto [numBlocksStoredForReuse, pinnedBlockIds] = storeBlocks(std::move(blockKeys), cacheBlockIds);`
`1926`	`1936`	`TLLM_LOG_DEBUG("%s::releaseBlocks Request %lu, %d blocks stored for reuse", mLogPrefix.c_str(),`
`1927`	`1937`	`sequence.getRequestId(), numBlocksStoredForReuse);`
`1928`	`1938`	`}`
`@@ -2499,15 +2509,14 @@ std::optional<KVCacheBlock::IdType> KVCacheManager::removeSequence(`
`2499`	`2509`	`return lastStoredId;`
`2500`	`2510`	`}`
`2501`	`2511`
`2502`		`-std::optional<KVCacheBlock::IdType> KVCacheManager::storeBlocksForReuse(`
	`2512`	`+std::vector<KVCacheBlock::IdType> KVCacheManager::storeBlocksForReuse(`
`2503`	`2513`	`RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks)`
`2504`	`2514`	`{`
`2505`	`2515`	`TLLM_LOG_TRACE("[%s]::%s start", isCrossKv() ? "CROSS" : "SELF", __PRETTY_FUNCTION__);`
`2506`	`2516`	`auto& sequence = getSequence(requestId);`
`2507`		`- std::optional<KVCacheBlock::IdType> lastStoredId`
`2508`		`- = mBlockManager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);`
	`2517`	`+ auto pinnedBlockIds = mBlockManager.storeBlocksForReuse(sequence, llmRequest, pinBlocks);`
`2509`	`2518`	`TLLM_LOG_TRACE("[%s]::%s stop", isCrossKv() ? "CROSS" : "SELF", __PRETTY_FUNCTION__);`
`2510`		`- return lastStoredId;`
	`2519`	`+ return pinnedBlockIds;`
`2511`	`2520`	`}`
`2512`	`2521`
`2513`	`2522`	`void KVCacheManager::schedulingRemoveSequence(RequestIdType requestId)`
`@@ -2522,9 +2531,9 @@ void KVCacheManager::pinBlocks(RequestIdType requestId)`
`2522`	`2531`	`mBlockManager.pinBlocks(sequence);`
`2523`	`2532`	`}`
`2524`	`2533`
`2525`		`-void KVCacheManager::unpinBlocksById(KVCacheBlock::IdType blockId)`
	`2534`	`+void KVCacheManager::unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds)`
`2526`	`2535`	`{`
`2527`		`- mBlockManager.unpinBlocksById(blockId);`
	`2536`	`+ mBlockManager.unpinBlocksById(blockIds);`
`2528`	`2537`	`}`
`2529`	`2538`
`2530`	`2539`	`SizeType32 KVCacheManager::copyBlockOffsets(ITensor& output, SizeType32 outputSlotOffset, RequestIdType requestId) const`
Original file line number	Diff line number	Diff line change
`@@ -2179,11 +2179,11 @@ void Executor::Impl::terminateContextFinishedRequests(InTransList& inTransmissio`
`2179`	`2179`	`auto req = item.request;`
`2180`	`2180`	`if (req->isDisaggContextCompleteState())`
`2181`	`2181`	`{`
`2182`		`- // If lastBlockId was tracked, unpin it. Otherwise, just terminate.`
	`2182`	`+ // If pinnedBlockIds were tracked, unpin them. Otherwise, just terminate.`
`2183`	`2183`	`auto kvMgr = mModel->getKVCacheManager();`
`2184`		`- if (kvMgr && item.lastBlockId.has_value())`
	`2184`	`+ if (kvMgr && !item.pinnedBlockIds.empty())`
`2185`	`2185`	`{`
`2186`		`- kvMgr->unpinBlocksById(item.lastBlockId.value());`
	`2186`	`+ kvMgr->unpinBlocksById(item.pinnedBlockIds);`
`2187`	`2187`	`}`
`2188`	`2188`	`else`
`2189`	`2189`	`{`
`@@ -2234,14 +2234,14 @@ Executor::Impl::RequestList Executor::Impl::populateNewResponses(`
`2234`	`2234`	`// move the in transmission requests to another tracker`
`2235`	`2235`	`if (llmReq->isDisaggContextTransmissionState())`
`2236`	`2236`	`{`
`2237`		`- std::optional<SizeType32> lastBlockId{};`
	`2237`	`+ std::vector<SizeType32> pinnedBlockIds{};`
`2238`	`2238`	`auto kvMgr = mModel->getKVCacheManager();`
`2239`	`2239`	`if (kvMgr && kvMgr->isEnableBlockReuse() && !kvMgr->getBlockManager().isVariableWindow())`
`2240`	`2240`	`{`
`2241`		`- lastBlockId = kvMgr->storeBlocksForReuse(llmReq->mRequestId, llmReq, /*pinBlocks=*/true);`
	`2241`	`+ pinnedBlockIds = kvMgr->storeBlocksForReuse(llmReq->mRequestId, llmReq, /*pinBlocks=*/true);`
`2242`	`2242`	`mModel->terminateRequest(llmReq);`
`2243`	`2243`	`}`
`2244`		`- inTransmissionRequests.push_back(InTransmissionItem{*it, lastBlockId});`
	`2244`	`+ inTransmissionRequests.push_back(InTransmissionItem{*it, pinnedBlockIds});`
`2245`	`2245`	`}`
`2246`	`2246`	`finishedRequests.push_back(*it);`
`2247`	`2247`	`it = activeRequests.erase(it);`