Skip to content

Commit 655832d

Browse files
committed
[https://nvbugs/5689235][fix] Fix cancellation+chunked prefill+disagg
Signed-off-by: Iman Tabrizian <[email protected]>
1 parent 9ba1426 commit 655832d

File tree

8 files changed

+73
-27
lines changed

8 files changed

+73
-27
lines changed

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -631,7 +631,7 @@ class WindowBlockManager
631631

632632
void replaceSharedBlock(GenerationRequest& sequence, SizeType32 blockIdx);
633633

634-
[[nodiscard]] std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
634+
[[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
635635
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false);
636636

637637
void storeNewBlock(GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
@@ -836,8 +836,8 @@ class WindowBlockManager
836836
//! \param blockKeys Key of each block.
837837
//! \param blockIds Id of each block.
838838
//! \param pinBlocks If true, increment ref count for blocks while storing (pin on store).
839-
//! \return Pair of (num blocks stored for reuse, id of the last block stored if any).
840-
[[nodiscard]] std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> storeBlocks(
839+
//! \return Pair of (num blocks stored for reuse, vector of pinned block IDs).
840+
[[nodiscard]] std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> storeBlocks(
841841
std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
842842
bool pinBlocks = false);
843843

@@ -869,8 +869,8 @@ class WindowBlockManager
869869

870870
[[nodiscard]] std::shared_ptr<KVCacheBlock> findBlocksInReuseTreeByBlockKey(BlockKey const& blockKey);
871871

872-
//! \brief Unpin blocks by starting from a block id and walking prev pointers.
873-
void unpinBlocksById(KVCacheBlock::IdType blockId);
872+
//! \brief Unpin blocks by block ids directly
873+
void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds);
874874

875875
void initializeSequenceStorageValidity(LlmRequest::RequestIdType requestId)
876876
{
@@ -1086,7 +1086,7 @@ class BlockManager
10861086
std::optional<KVCacheBlock::IdType> releaseBlocks(
10871087
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinBlocks = false);
10881088

1089-
[[nodiscard]] std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
1089+
[[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
10901090
GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinBlocks = false);
10911091

10921092
void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId);
@@ -1095,7 +1095,7 @@ class BlockManager
10951095
/// @param sequence The generation request whose blocks should be pinned.
10961096
void pinBlocks(GenerationRequest& sequence);
10971097

1098-
void unpinBlocksById(KVCacheBlock::IdType blockId);
1098+
void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds);
10991099

11001100
void releaseLastBlock(GenerationRequest& sequence, SizeType32 windowSize);
11011101

@@ -1116,7 +1116,7 @@ class BlockManager
11161116
void offloadBlock(BlockPtr const& block, SizeType32 windowSize,
11171117
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
11181118

1119-
[[nodiscard]] std::pair<SizeType32, std::optional<KVCacheBlock::IdType>> storeBlocks(
1119+
[[nodiscard]] std::pair<SizeType32, std::vector<KVCacheBlock::IdType>> storeBlocks(
11201120
std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
11211121
SizeType32 windowSize, bool pinBlocks = false)
11221122
{
@@ -1567,7 +1567,7 @@ class BaseKVCacheManager
15671567
virtual void storeNewBlock(LlmRequest const& llmRequest) = 0;
15681568

15691569
/// \brief Store blocks for reuse for a given request id
1570-
[[nodiscard]] virtual std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
1570+
[[nodiscard]] virtual std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
15711571
LlmRequest::RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false)
15721572
= 0;
15731573

@@ -1661,7 +1661,7 @@ class BaseKVCacheManager
16611661
BlockKey const& blockKey, SizeType32 windowSize)
16621662
= 0;
16631663

1664-
virtual void unpinBlocksById(KVCacheBlock::IdType blockId) = 0;
1664+
virtual void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds) = 0;
16651665
};
16661666

16671667
class KVCacheManager : public BaseKVCacheManager
@@ -1922,7 +1922,7 @@ class KVCacheManager : public BaseKVCacheManager
19221922
//! \brief Store newest blocks for reuse
19231923
void storeNewBlock(LlmRequest const& llmRequest) override;
19241924

1925-
[[nodiscard]] std::optional<KVCacheBlock::IdType> storeBlocksForReuse(
1925+
[[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
19261926
LlmRequest::RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks = false) override;
19271927

19281928
[[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);
@@ -1943,7 +1943,7 @@ class KVCacheManager : public BaseKVCacheManager
19431943

19441944
void pinBlocks(LlmRequest::RequestIdType requestId) override;
19451945

1946-
void unpinBlocksById(KVCacheBlock::IdType blockId) override;
1946+
void unpinBlocksById(std::vector<KVCacheBlock::IdType> const& blockIds) override;
19471947

19481948
std::optional<KVCacheBlock::IdType> getLastBlockId(LlmRequest::RequestIdType requestId) const override;
19491949

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1667,6 +1667,12 @@ class GenericLlmRequest
16671667
[](auto reason) { return reason == executor::FinishReason::kLENGTH; });
16681668
}
16691669

1670+
[[nodiscard]] bool isFinishedDueToCancellation() const noexcept
1671+
{
1672+
return std::all_of(mFinishReasons.begin(), mFinishReasons.end(),
1673+
[](auto reason) { return reason == executor::FinishReason::kCANCELLED; });
1674+
}
1675+
16701676
[[nodiscard]] bool isTimedOut() const
16711677
{
16721678
if (!mAllottedTimeMs.has_value())

cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ void initBindings(nb::module_& m)
161161
.def("set_finished_reason", &GenLlmReq::setFinishedReason, nb::arg("finish_reason"), nb::arg("beam"))
162162
.def_prop_ro("is_finished", &GenLlmReq::isFinished)
163163
.def_prop_ro("is_finished_due_to_length", &GenLlmReq::isFinishedDueToLength)
164+
.def_prop_ro("is_finished_due_to_cancellation", &GenLlmReq::isFinishedDueToCancellation)
164165
.def_prop_rw(
165166
"context_current_position", &GenLlmReq::getContextCurrentPosition, &GenLlmReq::setContextCurrentPosition)
166167
.def_prop_ro("prepopulated_prompt_len", &GenLlmReq::getPrepopulatedPromptLen)

cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ void initBindings(pybind11::module_& m)
165165
.def("set_finished_reason", &GenLlmReq::setFinishedReason, py::arg("finish_reason"), py::arg("beam"))
166166
.def_property_readonly("is_finished", &GenLlmReq::isFinished)
167167
.def_property_readonly("is_finished_due_to_length", &GenLlmReq::isFinishedDueToLength)
168+
.def_property_readonly("is_finished_due_to_cancellation", &GenLlmReq::isFinishedDueToCancellation)
168169
.def_property(
169170
"context_current_position", &GenLlmReq::getContextCurrentPosition, &GenLlmReq::setContextCurrentPosition)
170171
.def_property_readonly("prepopulated_prompt_len", &GenLlmReq::getPrepopulatedPromptLen)

cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,10 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager
111111
requestId, llmRequest, pinOnRelease);
112112
}
113113

114-
std::optional<tbk::KVCacheBlock::IdType> storeBlocksForReuse(tb::LlmRequest::RequestIdType requestId,
114+
std::vector<tbk::KVCacheBlock::IdType> storeBlocksForReuse(tb::LlmRequest::RequestIdType requestId,
115115
tensorrt_llm::common::OptionalRef<tb::LlmRequest const> llmRequest, bool pinBlocks) override
116116
{
117-
PYBIND11_OVERLOAD_PURE(std::optional<tbk::KVCacheBlock::IdType>, tbk::BaseKVCacheManager, storeBlocksForReuse,
117+
PYBIND11_OVERLOAD_PURE(std::vector<tbk::KVCacheBlock::IdType>, tbk::BaseKVCacheManager, storeBlocksForReuse,
118118
requestId, llmRequest, pinBlocks);
119119
}
120120

@@ -367,7 +367,22 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m)
367367
py::call_guard<py::gil_scoped_release>())
368368
.def("add_token", &BaseKVCacheManager::addToken, py::call_guard<py::gil_scoped_release>())
369369
.def("add_sequence", &BaseKVCacheManager::addSequence, py::call_guard<py::gil_scoped_release>())
370-
.def("remove_sequence", &BaseKVCacheManager::removeSequence, py::call_guard<py::gil_scoped_release>())
370+
.def(
371+
"remove_sequence",
372+
[](tbk::BaseKVCacheManager& self, tb::LlmRequest::RequestIdType requestId, tb::LlmRequest const* llmRequest,
373+
bool pinOnRelease)
374+
{
375+
if (llmRequest != nullptr)
376+
{
377+
return self.removeSequence(requestId, *llmRequest, pinOnRelease);
378+
}
379+
else
380+
{
381+
return self.removeSequence(requestId, std::nullopt, pinOnRelease);
382+
}
383+
},
384+
py::arg("request_id"), py::arg("llm_request") = nullptr, py::arg("pin_on_release") = false,
385+
py::call_guard<py::gil_scoped_release>())
371386
.def("pin_blocks", &BaseKVCacheManager::pinBlocks, py::call_guard<py::gil_scoped_release>())
372387
.def("scheduling_remove_sequence", &BaseKVCacheManager::schedulingRemoveSequence,
373388
py::call_guard<py::gil_scoped_release>())

cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4050,11 +4050,13 @@ TEST_F(KVCacheManagerTest, PinAndUnpinBlocksById)
40504050
kvCacheManager.pinBlocks(requestId);
40514051
auto lastBlockIdOpt = kvCacheManager.getLastBlockId(requestId);
40524052
ASSERT_TRUE(lastBlockIdOpt.has_value());
4053+
auto const& allBlockIds = kvCacheManager.getCacheBlockIds(requestId, maxAttentionWindow)[0];
4054+
std::vector<SizeType32> pinnedBlockIds(allBlockIds.begin(), allBlockIds.end());
40534055
(void) kvCacheManager.removeSequence(requestId, llmRequest);
40544056
auto const freeAfterRemovePinned = kvCacheManager.getNumFreeBlocks();
40554057
EXPECT_LT(freeAfterRemovePinned, totalBlocks);
40564058

4057-
kvCacheManager.unpinBlocksById(lastBlockIdOpt.value());
4059+
kvCacheManager.unpinBlocksById(pinnedBlockIds);
40584060
auto const freeAfterUnpin = kvCacheManager.getNumFreeBlocks();
40594061
EXPECT_EQ(freeAfterUnpin, totalBlocks);
40604062
}

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1071,7 +1071,8 @@ def _executor_loop_pp(self):
10711071
for req in previous_batch.scheduled_ctx_reqs:
10721072
if req.is_context_only_request and (
10731073
req.is_context_finished
1074-
or req.is_finished_due_to_length):
1074+
or req.is_finished_due_to_length
1075+
) and not req.is_finished_due_to_cancellation:
10751076
block_id = self.kv_cache_manager.store_blocks_for_reuse(
10761077
req, True)
10771078
self.ctx_in_transmission_requests[
@@ -1340,7 +1341,8 @@ def _executor_loop(self):
13401341
for req in scheduled_batch.context_requests:
13411342
if req.is_context_only_request and (
13421343
req.is_context_finished
1343-
or req.is_finished_due_to_length):
1344+
or req.is_finished_due_to_length
1345+
) and not req.is_finished_due_to_cancellation:
13441346
block_id = self.kv_cache_manager.store_blocks_for_reuse(
13451347
req, True)
13461348
self.ctx_in_transmission_requests[
@@ -1567,7 +1569,8 @@ def _executor_loop_overlap(self):
15671569
for req in self.previous_batch.sample_state.scheduled_requests.context_requests:
15681570
if req.is_context_only_request and (
15691571
req.is_context_finished
1570-
or req.is_finished_due_to_length):
1572+
or req.is_finished_due_to_length
1573+
) and not req.is_finished_due_to_cancellation:
15711574
block_id = self.kv_cache_manager.store_blocks_for_reuse(
15721575
req, True)
15731576
self.ctx_in_transmission_requests[
@@ -2076,8 +2079,9 @@ def _send_disagg_ctx_cache(self, scheduled_ctx_requests):
20762079
if (scheduled_ctx_requests is None or len(scheduled_ctx_requests) == 0):
20772080
return []
20782081
for req in scheduled_ctx_requests:
2079-
if req.is_context_only_request and (req.is_context_finished or
2080-
req.is_finished_due_to_length):
2082+
if req.is_context_only_request and (
2083+
req.is_context_finished or req.is_finished_due_to_length
2084+
) and not req.is_finished_due_to_cancellation:
20812085
self.kv_cache_transceiver.respond_and_send_async(req)
20822086
for resource_mgr_type in (
20832087
ResourceManagerType.SEQ_SLOT_MANAGER,
@@ -2327,7 +2331,12 @@ def _do_terminate_request(self, request: LlmRequest):
23272331
self.ctx_in_transmission_requests[request.py_request_id] = (
23282332
(request, block_id, self.ctx_in_transmission_counter))
23292333

2330-
self.resource_manager.free_resources(request)
2334+
store_blocks_for_reuse = not (self.block_reuse_enabled
2335+
and not self.kv_cache_manager.is_vswa
2336+
and self.kv_cache_transceiver
2337+
and request.is_context_only_request)
2338+
self.resource_manager.free_resources(
2339+
request, store_blocks_for_reuse=store_blocks_for_reuse)
23312340

23322341
if self.gather_all_responses or self.dist.rank == 0:
23332342
self.result_wait_queues.pop(request.py_request_id, None)

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -663,8 +663,13 @@ def update_kv_cache_draft_token_location(self,
663663
None,
664664
)
665665

666-
def free_resources(self, request: LlmRequest, pin_on_release: bool = False):
667-
return self.impl.remove_sequence(request.py_request_id, request,
666+
def free_resources(self,
667+
request: LlmRequest,
668+
pin_on_release: bool = False,
669+
store_blocks_for_reuse: bool = True):
670+
# When store_blocks_for_reuse is False, pass None to prevent block storage
671+
llm_request = request if store_blocks_for_reuse else None
672+
return self.impl.remove_sequence(request.py_request_id, llm_request,
668673
pin_on_release)
669674

670675
def store_blocks_for_reuse(self,
@@ -1407,10 +1412,17 @@ def update_resources(self,
14071412
else:
14081413
resource_manager.update_resources(scheduled_batch)
14091414

1410-
def free_resources(self, request: LlmRequest):
1411-
for _, resource_manager in reversed(self.resource_managers.items()):
1415+
def free_resources(self,
1416+
request: LlmRequest,
1417+
store_blocks_for_reuse: bool = True):
1418+
for resource_type, resource_manager in reversed(
1419+
self.resource_managers.items()):
14121420
if hasattr(resource_manager, "free_resources"):
1413-
resource_manager.free_resources(request)
1421+
if resource_type == ResourceManagerType.KV_CACHE_MANAGER:
1422+
resource_manager.free_resources(
1423+
request, store_blocks_for_reuse=store_blocks_for_reuse)
1424+
else:
1425+
resource_manager.free_resources(request)
14141426

14151427
def reorder_pipeline(self,
14161428
resource_manager_list: list[ResourceManagerType]):

0 commit comments

Comments (0)