[https://nvbugs/5689235][fix] Fix cancellation+chunked prefill+disagg

Tabrizian · Tabrizian · commit 108a1c075ec5 · 2025-12-18T00:12:48.000-08:00
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -1667,6 +1667,12 @@ class GenericLlmRequest
             [](auto reason) { return reason == executor::FinishReason::kLENGTH; });
     }
 
+    /// @brief Tells whether every recorded finish reason of this request is kCANCELLED.
+    /// @return true when each entry of mFinishReasons equals executor::FinishReason::kCANCELLED.
+    /// @note std::all_of is true for an empty range, so an empty mFinishReasons also yields true.
+    [[nodiscard]] bool isFinishedDueToCancellation() const noexcept
+    {
+        return std::all_of(mFinishReasons.begin(), mFinishReasons.end(),
+            [](auto reason) { return reason == executor::FinishReason::kCANCELLED; });
+    }
+
     [[nodiscard]] bool isTimedOut() const
     {
         if (!mAllottedTimeMs.has_value())
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
@@ -161,6 +161,7 @@ void initBindings(nb::module_& m)
         .def("set_finished_reason", &GenLlmReq::setFinishedReason, nb::arg("finish_reason"), nb::arg("beam"))
         .def_prop_ro("is_finished", &GenLlmReq::isFinished)
         .def_prop_ro("is_finished_due_to_length", &GenLlmReq::isFinishedDueToLength)
+        .def_prop_ro("is_finished_due_to_cancellation", &GenLlmReq::isFinishedDueToCancellation)
         .def_prop_rw(
             "context_current_position", &GenLlmReq::getContextCurrentPosition, &GenLlmReq::setContextCurrentPosition)
         .def_prop_ro("prepopulated_prompt_len", &GenLlmReq::getPrepopulatedPromptLen)
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
@@ -165,6 +165,7 @@ void initBindings(pybind11::module_& m)
         .def("set_finished_reason", &GenLlmReq::setFinishedReason, py::arg("finish_reason"), py::arg("beam"))
         .def_property_readonly("is_finished", &GenLlmReq::isFinished)
         .def_property_readonly("is_finished_due_to_length", &GenLlmReq::isFinishedDueToLength)
+        .def_property_readonly("is_finished_due_to_cancellation", &GenLlmReq::isFinishedDueToCancellation)
         .def_property(
             "context_current_position", &GenLlmReq::getContextCurrentPosition, &GenLlmReq::setContextCurrentPosition)
         .def_property_readonly("prepopulated_prompt_len", &GenLlmReq::getPrepopulatedPromptLen)
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -1071,7 +1071,8 @@ def _executor_loop_pp(self):
                             for req in previous_batch.scheduled_ctx_reqs:
                                 if req.is_context_only_request and (
                                         req.is_context_finished
-                                        or req.is_finished_due_to_length):
+                                        or req.is_finished_due_to_length
+                                ) and not req.is_finished_due_to_cancellation:
                                     block_id = self.kv_cache_manager.store_blocks_for_reuse(
                                         req, True)
                                     self.ctx_in_transmission_requests[
@@ -1340,7 +1341,8 @@ def _executor_loop(self):
                         for req in scheduled_batch.context_requests:
                             if req.is_context_only_request and (
                                     req.is_context_finished
-                                    or req.is_finished_due_to_length):
+                                    or req.is_finished_due_to_length
+                            ) and not req.is_finished_due_to_cancellation:
                                 block_id = self.kv_cache_manager.store_blocks_for_reuse(
                                     req, True)
                                 self.ctx_in_transmission_requests[
@@ -1567,7 +1569,8 @@ def _executor_loop_overlap(self):
                         for req in self.previous_batch.sample_state.scheduled_requests.context_requests:
                             if req.is_context_only_request and (
                                     req.is_context_finished
-                                    or req.is_finished_due_to_length):
+                                    or req.is_finished_due_to_length
+                            ) and not req.is_finished_due_to_cancellation:
                                 block_id = self.kv_cache_manager.store_blocks_for_reuse(
                                     req, True)
                                 self.ctx_in_transmission_requests[
@@ -2076,8 +2079,9 @@ def _send_disagg_ctx_cache(self, scheduled_ctx_requests):
         if (scheduled_ctx_requests is None or len(scheduled_ctx_requests) == 0):
             return []
         for req in scheduled_ctx_requests:
-            if req.is_context_only_request and (req.is_context_finished or
-                                                req.is_finished_due_to_length):
+            if req.is_context_only_request and (
+                    req.is_context_finished or req.is_finished_due_to_length
+            ) and not req.is_finished_due_to_cancellation:
                 self.kv_cache_transceiver.respond_and_send_async(req)
                 for resource_mgr_type in (
                         ResourceManagerType.SEQ_SLOT_MANAGER,
@@ -2377,6 +2381,7 @@ def _handle_canceled_requests(self):
                 # to clean up the KV cache resources.
                 request.finish_by_reason(FinishReason.CANCELLED)
                 request.decoding_iter = request.py_decoding_iter
+                self.ctx_in_transmission_requests.pop(request.py_request_id)
             else:
                 still_pending_canceled_ids.append(req_id)
 

Original file line number	Diff line number	Diff line change
`@@ -1667,6 +1667,12 @@ class GenericLlmRequest`
`1667`	`1667`	`[](auto reason) { return reason == executor::FinishReason::kLENGTH; });`
`1668`	`1668`	`}`
`1669`	`1669`
	`1670`	`+ [[nodiscard]] bool isFinishedDueToCancellation() const noexcept`
	`1671`	`+ {`
	`1672`	`+ return std::all_of(mFinishReasons.begin(), mFinishReasons.end(),`
	`1673`	`+ [](auto reason) { return reason == executor::FinishReason::kCANCELLED; });`
	`1674`	`+ }`
	`1675`	`+`
`1670`	`1676`	`[[nodiscard]] bool isTimedOut() const`
`1671`	`1677`	`{`
`1672`	`1678`	`if (!mAllottedTimeMs.has_value())`