 import time
 import traceback
 import weakref
-from collections import namedtuple
+from collections import deque, namedtuple
 from contextlib import contextmanager
 from typing import Dict, List, Optional, Tuple, Union

class RequestQueueItem:
    id: int
    request: Optional[ExecutorRequest] = None
+    is_canceled_request: bool = False
    query: Optional[list] = None  # only used in `StarAttention`

+    @property
    def is_shutdown_request(self):
        return self.id == SHUTDOWN_REQUEST_ID

+    @property
+    def is_normal_request(self):
+        return not (self.is_shutdown_request or self.is_canceled_request)

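The class above now distinguishes three kinds of queue items: normal requests, the shutdown sentinel, and the new cancellation markers, with `is_shutdown_request` promoted from a method to a property (call sites below drop the `()` accordingly). A self-contained sketch of the resulting routing contract; the stub types and the sentinel value are assumptions, only the names match the diff:

```python
from dataclasses import dataclass
from typing import Optional

SHUTDOWN_REQUEST_ID = -1  # assumed sentinel value; defined elsewhere in the real module


@dataclass
class RequestQueueItem:
    id: int
    request: Optional[object] = None  # ExecutorRequest in the real code
    is_canceled_request: bool = False

    @property
    def is_shutdown_request(self):
        return self.id == SHUTDOWN_REQUEST_ID

    @property
    def is_normal_request(self):
        return not (self.is_shutdown_request or self.is_canceled_request)


# Exactly one predicate holds for each kind of item the fetch loop must route:
assert RequestQueueItem(SHUTDOWN_REQUEST_ID).is_shutdown_request
assert RequestQueueItem(42, is_canceled_request=True).is_canceled_request
assert RequestQueueItem(7, request=object()).is_normal_request
```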
-def _get_from_request_queue(request_queue,
-                            timeout: Optional[datetime.timedelta],
-                            max_req_count: int) -> List[RequestQueueItem]:
+
+def _get_from_request_queue(
+        request_queue,
+        timeout: Optional[datetime.timedelta]) -> List[RequestQueueItem]:
    items = []
    timeout_secs = timeout.total_seconds() if timeout is not None else None
-    req_count = 0
    try:
        if request_queue.empty() and (timeout_secs is None or timeout_secs > 0):
            # if queue is empty and want to wait, wait
            items.append(request_queue.get(timeout=timeout_secs))
        else:
            # if not empty or don't want to wait, just return all items in queue
-            while req_count < max_req_count:
+            while True:
                queue_item = request_queue.get_nowait()
                items.append(queue_item)
-                if not queue_item.is_shutdown_request():
-                    req_count += 1
    except queue.Empty:
        pass
    return items


+def _get_from_waiting_queue(
+    waiting_queue: deque[RequestQueueItem],
+    max_req_count: int,
+) -> List[RequestQueueItem]:
+    """Safely extract up to max_req_count items from a deque.
+
+    Args:
+        waiting_queue: The queue to pop items from.
+        max_req_count: Maximum number of items to retrieve. Returns an
+            empty list if it is <= 0.
+
+    Returns:
+        List of retrieved items (may be shorter than max_req_count if the
+        queue empties first).
+    """
+    # Edge case: non-positive counts are a no-op, not an error.
+    if max_req_count <= 0:
+        return []
+
+    items = []
+    req_count = 0
+    while req_count < max_req_count and waiting_queue:
+        items.append(waiting_queue.popleft())
+        req_count += 1
+    return items
+
+
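Capacity limiting moves out of `_get_from_request_queue`, which now drains the inter-process queue completely, and into the new `_get_from_waiting_queue`, which pops from an in-memory staging deque. The apparent intent is that control items (shutdown, cancellation) are always picked up promptly instead of waiting behind the active-request cap. A usage sketch of the helper above, with bare ints standing in for `RequestQueueItem`s:

```python
from collections import deque

# Bare ints stand in for RequestQueueItem objects in this sketch.
waiting_queue = deque([1, 2, 3, 4, 5])

batch = _get_from_waiting_queue(waiting_queue, max_req_count=3)
assert batch == [1, 2, 3]
assert list(waiting_queue) == [4, 5]  # remainder stays staged for later iterations

# Non-positive counts (e.g. when active requests already fill capacity)
# return nothing rather than raising:
assert _get_from_waiting_queue(waiting_queue, max_req_count=0) == []
```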
@functools.cache
def _load_iteration_indexes(env_var: str):
    spans = os.environ.get(env_var, None)
@@ -182,6 +210,7 @@ def __init__(self,
        self.device_id = torch.cuda.current_device()
        self.global_rank = global_mpi_rank()
        self.request_queue: queue.Queue[RequestQueueItem] = queue.Queue()
+        self.waiting_queue: deque[RequestQueueItem] = deque()

        # profile config
        self.profile_start_iters, self.profile_stop_iters = _load_iteration_indexes(
@@ -251,7 +280,7 @@ def __init__(self,
        self.send_handles = [None] * self.num_micro_batches

        self.inflight_req_ids = ReqIdsSet()
-        self.canceled_req_ids = ReqIdsSet()
+        self.canceled_req_ids = []

        self.model_engine.warmup(self.resource_manager)
        if self.draft_model_engine is not None:
@@ -368,7 +397,12 @@ def cancel_request(self, id: int):
        Args:
            id (int): The request id for which to cancel the response
        """
-        self.canceled_req_ids.insert(id)
+        try:
+            self.enqueue_lock.acquire()
+            self.request_queue.put(
+                RequestQueueItem(id, is_canceled_request=True))
+        finally:
+            self.enqueue_lock.release()

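Cancellation no longer mutates `canceled_req_ids` directly from the caller's thread; it is serialized through the same queue and `enqueue_lock` as regular enqueues, then routed by `_fetch_new_requests` on the executor side. A minimal sketch of that handshake, with a plain `threading.Lock` standing in for the executor's `enqueue_lock` and `RequestQueueItem` as sketched earlier; the `with` block is equivalent to the explicit `acquire`/`release` pair in the diff:

```python
import queue
import threading

request_queue: queue.Queue = queue.Queue()
enqueue_lock = threading.Lock()  # stands in for self.enqueue_lock
canceled_req_ids = []


def cancel_request(req_id: int) -> None:
    # Producer side: a cancellation is just another queue item, ordered
    # consistently with regular enqueues under the same lock.
    with enqueue_lock:
        request_queue.put(RequestQueueItem(req_id, is_canceled_request=True))


cancel_request(42)

# Consumer side (see _fetch_new_requests below): the marker is routed to
# canceled_req_ids instead of becoming an active request.
item = request_queue.get_nowait()
if item.is_canceled_request:
    canceled_req_ids.append(item.id)
assert canceled_req_ids == [42]
```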
    def shutdown(self):
        """
@@ -454,6 +488,11 @@ def enqueue_request(self,
    def set_gather_responses(self, gather_all_responses):
        self.gather_all_responses = gather_all_responses

+    @property
+    def should_stop_processing(self):
+        return self.is_shutdown and len(self.active_requests) == 0 and len(
+            self.waiting_queue) == 0
+
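`should_stop_processing` folds the staging queue into the executor loops' exit condition: with requests now parked in `waiting_queue`, stopping on `is_shutdown and not active_requests` alone could exit while staged work is still pending. A compact sketch of the predicate, with plain arguments standing in for the executor's state:

```python
# Plain arguments stand in for self.is_shutdown / active_requests / waiting_queue.
def should_stop(is_shutdown: bool, num_active: int, num_waiting: int) -> bool:
    return is_shutdown and num_active == 0 and num_waiting == 0

assert not should_stop(True, 0, 3)  # shutdown seen, but staged requests remain
assert not should_stop(True, 2, 0)  # active requests still in flight
assert should_stop(True, 0, 0)      # fully drained: safe to exit the loop
```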
    @contextmanager
    def _profiler(self):
        it = -1
@@ -710,12 +749,12 @@ def _executor_loop_pp(self):
        with self._profiler() as profile_step:
            iter_start_time = time.time()
            iter_stats = None
-            while not self.is_shutdown or len(self.active_requests) > 0:
+            while not self.should_stop_processing:
                profile_step()
                if self.enable_iter_perf_stats:
                    iter_start_time = time.time()
                new_requests = self._fetch_new_requests()
-                if self.is_shutdown and len(self.active_requests) == 0:
+                if self.should_stop_processing:
                    break

                if self.enable_iter_perf_stats:
@@ -839,7 +878,7 @@ def _executor_loop_pp(self):
                if previous_batch is not None:
                    with torch.cuda.nvtx.range("_handle_previous_batch_pp"):
                        self._update_requests(previous_batch.sample_state)
-                        self._handle_cancelled_requests()
+                        self._handle_canceled_requests()
                        finished_requests = self._handle_responses()
                        previous_scheduled_batch = previous_batch.sample_state.scheduled_requests
                        self.resource_manager.update_resources(
@@ -861,12 +900,12 @@ def _executor_loop(self):
            sample_state = None
            iter_start_time = time.time()
            iter_stats = None
-            while not self.is_shutdown or len(self.active_requests) > 0:
+            while not self.should_stop_processing:
                profile_step()
                if self.enable_iter_perf_stats:
                    iter_start_time = time.time()
                new_requests = self._fetch_new_requests()
-                if self.is_shutdown and len(self.active_requests) == 0:
+                if self.should_stop_processing:
                    break

                if self.kv_cache_transceiver:
@@ -950,7 +989,7 @@ def _executor_loop(self):
                    for req in ctx_transmission_reqs:
                        req.state = LlmRequestState.DISAGG_CONTEXT_TRANS_IN_PROGRESS

-                self._handle_cancelled_requests()
+                self._handle_canceled_requests()
                finished_requests = self._handle_responses()
                self.resource_manager.update_resources(scheduled_batch)
                if self.enable_kv_cache_events:
@@ -1006,12 +1045,12 @@ def _executor_loop_overlap(self):
        with self._profiler() as profile_step:
            iter_start_time = time.time()
            iter_stats = None
-            while not self.is_shutdown or len(self.active_requests) > 0:
+            while not self.should_stop_processing:
                profile_step()
                if self.enable_iter_perf_stats:
                    iter_start_time = time.time()
                new_requests = self._fetch_new_requests()
-                if self.is_shutdown and len(self.active_requests) == 0:
+                if self.should_stop_processing:
                    break

                if self.kv_cache_transceiver:
@@ -1125,7 +1164,7 @@ def _process_previous_batch(self):
        for req in self.previous_batch.ctx_transmission_reqs:
            req.state = LlmRequestState.DISAGG_CONTEXT_TRANS_IN_PROGRESS

-        self._handle_cancelled_requests()
+        self._handle_canceled_requests()
        finished_requests = self._handle_responses()
        scheduled_requests = self.previous_batch.sample_state.scheduled_requests
        self.resource_manager.update_resources(scheduled_requests)
@@ -1200,13 +1239,11 @@ def _fetch_new_requests(self) -> List[RequestQueueItem]:
        total_num_active_requests = len(self.active_requests)
        total_max_num_active_requests = self.max_num_active_requests

-        timeout = None if total_num_active_requests == 0 else datetime.timedelta(
-            0)
+        timeout = None if (total_num_active_requests == 0) and len(
+            self.waiting_queue) == 0 else datetime.timedelta(0)
        new_requests = []
        if self.dist.rank == 0:
-            new_requests = _get_from_request_queue(
-                self.request_queue, timeout,
-                total_max_num_active_requests - total_num_active_requests)
+            new_requests = _get_from_request_queue(self.request_queue, timeout)

        if self.dist.rank == 0:
            py_logits_post_processors = self._collect_py_objects_from_requests(
@@ -1229,21 +1266,28 @@ def _fetch_new_requests(self) -> List[RequestQueueItem]:
        # drop requests arriving after shutdown
        valid_new_requests = []
        for req_item in new_requests:
-            if req_item.is_shutdown_request():
+            if req_item.is_shutdown_request:
                self.is_shutdown = True
                break
+            elif req_item.is_canceled_request:
+                self.canceled_req_ids.append(req_item.id)
            else:
                valid_new_requests.append(req_item)
        # Check if the beam width of the requests is equal to the max_beam_width
        for req_item in valid_new_requests:
            assert req_item.request.sampling_config.beam_width == self.max_beam_width, f"Request beam width {req_item.request.sampling_config.beam_width} is not equal to max_beam_width {self.max_beam_width}. This is not supported!"
-        new_requests = valid_new_requests

        if py_request_objects and (self.dist.tp_size > 1
                                   or self.dist.has_pp) and self.dist.rank > 0:
            for attr_name, req_obj_dict in py_request_objects:
-                self._attach_py_objects_to_requests(new_requests, attr_name,
-                                                    req_obj_dict)
+                self._attach_py_objects_to_requests(valid_new_requests,
+                                                    attr_name, req_obj_dict)
+
+        self.waiting_queue.extend(valid_new_requests)
+
+        new_requests = _get_from_waiting_queue(
+            self.waiting_queue,
+            total_max_num_active_requests - total_num_active_requests)

        if not self.enable_attention_dp:
            self._update_new_active_requests_queue_latency(new_requests)
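Taken together, rank 0's fetch path is now a two-stage pipeline: drain the inter-process queue, route control items (shutdown, cancellation), stage the survivors, then admit only what fits in the free capacity. A condensed restatement of the hunk above, assuming an `executor` object with the attributes named in the diff:

```python
def fetch_new_requests_sketch(executor, timeout):
    # Stage 1: drain everything currently in the inter-process queue.
    arrived = _get_from_request_queue(executor.request_queue, timeout)

    valid_new_requests = []
    for item in arrived:
        if item.is_shutdown_request:    # sentinel: stop accepting work
            executor.is_shutdown = True
            break
        elif item.is_canceled_request:  # marker: handled later in _handle_canceled_requests
            executor.canceled_req_ids.append(item.id)
        else:
            valid_new_requests.append(item)

    # Stage 2: park everything, then admit only up to the free capacity.
    executor.waiting_queue.extend(valid_new_requests)
    free_slots = executor.max_num_active_requests - len(executor.active_requests)
    return _get_from_waiting_queue(executor.waiting_queue, free_slots)
```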
@@ -1339,7 +1383,7 @@ def _collect_py_objects_from_requests(
        """
        req_id_to_obj = {}
        for item in requests:
-            if item.is_shutdown_request():
+            if not item.is_normal_request:
                continue
            obj = getattr(item.request, attribute_name, None)
            if obj is not None:
@@ -1926,41 +1970,28 @@ def _handle_errors(self, error_msg: Optional[str] = None):
    def _terminate_request(self, request: LlmRequest):
        self.resource_manager.free_resources(request)

-    @nvtx_range("_handle_cancelled_requests")
-    def _handle_cancelled_requests(self):
-        #TODO: properly handle canceled ids in pp case
-        if self.dist.has_tp:
-            self.canceled_req_ids = self.dist.broadcast(self.canceled_req_ids,
-                                                        root=0)
-
+    @nvtx_range("_handle_canceled_requests")
+    def _handle_canceled_requests(self):
        if len(self.canceled_req_ids) == 0:
            return

-        cancelled_responses = {}
-        left_requests = []
-        # Tracks canceled requests for proper handling in overlap mode during `sampler.update_requests`.
-        self.canceled_requests = []
+        # Drop canceled requests that are still waiting in the staging queue.
+        self.waiting_queue = deque(req for req in self.waiting_queue
+                                   if req.id not in self.canceled_req_ids)
+
        for request in self.active_requests:
            req_id = request.py_request_id
            if req_id in self.canceled_req_ids:
-                self._terminate_request(request)
+                # Mark the request as finished; the existing response and
+                # cleanup paths then release its KV cache resources.
                request.finish_by_reason(FinishReason.CANCELLED)
                request.decoding_iter = request.py_decoding_iter
-                cancelled_responses[req_id] = request.create_response(
-                    False, self.dist.rank)
-                self.canceled_requests.append(request)
-                self.canceled_req_ids.erase(req_id)
-            else:
-                left_requests.append(request)
-        self.active_requests = left_requests

-        # When enable attention dp, each rank does not have full copy of requests
-        # so we need to remove the cancel requests not in the local rank
-        self.canceled_req_ids.clear()
-
-        # enqueue the cancelled requests' responses as they are not
-        # active_requests and be discarded in the sampler loop.
-        self._enqueue_responses(cancelled_responses)
+        if self.enable_attention_dp:
+            # TODO: revisit the cancellation logic for attention DP.
+            # With attention DP, each rank holds only a subset of the
+            # requests, so canceled ids that do not belong to the local
+            # rank must be dropped here.
+            self.canceled_req_ids.clear()

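The rewritten `_handle_canceled_requests` no longer terminates requests and hand-builds their responses; it filters the waiting queue and flags active requests with `FinishReason.CANCELLED`, letting the regular `_handle_responses` path produce the response and free KV cache resources. One consequence of `canceled_req_ids` becoming a plain list (see the `__init__` hunk above) is that the `req.id not in self.canceled_req_ids` membership tests are linear; a set would keep them O(1) if cancellation volume grows. A self-contained sketch of the filtering step:

```python
from collections import deque


# Stub items with just an `id`, standing in for RequestQueueItem.
class _Item:
    def __init__(self, id):
        self.id = id


waiting_queue = deque(_Item(i) for i in (1, 2, 3, 4))
canceled_req_ids = [2, 4]

# Same deque-rebuild pattern as the diff: canceled items simply vanish
# from staging before they ever become active requests.
waiting_queue = deque(item for item in waiting_queue
                      if item.id not in canceled_req_ids)
assert [item.id for item in waiting_queue] == [1, 3]
```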
    @nvtx_range("_enqueue_responses")
    def _enqueue_responses(self, responses: Dict[int, LlmResponse]):