Commit 079ef8a

jellysnack and syuoni authored
[None][feat] Graceful Error Handling for Guided Decoder (NVIDIA#9078)
Signed-off-by: jellysnack <oleg.jellysnack@gmail.com>
Signed-off-by: jellysnack <158609015+jellysnack@users.noreply.github.com>
Co-authored-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
1 parent 85406f9 commit 079ef8a

File tree

2 files changed: +127 / -63 lines changed

tensorrt_llm/_torch/pyexecutor/guided_decoder.py

Lines changed: 77 additions & 60 deletions
@@ -204,73 +204,84 @@ def __init__(self,
     def bitmask_size(self) -> int:
         return math.ceil(self.vocab_size_padded / 32)

-    def _build(self, requests: GuidedRequests) -> None:
+    def _build(self, requests: GuidedRequests) -> List[Tuple[int, str]]:
         """Build the bitmask for requests with guided decoding enabled.

         Specifically, this method:
         - build and advance the grammar matcher for context and generation requests, respectively;
         - call the grammar matcher to fill the bitmask on CPU;
         - asynchronously copy the bitmask to GPU.
         """
+        failed_requests = []
         self.token_mask_host[:requests.num_bitmask_tokens].fill_(0)

         for req, offset in requests.valid_requests_with_offsets():
             slot = req.seq_slot
-            self.num_advanced_tokens[slot] = 0
-            self.num_guided_tokens[slot] = 0
+            try:
+                self.num_advanced_tokens[slot] = 0
+                self.num_guided_tokens[slot] = 0

-            matcher_init: bool = req.require_matcher_init()
-            matcher_advance: bool = req.require_matcher_advance()
-            if not (matcher_init or matcher_advance):
-                continue
-
-            if matcher_init:
-                matcher = self.grammar_matcher_factory.create(
-                    req.guided_decoding_params)
-                self.grammar_matchers[slot] = matcher
-
-            if matcher_advance:
-                matcher = self.grammar_matchers[slot]
-                # The last new token must be acceptable unless the matcher is terminated:
-                # 1. For the main model loop, when overlap scheduler is enabled, the matcher may have accepted the EOS token in the draft tokens at the previous iteration.
-                # 2. For the draft model loop, the matcher may have accepted the EOS token at the previous drafting iteration.
-                if matcher.is_terminated() or self.is_draft_terminated[slot]:
+                matcher_init: bool = req.require_matcher_init()
+                matcher_advance: bool = req.require_matcher_advance()
+                if not (matcher_init or matcher_advance):
                     continue
-                accepted = matcher.accept_token(req.new_token)
-                if not accepted:
-                    if req.is_draft:
-                        self.is_draft_terminated[slot] = True
-                        logger.debug(
-                            f"Draft request {req.request_id} at slot {slot} failed to accept last new token: {req.new_token}."
-                        )
+
+                if matcher_init:
+                    matcher = self.grammar_matcher_factory.create(
+                        req.guided_decoding_params)
+                    self.grammar_matchers[slot] = matcher
+
+                if matcher_advance:
+                    matcher = self.grammar_matchers[slot]
+                    # The last new token must be acceptable unless the matcher is terminated or None:
+                    # 1. For the main model loop, when overlap scheduler is enabled, the matcher may have accepted the EOS token in the draft tokens at the previous iteration.
+                    # 2. For the draft model loop, the matcher may have accepted the EOS token at the previous drafting iteration.
+                    # 3. The matcher can be None if there was an error during its creation.
+                    if matcher is None or matcher.is_terminated(
+                    ) or self.is_draft_terminated[slot]:
                         continue
-                    # TODO: Make this an error response.
-                    raise ValueError(
-                        f"Request {req.request_id} at slot {slot} failed to accept last new token: {req.new_token}."
-                    )
-
-            self.num_advanced_tokens[slot] += 1
-            if not matcher.is_terminated():
-                matcher.fill_next_token_bitmask(self.bitmask_host, offset)
-                self.token_mask_host[offset] = 1
-                self.num_guided_tokens[slot] += 1
-                # Process draft tokens
-                for i, tid in enumerate(req.draft_tokens, 1):
-                    accepted = matcher.accept_token(tid)
+                    accepted = matcher.accept_token(req.new_token)
                     if not accepted:
-                        break
-                    self.num_advanced_tokens[slot] += 1
-                    if matcher.is_terminated():
-                        break
-                    matcher.fill_next_token_bitmask(self.bitmask_host,
-                                                    offset + i)
-                    self.token_mask_host[offset + i] = 1
+                        if req.is_draft:
+                            self.is_draft_terminated[slot] = True
+                            logger.debug(
+                                f"Draft request {req.request_id} at slot {slot} failed to accept last new token: {req.new_token}."
+                            )
+                            continue
+                        raise ValueError(
+                            f"Request {req.request_id} at slot {slot} failed to accept last new token: {req.new_token}."
+                        )
+
+                self.num_advanced_tokens[slot] += 1
+                if not matcher.is_terminated():
+                    matcher.fill_next_token_bitmask(self.bitmask_host, offset)
+                    self.token_mask_host[offset] = 1
                     self.num_guided_tokens[slot] += 1
+                    # Process draft tokens
+                    for i, tid in enumerate(req.draft_tokens, 1):
+                        accepted = matcher.accept_token(tid)
+                        if not accepted:
+                            break
+                        self.num_advanced_tokens[slot] += 1
+                        if matcher.is_terminated():
+                            break
+                        matcher.fill_next_token_bitmask(self.bitmask_host,
+                                                        offset + i)
+                        self.token_mask_host[offset + i] = 1
+                        self.num_guided_tokens[slot] += 1
+
+                if req.is_draft:
+                    assert len(req.draft_tokens) == 0
+                    self.num_advanced_draft_tokens[
+                        slot] += self.num_advanced_tokens[slot]
+            except Exception as e:
+                error_msg = f"Guided decoding error: {str(e)}"
+                failed_requests.append((req.request_id, error_msg))
+                logger.error(
+                    f"Request {req.request_id} at slot {slot} failed during guided decoding: {error_msg}"
+                )

-            if req.is_draft:
-                assert len(req.draft_tokens) == 0
-                self.num_advanced_draft_tokens[
-                    slot] += self.num_advanced_tokens[slot]
+        return failed_requests

     def _copy_bitmask(self,
                       requests: GuidedRequests,
@@ -306,8 +317,8 @@ def add_batch(self, scheduled_requests: ScheduledRequests) -> None:
             scheduled_requests, self.max_num_draft_tokens)

     @nvtx_range("GuideDecoder.build")
-    def build(self) -> None:
-        self._build(self.requests)
+    def build(self) -> List[Tuple[int, str]]:
+        return self._build(self.requests)

     @nvtx_range("GuideDecoder.copy_bitmask")
     def copy_bitmask(self, num_bitmask_tokens: Optional[int] = None) -> None:
@@ -325,8 +336,8 @@ def apply_bitmask(self,

     def execute(self,
                 logits: torch.Tensor,
-                d2t: Optional[torch.Tensor] = None) -> None:
-        self.build()
+                d2t: Optional[torch.Tensor] = None) -> List[Tuple[int, str]]:
+        failed_requests = self.build()

         with torch.cuda.stream(self.stream):
             torch.cuda.current_stream().wait_event(self.token_event)
@@ -337,6 +348,8 @@ def execute(self,
             self.apply_bitmask(logits, d2t=d2t)
         self.token_event.record()

+        return failed_requests
+
     def _rollback_rejected_tokens(self, requests: GuidedRequests) -> None:
         """Rollback the grammar matcher for rejected tokens.

@@ -460,23 +473,25 @@ def fetch_batch(self) -> None:
         )

     @hostfunc
-    def build(self) -> None:
-        self._build(self.requests_hostfunc)
+    def build(self) -> List[Tuple[int, str]]:
+        return self._build(self.requests_hostfunc)

     def execute(self,
                 logits: torch.Tensor,
-                d2t: Optional[torch.Tensor] = None) -> None:
+                d2t: Optional[torch.Tensor] = None) -> List[Tuple[int, str]]:
         with torch.cuda.stream(self.stream):
             torch.cuda.current_stream().wait_event(self.token_event)
             self.fetch_batch()
             self.init_disagg_gen_requests()
-            self.build()
+            failed_requests = self.build()
             self.copy_bitmask()
             self.bitmask_event.record()

         torch.cuda.current_stream().wait_event(self.bitmask_event)
         self.apply_bitmask(logits, d2t=d2t)

+        return failed_requests
+
     @hostfunc
     def rollback_rejected_tokens(self) -> None:
         self._rollback_rejected_tokens(self.requests_hostfunc)
@@ -532,13 +547,13 @@ def fetch_draft_batch(self, draft_step: int = 0) -> None:
     def execute_draft_batch(self,
                             logits: torch.Tensor,
                             d2t: Optional[torch.Tensor] = None,
-                            draft_step: int = 0) -> None:
+                            draft_step: int = 0) -> List[Tuple[int, str]]:
         with torch.cuda.stream(self.stream):
             torch.cuda.current_stream().wait_event(self.token_event)
             self.fetch_draft_batch(draft_step=draft_step)
             if draft_step == 0:
                 self.rollback_rejected_tokens()
-            self.build()
+            failed_requests = self.build()
            if draft_step == self.max_num_draft_tokens - 1:
                 self.rollback_draft_tokens()
             # Overwrite num_bitmask_tokens since the request might not be updated on CUDA stream yet.
@@ -550,3 +565,5 @@ def execute_draft_batch(self,
             self.apply_bitmask(logits,
                                d2t=d2t,
                                num_bitmask_tokens=len(self.requests))
+
+        return failed_requests
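In summary, `_build` now wraps the per-request matcher work in a try/except, records `(request_id, error_message)` pairs instead of raising, and `build`/`execute` propagate that list to the caller. A minimal, self-contained sketch of this collect-and-continue pattern (the request payload and `advance` callable below are simplified stand-ins for illustration, not the TensorRT-LLM classes):

from typing import Callable, List, Tuple

def build_with_error_collection(
        requests: List[Tuple[int, str]],  # (request_id, payload) stand-ins
        advance: Callable[[str], None],  # stand-in for matcher init/advance work
) -> List[Tuple[int, str]]:
    """Process each request; on failure, record the error and keep going."""
    failed_requests: List[Tuple[int, str]] = []
    for request_id, payload in requests:
        try:
            advance(payload)  # may raise, e.g. on an invalid grammar or token
        except Exception as e:
            # A single bad request no longer aborts the whole batch.
            failed_requests.append((request_id, f"Guided decoding error: {e}"))
    return failed_requests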

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 50 additions & 3 deletions
@@ -984,14 +984,22 @@ def _executor_loop_pp(self):

                 batch_outputs = self._forward_step(scheduled_batch)

+                guided_decoder_failed_requests = None
                 if self.guided_decoder is not None:
                     self.guided_decoder.add_batch(scheduled_batch)
-                    self.guided_decoder.execute(
+                    guided_decoder_failed_requests = self.guided_decoder.execute(
                         batch_outputs['logits'])

                 sample_state = self._sample_async(
                     scheduled_batch, batch_outputs)
                 assert sample_state is not None, "Sampling failed"
+
+                # Handle guided decoder errors after _sample_async to avoid state conflicts.
+                # If called before, failed requests would be marked as GENERATION_COMPLETE,
+                # causing _sample_async to fail when accessing context_chunk_size property.
+                self._handle_guided_decoder_errors(
+                    scheduled_batch, guided_decoder_failed_requests)
+
                 self._update_request_states(scheduled_batch)

                 if self.enable_iter_perf_stats:
@@ -1306,11 +1314,21 @@ def _executor_loop(self):
                     self.guided_decoder.rollback_draft_tokens()

                 batch_outputs = self._forward_step(scheduled_batch)
+
+                guided_decoder_failed_requests = None
                 if self.guided_decoder is not None:
-                    self.guided_decoder.execute(batch_outputs['logits'])
+                    guided_decoder_failed_requests = self.guided_decoder.execute(
+                        batch_outputs['logits'])

                 sample_state = self._sample_async(scheduled_batch,
                                                   batch_outputs)
+
+                # Handle guided decoder errors after _sample_async to avoid state conflicts.
+                # If called before, failed requests would be marked as GENERATION_COMPLETE,
+                # causing _sample_async to fail when accessing context_chunk_size property.
+                self._handle_guided_decoder_errors(
+                    scheduled_batch, guided_decoder_failed_requests)
+
                 if self.drafter is not None:
                     self.drafter.run_drafter_post(scheduled_batch,
                                                   self.resource_manager,
@@ -1562,15 +1580,23 @@ def _executor_loop_overlap(self):
                     self.drafter.cleanup_previous_draft_resources()

                 if can_queue:
+                    guided_decoder_failed_requests = None
                     if self.guided_decoder is not None:
                         # add_batch must be called again to have updated new tokens.
                         self.guided_decoder.add_batch(scheduled_batch)
-                        self.guided_decoder.execute(batch_outputs['logits'])
+                        guided_decoder_failed_requests = self.guided_decoder.execute(
+                            batch_outputs['logits'])

                     sample_state = self._sample_async(scheduled_batch,
                                                       batch_outputs)
                     assert sample_state is not None, "Sampling failed"

+                    # Handle guided decoder errors after _sample_async to avoid state conflicts.
+                    # If called before, failed requests would be marked as GENERATION_COMPLETE,
+                    # causing _sample_async to fail when accessing context_chunk_size property.
+                    self._handle_guided_decoder_errors(
+                        scheduled_batch, guided_decoder_failed_requests)
+
                     self._update_request_states(scheduled_batch)

                     ctx_transmission_reqs = self._send_disagg_ctx_cache(
@@ -2694,6 +2720,27 @@ def _handle_speculative_decoding(self, scheduled_batch, previous_tensors,
     def reset_prefix_cache(self):
         self.kv_cache_manager.reset_reuse_state()

+    def _handle_guided_decoder_errors(
+            self, scheduled_batch: ScheduledRequests,
+            failed_requests: Optional[List[Tuple[int, str]]]):
+        """Handle errors that occurred during guided decoding.
+
+        Args:
+            scheduled_batch: The current batch of scheduled requests
+            failed_requests: List of (request_id, error_message) tuples for failed requests,
+                or None if no failures occurred
+        """
+        if not failed_requests:
+            return
+
+        failed_req_id_to_err = {req_id: err for req_id, err in failed_requests}
+
+        for request in scheduled_batch.all_requests():
+            if request.py_request_id not in failed_req_id_to_err:
+                continue
+            error_msg = failed_req_id_to_err[request.py_request_id]
+            self._handle_errors(error_msg, requests=[request])
+

 class DisaggPPTerminationHandler:
     """Handles termination synchronization across pipeline parallel ranks under disaggregated serving.
