Skip to content

Commit 3d3561e

Browse files
committed
[TRTLLM-9687][chore] Prevent draft requests from changing max_lengths_tensor
Signed-off-by: Stefan Niebler <82932102+stnie@users.noreply.github.com>
1 parent af6ff9b commit 3d3561e

File tree

2 files changed

+11
-3
lines changed

2 files changed

+11
-3
lines changed

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 7 additions & 3 deletions
```diff
@@ -1536,9 +1536,13 @@ def _process_draft_tokens_tree(
             return num_accepted_draft_tokens - 1

     def _is_new_request(self, request: LlmRequest) -> bool:
-        return not request.is_finished and (
-            (request.is_context_init_state and request.is_last_context_chunk)
-            or request.is_disagg_generation_transmission_complete
+        return (
+            not request.is_finished
+            and not request.py_is_draft
+            and (
+                (request.is_context_init_state and request.is_last_context_chunk)
+                or request.is_disagg_generation_transmission_complete
+            )
         )
```
15431547

15441548
@override

tensorrt_llm/_torch/speculative/mtp.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -230,6 +230,10 @@ class Store(TorchSampler.Store):
     def __post_init__(self):
         pass  # finish_reasons has no size to compare against new_tokens in MTPSampler

+    def setup_sampler_step(self, scheduled_requests: ScheduledRequests):
+        # MTPSampler does not need to setup additional buffers before the sampler step
+        pass
+
     def __init__(self, args: TorchSampler.Args, *, nextn: int):
         self.mapping = None
         self.draft_len = nextn
```

0 commit comments

Comments
 (0)