Skip to content

Commit 42a70e0

Browse files
committed
Commit before rebase.
Signed-off-by: Zheyu Fu <zheyuf@NVIDIA.com>
1 parent 94b2930 commit 42a70e0

File tree

3 files changed

+90
-29
lines changed

3 files changed

+90
-29
lines changed

tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,8 @@ def __init__(self, config: CUDAGraphRunnerConfig):
9999
Callable[[], Optional[torch.Tensor]]] = {}
100100
self.graph_metadata: Dict[Tuple[int, int, int], Dict[str, Any]] = {}
101101
self.memory_pool = config.cuda_graph_mem_pool
102-
self.padding_dummy_request: Optional["Request"] = None
102+
self.padding_dummies: Dict[int, "Request"] = {
103+
} # draft_len -> dummy_request
103104

104105
self.shared_static_tensors: Dict[str, torch.Tensor] = {}
105106
if self.enabled:
@@ -166,6 +167,7 @@ def maybe_get_cuda_graph(
166167
batch: ScheduledRequests,
167168
iter_counter: int,
168169
enable_spec_decode: bool,
170+
runtime_draft_len: int,
169171
attn_metadata: Any,
170172
spec_metadata: Optional[Any] = None,
171173
draft_tokens_cuda: Optional[torch.Tensor] = None,
@@ -372,26 +374,31 @@ def _get_padded_batch(self, batch: ScheduledRequests,
372374
# No padding if it would create too many concurrent requests.
373375
# This is not strictly required, but we should probably
374376
# respect the requirement just in case that changes in the future.
375-
if self.padding_dummy_request is None:
377+
if runtime_draft_len not in self.padding_dummies:
376378
available_blocks = kv_cache_manager.get_num_free_blocks()
377379
# No padding if not enough KV cache space
378380
if available_blocks < 1:
379381
return 0
380-
381-
self.padding_dummy_request = kv_cache_manager.add_dummy_requests(
382-
[CUDA_GRAPH_DUMMY_REQUEST_ID],
382+
# Create dummy for this specific draft_len (happens once per unique draft_len)
383+
# Use unique request ID per draft_len to avoid conflicts
384+
dummy_req_id = CUDA_GRAPH_DUMMY_REQUEST_ID - runtime_draft_len
385+
dummy = kv_cache_manager.add_dummy_requests(
386+
[dummy_req_id],
383387
is_gen=True,
384388
max_num_draft_tokens=runtime_draft_len,
385389
use_mrope=self.config.use_mrope,
386390
max_beam_width=self.config.max_beam_width)[0]
387-
self.padding_dummy_request.is_cuda_graph_dummy = True
391+
dummy.is_cuda_graph_dummy = True
388392
spec_res_mgr = resource_manager.get_resource_manager(
389393
ResourceManagerType.SPEC_RESOURCE_MANAGER)
390394
if spec_res_mgr:
391-
spec_res_mgr.add_dummy_requests([CUDA_GRAPH_DUMMY_REQUEST_ID])
395+
spec_res_mgr.add_dummy_requests([dummy_req_id])
396+
397+
# Store for reuse
398+
self.padding_dummies[runtime_draft_len] = dummy
392399

393-
batch.generation_requests.extend([self.padding_dummy_request] *
394-
padding_size)
400+
padding_dummy = self.padding_dummies[runtime_draft_len]
401+
batch.generation_requests.extend([padding_dummy] * padding_size)
395402
return padding_size
396403

397404
def _round_up_batch_size(self, batch_size: int) -> int:
@@ -426,7 +433,7 @@ def clear(self):
426433
self.graphs.clear()
427434
self.graph_outputs.clear()
428435
self.graph_metadata.clear()
429-
self.padding_dummy_request = None
436+
self.padding_dummies.clear()
430437
del self.memory_pool
431438
self.memory_pool = None
432439
torch.cuda.empty_cache()

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 61 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,42 @@ def _run_cuda_graph_warmup(self, resource_manager: ResourceManager):
658658
self._capture_generation_cuda_graphs(resource_manager)
659659
self._capture_piecewise_cuda_graphs(resource_manager)
660660

661+
def _graphs_for_dynamic_draft_length(self):
662+
"""
663+
Compute the set of (batch_size, draft_len) pairs that are actually reachable.
664+
Used in dynamic draft length feature.
665+
"""
666+
graphs_to_capture = []
667+
schedule_thresholds = sorted(self.spec_config.draft_len_schedule.keys())
668+
669+
# Only iterate over actual CUDA graph batch sizes, not all possible batch sizes
670+
for graph_bs in self._cuda_graph_batch_sizes:
671+
idx = bisect.bisect_right(schedule_thresholds, graph_bs)
672+
if idx == 0:
673+
draft_len = 0 # Defensive
674+
else:
675+
draft_len = self.spec_config.draft_len_schedule[
676+
schedule_thresholds[idx - 1]]
677+
678+
graphs_to_capture.append((graph_bs, draft_len))
679+
680+
return list(
681+
set(graphs_to_capture)) # Use set to remove duplicates if any
682+
683+
# def _round_up_to_graph_size(self, actual_bs: int) -> int:
684+
# """Round up actual batch size to nearest CUDA graph batch size using binary search."""
685+
# if not self._cuda_graph_batch_sizes:
686+
# return 0
687+
688+
# idx = bisect.bisect_left(self._cuda_graph_batch_sizes, actual_bs)
689+
690+
# # If exact match or idx points to next larger size
691+
# if idx < len(self._cuda_graph_batch_sizes):
692+
# return self._cuda_graph_batch_sizes[idx]
693+
694+
# # actual_bs is larger than all available sizes
695+
# return self._cuda_graph_batch_sizes[-1]
696+
661697
def _capture_generation_cuda_graphs(self,
662698
resource_manager: ResourceManager):
663699
"""Captures CUDA graphs for pure generation steps."""
@@ -674,38 +710,48 @@ def _capture_generation_cuda_graphs(self,
674710
cuda_graph_batch_sizes = sorted(self._cuda_graph_batch_sizes,
675711
reverse=True)
676712
# Create CUDA graphs for different draft lengths
677-
draft_lengths = []
713+
# draft_lengths = []
678714
if self.is_draft_model:
679715
if self.model_is_wrapped and self.is_spec_decode and spec_resource_manager is not None and isinstance(
680716
spec_resource_manager, Eagle3ResourceManager):
681717
# The CDL path uses draft_len > 0 for the number of iterations in the drafting loop.
682-
draft_lengths.append(self.original_max_total_draft_tokens)
718+
draft_len = self.original_max_total_draft_tokens
683719
else:
684-
draft_lengths.append(self.max_total_draft_tokens)
720+
draft_len = self.max_total_draft_tokens
721+
graphs_to_capture = [(bs, draft_len)
722+
for bs in cuda_graph_batch_sizes]
723+
elif (self.spec_config
724+
and hasattr(self.spec_config, 'draft_len_schedule')
725+
and self.spec_config.draft_len_schedule is not None):
726+
# target model with draft_len_schedule: compute exact reachable set
727+
graphs_to_capture = self._graphs_for_dynamic_draft_length()
685728
else:
686729
# For non-draft model, we also capture the CUDA graph instance for draft length 0,
687730
# so that when we disable spec decode at runtime, we can still run the captured graph.
688731
# Note that for one engine mode, we are not able to turn off spec decode at runtime.
732+
graphs_to_capture = []
689733
if (self.max_total_draft_tokens > 0
690734
and not self.spec_config.spec_dec_mode.use_one_engine()
691735
# Assume that speculation is always on if the user didn't give us a max_concurrency
692736
# value. This will save on memory.
693737
and self.spec_config.max_concurrency is not None):
694-
draft_lengths.append(0)
695-
draft_lengths = [self.max_total_draft_tokens]
738+
graphs_to_capture.extend([(bs, 0)
739+
for bs in cuda_graph_batch_sizes])
740+
else:
741+
graphs_to_capture.extend([(bs, self.max_total_draft_tokens)
742+
for bs in cuda_graph_batch_sizes])
696743

697-
for bs in cuda_graph_batch_sizes:
744+
graphs_to_capture = sorted(graphs_to_capture, reverse=True)
745+
for bs, draft_len in graphs_to_capture:
698746
if bs > self.batch_size:
699747
continue
700-
701-
for draft_len in draft_lengths:
702-
warmup_request = self._create_cuda_graph_warmup_request(
703-
resource_manager, bs, draft_len)
704-
with self._release_batch_context(warmup_request,
705-
resource_manager) as batch:
706-
if batch is None:
707-
# No KV cache space, cannot continue capturing graphs
708-
return
748+
warmup_request = self._create_cuda_graph_warmup_request(
749+
resource_manager, bs, draft_len)
750+
with self._release_batch_context(warmup_request,
751+
resource_manager) as batch:
752+
if batch is None:
753+
# No KV cache space, cannot continue capturing graphs
754+
return
709755

710756
logger.info(
711757
f"Run generation-only CUDA graph warmup for batch size={bs}, draft_len={draft_len}"

tensorrt_llm/_torch/speculative/drafter.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,16 +68,24 @@ def should_use_spec_decode(self, requests: List[LlmRequest],
6868
def pad_draft_tokens_for_cuda_graph(
6969
self, scheduled_requests: ScheduledRequests) -> None:
7070
"""
71-
Pad draft tokens to the static max total draft tokens for CUDA graph compatibility.
71+
Pad draft tokens for CUDA graph compatibility.
72+
CUDA graphs require all requests in a batch to have the same tensor shape.
73+
Individual requests may generate fewer draft tokens (e.g., NGram mismatches,
74+
early stopping), but all must be padded to the same length.
7275
7376
Args:
7477
scheduled_requests: The scheduled requests to pad
7578
"""
7679
for req in scheduled_requests.generation_requests:
7780
num_draft_tokens = get_draft_token_length(req)
78-
req.py_draft_tokens.extend(
79-
0 for _ in range(self._static_max_total_draft_tokens -
80-
num_draft_tokens))
81+
if self.draft_len_schedule is not None:
82+
# Pad to current iteration's (dynamic) max_draft_tokens if dynamic draft length is enabled
83+
target_len = self.max_total_draft_tokens
84+
else:
85+
target_len = self._static_max_total_draft_tokens
86+
if num_draft_tokens < target_len:
87+
req.py_draft_tokens.extend(
88+
0 for _ in range(target_len - num_draft_tokens))
8189

8290
def get_draft_len_for_batch_size(self, batch_size: int) -> int:
8391
"""

0 commit comments

Comments
 (0)