
Commit 7bc2d9e

[https://nvbugs/5537878][fix] Reserve an extra slot for padded batch (#7998)

Authored by ziyixiong-nv
Signed-off-by: ziyixiong-nv <219238287+ziyixiong-nv@users.noreply.github.com>
1 parent d821524 · commit 7bc2d9e

6 files changed: +73 −8 lines
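For context, a simplified sketch of why one extra slot is needed. The numbers mirror the new test at the bottom of this commit; the exact allocation timeline in the executor is more involved than shown here.

# Simplified picture, not the real executor loop.
max_batch_size = 4                 # slot pool was previously sized to exactly this
cuda_graph_batch_sizes = [1, 2, 4]

# 3 real requests get padded up to the nearest captured graph size (4), so one
# dummy request is appended, and that dummy reserves a slot of its own.
active_requests = 3
padded_size = min(s for s in cuda_graph_batch_sizes if s >= active_requests)   # -> 4
num_padding = padded_size - active_requests                                    # -> 1

# The dummy request holds on to its slot, so a pool sized to max_batch_size can
# run dry once max_batch_size real requests are in flight alongside it.
required_slots = max_batch_size + 1                                            # -> 5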

tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py

Lines changed: 1 addition & 1 deletion

@@ -344,7 +344,7 @@ def _get_padded_batch(self, batch: ScheduledRequests,
             self.padding_dummy_request = kv_cache_manager.add_dummy_requests(
                 [CUDA_GRAPH_DUMMY_REQUEST_ID],
                 is_gen=True,
-                max_num_draft_tokens=engine.max_draft_len,
+                max_num_draft_tokens=engine.runtime_draft_len,
                 use_mrope=engine.use_mrope,
                 max_beam_width=engine.max_beam_width)[0]
             self.padding_dummy_request.is_cuda_graph_dummy = True
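A minimal sketch of the padding flow around this hunk. The class and method names below are invented for illustration; only add_dummy_requests, CUDA_GRAPH_DUMMY_REQUEST_ID and runtime_draft_len come from the diff, and the switch to runtime_draft_len presumably makes the dummy request match the draft length actually in effect at runtime rather than the configured maximum.

# Hypothetical sketch, not the real CUDA graph runner code: the dummy padding
# request is created once, cached, and reused for every padded batch, which is
# why a single extra slot suffices no matter how often padding occurs.
class PaddingSketch:
    def __init__(self, create_dummy_request):
        # create_dummy_request stands in for kv_cache_manager.add_dummy_requests(...)
        self._create_dummy_request = create_dummy_request
        self.padding_dummy_request = None

    def pad(self, batch, target_size):
        if self.padding_dummy_request is None:
            self.padding_dummy_request = self._create_dummy_request()
        # The same cached object is appended as padding; it keeps one request id
        # (CUDA_GRAPH_DUMMY_REQUEST_ID in the diff) and therefore one slot.
        return list(batch) + [self.padding_dummy_request] * (target_size - len(batch))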

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 0 additions & 2 deletions

@@ -61,8 +61,6 @@
 from .sampler import SampleStateTensors
 from .scheduler import ScheduledRequests

-MAX_UINT64 = (1 << 64) - 1
-

 class ModelEngine(ABC):

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 0 additions & 2 deletions

@@ -21,7 +21,6 @@

 from tensorrt_llm._torch.pyexecutor.resource_manager import (
     ResourceManagerType, request_context)
-from tensorrt_llm._torch.pyexecutor.seq_slot_manager import SeqSlotManager
 from tensorrt_llm._utils import (customized_gc_thresholds, global_mpi_rank,
                                  is_trace_enabled, nvtx_range, trace_func)
 from tensorrt_llm.bindings.executor import (DisServingRequestStats,

@@ -294,7 +293,6 @@ def __init__(self,
             raise NotImplementedError(
                 "Drafting is not supported for selected executor loop. "
                 "Please disable disagg/pipeline parallelism scheduler.")
-        self.draft_seq_slot_manager = SeqSlotManager(max_num_sequences)
         self.garbage_collection_gen0_threshold = garbage_collection_gen0_threshold
         self.max_seq_len = max_seq_len

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 7 additions & 0 deletions

@@ -1058,6 +1058,13 @@ def fill_slot_id_tensor(self, requests: List[LlmRequest],
             raise ValueError(f"Request {request.request_id} has no slot id")

     def add_slot(self, request_id: int):
+        if request_id in self.slot_mapping:
+            # CUDA graph dummy request could be added for different batches,
+            # but we only need to reserve slot for it once.
+            from .cuda_graph_runner import CUDA_GRAPH_DUMMY_REQUEST_ID
+            assert request_id == CUDA_GRAPH_DUMMY_REQUEST_ID
+            return self.slot_mapping[request_id]
+
         if len(self.free_slots) == 0:
             raise ValueError("No free slots")
         slot = self.free_slots.pop()
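A condensed sketch of the add_slot contract introduced above. The class below is hypothetical; the real SlotManager in resource_manager.py tracks more state than this.

# Simplified sketch of the idempotent add_slot behaviour.
class SlotManagerSketch:
    def __init__(self, max_num_slots: int):
        self.free_slots = list(range(max_num_slots))
        self.slot_mapping: dict[int, int] = {}

    def add_slot(self, request_id: int) -> int:
        if request_id in self.slot_mapping:
            # Re-adding the same id (the CUDA graph dummy request) returns the
            # slot it already owns instead of consuming another free slot.
            return self.slot_mapping[request_id]
        if not self.free_slots:
            raise ValueError("No free slots")
        slot = self.free_slots.pop()
        self.slot_mapping[request_id] = slot
        return slot

manager = SlotManagerSketch(5)
assert manager.add_slot(42) == manager.add_slot(42)  # same id, same slot, one reservation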

tensorrt_llm/_torch/speculative/eagle3.py

Lines changed: 6 additions & 3 deletions

@@ -35,7 +35,10 @@ def __init__(self, config: "EagleDecodingConfig", dtype: torch.dtype,
         self.hidden_size = hidden_size
         self.max_num_requests = max_num_requests
         self.max_seq_len = max_seq_len
-        self.slot_manager = SlotManager(max_num_requests)
+        # There could be dummy request for padding batch when using CUDA graph.
+        # Reserve one more slot for the dummy request.
+        slot_size = self.max_seq_len + 1
+        self.slot_manager = SlotManager(slot_size)
         self.max_total_draft_tokens = config.max_total_draft_tokens

         # empty hidden states tensor

@@ -46,9 +49,9 @@ def __init__(self, config: "EagleDecodingConfig", dtype: torch.dtype,
                                          dtype=self.dtype,
                                          device='cuda')
         # sequence length, only used for metadata preparation
-        self.seq_lens = {i: 0 for i in range(max_num_requests)}
+        self.seq_lens = {i: 0 for i in range(slot_size)}
         # start indices of each slot
-        self.start_indices = {i: 0 for i in range(max_num_requests)}
+        self.start_indices = {i: 0 for i in range(slot_size)}
         # whether the next draft forward is the first
         self.is_first_draft = True
         self.spec_tree_manager = None
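A short sketch of the sizing invariant these hunks maintain. The numbers are illustrative assumptions; the key point is that every per-slot structure is sized consistently with a slot pool large enough to include the dummy request.

# Illustrative sizing: the slot pool has to cover every real request plus the
# CUDA graph dummy request, and all per-slot bookkeeping must use the same size.
max_num_requests = 4
slot_size = max_num_requests + 1  # minimum size that also covers the dummy request

seq_lens = {i: 0 for i in range(slot_size)}       # per-slot sequence lengths
start_indices = {i: 0 for i in range(slot_size)}  # per-slot offsets into the hidden-states buffer
assert len(seq_lens) == len(start_indices) == slot_size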

tests/unittest/_torch/speculative/test_eagle3.py

Lines changed: 59 additions & 0 deletions

@@ -374,5 +374,64 @@ def test_multi_eagle3(use_one_model: bool):
     pass


+@pytest.mark.parametrize("disable_overlap_scheduler", [True, False])
+def test_eagle3_cuda_graph_padding(disable_overlap_scheduler: bool):
+    """Test CUDA graph padding with 3 requests and max_batch_size=4.
+
+    This test verifies that when using CUDA graph with padding enabled,
+    the system properly reserves one additional slot for the padded dummy request.
+    Without this fix, there would be errors caused by no free slot.
+    """
+    attn_backend = "TRTLLM"
+    enable_block_reuse = False
+    use_one_model = False
+    enable_chunked_prefill = False
+
+    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+    if total_mem_gb < 35:
+        pytest.skip("Not enough memory to load target + draft model")
+
+    models_path = llm_models_root()
+    eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
+    target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"
+
+    # Test with 3 requests and max_batch_size=4 to trigger padding
+    max_batch_size = 4
+    max_draft_len = 4
+    kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
+                                    max_tokens=8192)
+    cuda_graph_config = CudaGraphConfig(batch_sizes=[1, 2, 4],
+                                        enable_padding=True)
+
+    llm_common_config = dict(
+        model=target_model_dir,
+        attn_backend=attn_backend,
+        disable_overlap_scheduler=disable_overlap_scheduler,
+        cuda_graph_config=cuda_graph_config,
+        max_batch_size=max_batch_size,
+        kv_cache_config=kv_cache_config,
+        max_seq_len=8192,
+        enable_chunked_prefill=enable_chunked_prefill,
+    )
+
+    spec_config = EagleDecodingConfig(
+        max_draft_len=max_draft_len,
+        speculative_model_dir=eagle_model_dir,
+        eagle3_one_model=use_one_model,
+    )
+
+    # Create the LLM instance
+    llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
+
+    prompts = [
+        "The capital of France is", "The president of the United States is",
+        "The future of AI is"
+    ]
+
+    sampling_params = SamplingParams(max_tokens=20, temperature=0)
+    llm_spec.generate(prompts, sampling_params)
+    llm_spec.shutdown()
+
+
 if __name__ == "__main__":
     unittest.main()
