
Commit 7bc2d9e

[https://nvbugs/5537878][fix] Reserve an extra slot for padded batch (#7998)

Authored by ziyixiong-nv
Signed-off-by: ziyixiong-nv <219238287+ziyixiong-nv@users.noreply.github.com>
1 parent d821524 · commit 7bc2d9e

6 files changed: +73 −8 lines
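For context, a simplified sketch of why one extra slot is needed. The numbers mirror the new test at the bottom of this commit; the exact allocation timeline in the executor is more involved than shown here.

# Simplified picture, not the real executor loop.
max_batch_size = 4                 # slot pool was previously sized to exactly this
cuda_graph_batch_sizes = [1, 2, 4]

# 3 real requests get padded up to the nearest captured graph size (4), so one
# dummy request is appended, and that dummy reserves a slot of its own.
active_requests = 3
padded_size = min(s for s in cuda_graph_batch_sizes if s >= active_requests)   # -> 4
num_padding = padded_size - active_requests                                    # -> 1

# The dummy request holds on to its slot, so a pool sized to max_batch_size can
# run dry once max_batch_size real requests are in flight alongside it.
required_slots = max_batch_size + 1                                            # -> 5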

tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py

Lines changed: 1 addition & 1 deletion

@@ -344,7 +344,7 @@ def _get_padded_batch(self, batch: ScheduledRequests,
             self.padding_dummy_request = kv_cache_manager.add_dummy_requests(
                 [CUDA_GRAPH_DUMMY_REQUEST_ID],
                 is_gen=True,
-                max_num_draft_tokens=engine.max_draft_len,
+                max_num_draft_tokens=engine.runtime_draft_len,
                 use_mrope=engine.use_mrope,
                 max_beam_width=engine.max_beam_width)[0]
             self.padding_dummy_request.is_cuda_graph_dummy = True
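A minimal sketch of the padding flow around this hunk. The class and method names below are invented for illustration; only add_dummy_requests, CUDA_GRAPH_DUMMY_REQUEST_ID and runtime_draft_len come from the diff, and the switch to runtime_draft_len presumably makes the dummy request match the draft length actually in effect at runtime rather than the configured maximum.

# Hypothetical sketch, not the real CUDA graph runner code: the dummy padding
# request is created once, cached, and reused for every padded batch, which is
# why a single extra slot suffices no matter how often padding occurs.
class PaddingSketch:
    def __init__(self, create_dummy_request):
        # create_dummy_request stands in for kv_cache_manager.add_dummy_requests(...)
        self._create_dummy_request = create_dummy_request
        self.padding_dummy_request = None

    def pad(self, batch, target_size):
        if self.padding_dummy_request is None:
            self.padding_dummy_request = self._create_dummy_request()
        # The same cached object is appended as padding; it keeps one request id
        # (CUDA_GRAPH_DUMMY_REQUEST_ID in the diff) and therefore one slot.
        return list(batch) + [self.padding_dummy_request] * (target_size - len(batch))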

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 0 additions & 2 deletions

@@ -61,8 +61,6 @@
 from .sampler import SampleStateTensors
 from .scheduler import ScheduledRequests

-MAX_UINT64 = (1 << 64) - 1
-

 class ModelEngine(ABC):

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 0 additions & 2 deletions

@@ -21,7 +21,6 @@

 from tensorrt_llm._torch.pyexecutor.resource_manager import (
     ResourceManagerType, request_context)
-from tensorrt_llm._torch.pyexecutor.seq_slot_manager import SeqSlotManager
 from tensorrt_llm._utils import (customized_gc_thresholds, global_mpi_rank,
                                  is_trace_enabled, nvtx_range, trace_func)
 from tensorrt_llm.bindings.executor import (DisServingRequestStats,

@@ -294,7 +293,6 @@ def __init__(self,
             raise NotImplementedError(
                 "Drafting is not supported for selected executor loop. "
                 "Please disable disagg/pipeline parallelism scheduler.")
-        self.draft_seq_slot_manager = SeqSlotManager(max_num_sequences)
         self.garbage_collection_gen0_threshold = garbage_collection_gen0_threshold
         self.max_seq_len = max_seq_len

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 7 additions & 0 deletions

@@ -1058,6 +1058,13 @@ def fill_slot_id_tensor(self, requests: List[LlmRequest],
             raise ValueError(f"Request {request.request_id} has no slot id")

     def add_slot(self, request_id: int):
+        if request_id in self.slot_mapping:
+            # CUDA graph dummy request could be added for different batches,
+            # but we only need to reserve slot for it once.
+            from .cuda_graph_runner import CUDA_GRAPH_DUMMY_REQUEST_ID
+            assert request_id == CUDA_GRAPH_DUMMY_REQUEST_ID
+            return self.slot_mapping[request_id]
+
         if len(self.free_slots) == 0:
             raise ValueError("No free slots")
         slot = self.free_slots.pop()
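A condensed sketch of the add_slot contract introduced above. The class below is hypothetical; the real SlotManager in resource_manager.py tracks more state than this.

# Simplified sketch of the idempotent add_slot behaviour.
class SlotManagerSketch:
    def __init__(self, max_num_slots: int):
        self.free_slots = list(range(max_num_slots))
        self.slot_mapping: dict[int, int] = {}

    def add_slot(self, request_id: int) -> int:
        if request_id in self.slot_mapping:
            # Re-adding the same id (the CUDA graph dummy request) returns the
            # slot it already owns instead of consuming another free slot.
            return self.slot_mapping[request_id]
        if not self.free_slots:
            raise ValueError("No free slots")
        slot = self.free_slots.pop()
        self.slot_mapping[request_id] = slot
        return slot

manager = SlotManagerSketch(5)
assert manager.add_slot(42) == manager.add_slot(42)  # same id, same slot, one reservation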

tensorrt_llm/_torch/speculative/eagle3.py

Lines changed: 6 additions & 3 deletions

@@ -35,7 +35,10 @@ def __init__(self, config: "EagleDecodingConfig", dtype: torch.dtype,
         self.hidden_size = hidden_size
         self.max_num_requests = max_num_requests
         self.max_seq_len = max_seq_len
-        self.slot_manager = SlotManager(max_num_requests)
+        # There could be dummy request for padding batch when using CUDA graph.
+        # Reserve one more slot for the dummy request.
+        slot_size = self.max_seq_len + 1
+        self.slot_manager = SlotManager(slot_size)
         self.max_total_draft_tokens = config.max_total_draft_tokens

         # empty hidden states tensor

@@ -46,9 +49,9 @@ def __init__(self, config: "EagleDecodingConfig", dtype: torch.dtype,
                                          dtype=self.dtype,
                                          device='cuda')
         # sequence length, only used for metadata preparation
-        self.seq_lens = {i: 0 for i in range(max_num_requests)}
+        self.seq_lens = {i: 0 for i in range(slot_size)}
         # start indices of each slot
-        self.start_indices = {i: 0 for i in range(max_num_requests)}
+        self.start_indices = {i: 0 for i in range(slot_size)}
         # whether the next draft forward is the first
         self.is_first_draft = True
         self.spec_tree_manager = None
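A short sketch of the sizing invariant these hunks maintain. The numbers are illustrative assumptions; the key point is that every per-slot structure is sized consistently with a slot pool large enough to include the dummy request.

# Illustrative sizing: the slot pool has to cover every real request plus the
# CUDA graph dummy request, and all per-slot bookkeeping must use the same size.
max_num_requests = 4
slot_size = max_num_requests + 1  # minimum size that also covers the dummy request

seq_lens = {i: 0 for i in range(slot_size)}       # per-slot sequence lengths
start_indices = {i: 0 for i in range(slot_size)}  # per-slot offsets into the hidden-states buffer
assert len(seq_lens) == len(start_indices) == slot_size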

tests/unittest/_torch/speculative/test_eagle3.py

Lines changed: 59 additions & 0 deletions

@@ -374,5 +374,64 @@ def test_multi_eagle3(use_one_model: bool):
     pass


+@pytest.mark.parametrize("disable_overlap_scheduler", [True, False])
+def test_eagle3_cuda_graph_padding(disable_overlap_scheduler: bool):
+    """Test CUDA graph padding with 3 requests and max_batch_size=4.
+
+    This test verifies that when using CUDA graph with padding enabled,
+    the system properly reserves one additional slot for the padded dummy request.
+    Without this fix, there would be errors caused by no free slot.
+    """
+    attn_backend = "TRTLLM"
+    enable_block_reuse = False
+    use_one_model = False
+    enable_chunked_prefill = False
+
+    total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+    if total_mem_gb < 35:
+        pytest.skip("Not enough memory to load target + draft model")
+
+    models_path = llm_models_root()
+    eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
+    target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"
+
+    # Test with 3 requests and max_batch_size=4 to trigger padding
+    max_batch_size = 4
+    max_draft_len = 4
+    kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
+                                    max_tokens=8192)
+    cuda_graph_config = CudaGraphConfig(batch_sizes=[1, 2, 4],
+                                        enable_padding=True)
+
+    llm_common_config = dict(
+        model=target_model_dir,
+        attn_backend=attn_backend,
+        disable_overlap_scheduler=disable_overlap_scheduler,
+        cuda_graph_config=cuda_graph_config,
+        max_batch_size=max_batch_size,
+        kv_cache_config=kv_cache_config,
+        max_seq_len=8192,
+        enable_chunked_prefill=enable_chunked_prefill,
+    )
+
+    spec_config = EagleDecodingConfig(
+        max_draft_len=max_draft_len,
+        speculative_model_dir=eagle_model_dir,
+        eagle3_one_model=use_one_model,
+    )
+
+    # Create the LLM instance
+    llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
+
+    prompts = [
+        "The capital of France is", "The president of the United States is",
+        "The future of AI is"
+    ]
+
+    sampling_params = SamplingParams(max_tokens=20, temperature=0)
+    llm_spec.generate(prompts, sampling_params)
+    llm_spec.shutdown()
+
+
 if __name__ == "__main__":
     unittest.main()
