Skip to content

Commit 0cfdfeb

Browse files
committed
Fix CI
Signed-off-by: Zheyu Fu <zheyuf@NVIDIA.com>
1 parent 459c768 commit 0cfdfeb

File tree

3 files changed

+5
-5
lines changed

3 files changed

+5
-5
lines changed

tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,6 @@ class CUDAGraphRunnerConfig:
6262
max_num_tokens: int
6363
spec_config: Optional[DecodingBaseConfig]
6464
cuda_graph_mem_pool: Any
65-
dynamic_draft_len_mapping: Optional[Dict[int, int]]
6665
use_mrope: bool
6766
original_max_draft_len: int
6867
original_max_total_draft_tokens: int
@@ -72,6 +71,7 @@ class CUDAGraphRunnerConfig:
7271
mapping: Optional[Mapping]
7372
dist: Optional[MPIDist]
7473
kv_cache_manager_key: Any
74+
dynamic_draft_len_mapping: Optional[Dict[int, int]]
7575

7676

7777
class CUDAGraphRunner:
@@ -452,7 +452,7 @@ def clear(self):
452452
self.graphs.clear()
453453
self.graph_outputs.clear()
454454
self.graph_metadata.clear()
455-
self.padding_dummy_requests = None
455+
self.padding_dummy_requests = {}
456456
del self.memory_pool
457457
self.memory_pool = None
458458
torch.cuda.empty_cache()

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -562,8 +562,8 @@ def warmup(self, resource_manager: ResourceManager) -> None:
562562
return
563563

564564
# The lifetime of model engine and kv cache manager can be different.
565-
# Reset the global cuda graph dummy request to None in warmup.
566-
self.cuda_graph_runner.padding_dummy_request = None
565+
# Reset the global cuda graph dummy requests in warmup.
566+
self.cuda_graph_runner.padding_dummy_requests = {}
567567

568568
# TODO: current warmup_request is not suitable for context parallelism.
569569
cp_type = self.mapping.cp_config.get('cp_type', None)

tests/unittest/_torch/executor/test_pytorch_model_engine.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -172,7 +172,7 @@ def test_pad_generation_requests(self) -> None:
172172
batch.context_requests = []
173173
batch.generation_requests = requests
174174
pages_before = kv_cache_manager.get_num_free_blocks()
175-
new_dummy_block = 1 if model_engine.cuda_graph_runner.padding_dummy_request is None else 0
175+
new_dummy_block = 1 if not model_engine.cuda_graph_runner.padding_dummy_requests else 0
176176
with model_engine.cuda_graph_runner.pad_batch(
177177
batch, resource_manager) as padded_batch:
178178
if batch_size < 8 and max_seq_len < 25:

0 commit comments

Comments
 (0)