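Fix CI: reset padding_dummy_requests to an empty dict instead of None and move dynamic_draft_len_mapping to the end of CUDAGraphRunnerConfig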

zheyuf · zheyuf · commit 0cfdfeb8e9f9 · 2025-11-25T22:40:09.000-08:00
Signed-off-by: Zheyu Fu <zheyuf@NVIDIA.com>
diff --git a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
@@ -62,7 +62,6 @@ class CUDAGraphRunnerConfig:
     max_num_tokens: int
     spec_config: Optional[DecodingBaseConfig]
     cuda_graph_mem_pool: Any
-    dynamic_draft_len_mapping: Optional[Dict[int, int]]
     use_mrope: bool
     original_max_draft_len: int
     original_max_total_draft_tokens: int
@@ -72,6 +71,7 @@ class CUDAGraphRunnerConfig:
     mapping: Optional[Mapping]
     dist: Optional[MPIDist]
     kv_cache_manager_key: Any
+    dynamic_draft_len_mapping: Optional[Dict[int, int]]
 
 
 class CUDAGraphRunner:
@@ -452,7 +452,7 @@ def clear(self):
         self.graphs.clear()
         self.graph_outputs.clear()
         self.graph_metadata.clear()
-        self.padding_dummy_requests = None
+        self.padding_dummy_requests = {}
         del self.memory_pool
         self.memory_pool = None
         torch.cuda.empty_cache()
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -562,8 +562,8 @@ def warmup(self, resource_manager: ResourceManager) -> None:
             return
 
         # The lifetime of model engine and kv cache manager can be different.
-        # Reset the global cuda graph dummy request to None in warmup.
-        self.cuda_graph_runner.padding_dummy_request = None
+        # Reset the global cuda graph dummy requests in warmup.
+        self.cuda_graph_runner.padding_dummy_requests = {}
 
         # TODO: current warmup_request is not suitable for context parallelism.
         cp_type = self.mapping.cp_config.get('cp_type', None)
diff --git a/tests/unittest/_torch/executor/test_pytorch_model_engine.py b/tests/unittest/_torch/executor/test_pytorch_model_engine.py
@@ -172,7 +172,7 @@ def test_pad_generation_requests(self) -> None:
             batch.context_requests = []
             batch.generation_requests = requests
             pages_before = kv_cache_manager.get_num_free_blocks()
-            new_dummy_block = 1 if model_engine.cuda_graph_runner.padding_dummy_request is None else 0
+            new_dummy_block = 1 if not model_engine.cuda_graph_runner.padding_dummy_requests else 0
             with model_engine.cuda_graph_runner.pad_batch(
                     batch, resource_manager) as padded_batch:
                 if batch_size < 8 and max_seq_len < 25: