Skip to content

Commit 7dbecf7

Browse files
authored
[TRTLLM-4923][feat] Enable CUDA graphs for Nemotron-H (NVIDIA#5646)
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
1 parent 3c9dd5c commit 7dbecf7

File tree

3 files changed

+89
-20
lines changed

3 files changed

+89
-20
lines changed

tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -163,15 +163,8 @@ def forward(
163163
seqlen_split_size = [num_prefill_tokens, num_decode_tokens]
164164
batch_split_size = [num_prefills, num_decodes]
165165

166-
state_indices = attn_metadata.kv_cache_manager.get_state_indices()
167-
168-
# warm up does not prepare resources, so no relevant state indices
169-
is_warmup = state_indices.numel() == 0
170-
if is_warmup:
171-
# in this case, assume batch takes first indices in mamba cache
172-
state_indices = torch.arange(num_prefills + num_decodes,
173-
device=state_indices.device,
174-
dtype=state_indices.dtype)
166+
state_indices = attn_metadata.kv_cache_manager.get_state_indices(
167+
)[:num_prefills + num_decodes]
175168

176169
state_indices_p, state_indices_d = torch.split(state_indices,
177170
batch_split_size)

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -812,7 +812,7 @@ def __init__(
812812
self.mamba_cache_index: Dict[int, int] = {}
813813

814814
# mamba cache state indices
815-
self.state_indices: torch.Tensor = torch.tensor([],
815+
self.state_indices: torch.Tensor = torch.arange(max_batch_size,
816816
device=device,
817817
dtype=torch.int32)
818818

@@ -829,9 +829,8 @@ def prepare_mamba_cache_blocks(self, request_ids: List[int]):
829829
block = self.mamba_cache_free_blocks.pop()
830830
self.mamba_cache_index[r] = block
831831
state_indices.append(block)
832-
self.state_indices = torch.as_tensor(state_indices,
833-
dtype=torch.int32,
834-
device=self.ssm_states.device)
832+
self.state_indices[:len(state_indices)] = torch.as_tensor(
833+
state_indices, dtype=torch.int32, device=self.ssm_states.device)
835834

836835
def free_mamba_cache_blocks(self, request_id: int):
837836
if request_id in self.mamba_cache_index:

tests/unittest/_torch/modeling/test_modeling_nemotron_h.py

Lines changed: 84 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from tensorrt_llm import LLM
66
from tensorrt_llm.llmapi import KvCacheConfig
77
from tensorrt_llm.llmapi.llm import RequestOutput
8+
from tensorrt_llm.llmapi.llm_args import CudaGraphConfig
89
from tensorrt_llm.sampling_params import SamplingParams
910

1011

@@ -28,25 +29,36 @@ def extract_decode_logprobs(result: RequestOutput,
2829
return get_logprobs(token_ids, logits)
2930

3031

32+
def create_nemotron_h_llm(use_cuda_graph, disable_overlap_scheduler,
33+
max_batch_size):
34+
"""Create an LLM configured with the given CUDA graph, overlap scheduler, and max batch size settings"""
35+
model_dir = f"{llm_models_root(check=True)}/Nemotron-H-8B-Base-8K"
36+
return LLM(
37+
model=model_dir,
38+
tensor_parallel_size=1,
39+
max_batch_size=max_batch_size,
40+
cuda_graph_config=CudaGraphConfig() if use_cuda_graph else None,
41+
disable_overlap_scheduler=disable_overlap_scheduler,
42+
kv_cache_config=KvCacheConfig(enable_block_reuse=False),
43+
enable_trtllm_sampler=True,
44+
)
45+
46+
3147
@skip_gpu_memory_less_than(
3248
(2 * 8 + 1) * 2**30) # 8B, bf16, plus 1 GB for good measure
3349
def test_nemotron_h_correctness():
3450
# This test is close to memory limit on A30 (with 24GB), so empty cache first
3551
torch.cuda.empty_cache()
3652

37-
model_dir = f"{llm_models_root(check=True)}/Nemotron-H-8B-Base-8K"
3853
text_prompts = [
3954
"The future of AI is",
4055
"The president of the United States is",
4156
]
4257
num_prompts = len(text_prompts)
4358

44-
nemotron_h = LLM(
45-
model=model_dir,
46-
max_batch_size=num_prompts,
47-
kv_cache_config=KvCacheConfig(enable_block_reuse=False),
48-
enable_trtllm_sampler=True,
49-
)
59+
nemotron_h = create_nemotron_h_llm(use_cuda_graph=False,
60+
disable_overlap_scheduler=False,
61+
max_batch_size=num_prompts)
5062

5163
expected_completions = [
5264
" bright, with endless possibilities for innovation and growth",
@@ -223,3 +235,68 @@ def test_nemotron_h_correctness():
223235

224236
finally:
225237
nemotron_h.shutdown()
238+
239+
240+
def test_nemotron_h_cuda_graph_overlap_scheduler():
241+
prompts = [
242+
"Tell me something I don't know about the future of AI",
243+
"The president of the United States is",
244+
"The capital of France is",
245+
"Hello, this is a beautiful day and I'm eager to start my day and",
246+
]
247+
sampling_config = SamplingParams(max_tokens=12,
248+
temperature=0.0,
249+
return_generation_logits=True)
250+
251+
# Baseline: CUDA graphs disabled, overlap scheduler disabled
252+
with create_nemotron_h_llm(use_cuda_graph=False,
253+
disable_overlap_scheduler=True,
254+
max_batch_size=16) as llm:
255+
outputs_no_cg_no_overlap = llm.generate(prompts,
256+
sampling_params=sampling_config,
257+
use_tqdm=True)
258+
259+
# CUDA graphs enabled, overlap scheduler disabled
260+
with create_nemotron_h_llm(use_cuda_graph=True,
261+
disable_overlap_scheduler=True,
262+
max_batch_size=16) as llm:
263+
outputs_with_cg_no_overlap = llm.generate(
264+
prompts, sampling_params=sampling_config, use_tqdm=True)
265+
266+
# CUDA graphs enabled, overlap scheduler enabled
267+
with create_nemotron_h_llm(use_cuda_graph=True,
268+
disable_overlap_scheduler=False,
269+
max_batch_size=16) as llm:
270+
outputs_with_cg_with_overlap = llm.generate(
271+
prompts, sampling_params=sampling_config, use_tqdm=True)
272+
273+
# Verify outputs are consistent
274+
for (no_cg_no_overlap, with_cg_no_overlap,
275+
with_cg_with_overlap) in zip(outputs_no_cg_no_overlap,
276+
outputs_with_cg_no_overlap,
277+
outputs_with_cg_with_overlap):
278+
279+
assert (no_cg_no_overlap.outputs[0].text ==
280+
with_cg_no_overlap.outputs[0].text)
281+
assert (with_cg_no_overlap.outputs[0].text ==
282+
with_cg_with_overlap.outputs[0].text)
283+
284+
# As in other unit tests comparing runs with and without CUDA graphs, compare the logits of the first generation step (i.e. the 2nd generated token)
285+
torch.testing.assert_close(
286+
no_cg_no_overlap.outputs[0].generation_logits[1, :],
287+
with_cg_no_overlap.outputs[0].generation_logits[1, :],
288+
atol=0.2,
289+
rtol=0.2)
290+
291+
# compare logprobs of all generated tokens
292+
torch.testing.assert_close(extract_decode_logprobs(no_cg_no_overlap),
293+
extract_decode_logprobs(with_cg_no_overlap),
294+
atol=0.2,
295+
rtol=0.2)
296+
297+
# overlap scheduler should have no effect on all logits - low tolerance
298+
torch.testing.assert_close(
299+
with_cg_no_overlap.outputs[0].generation_logits,
300+
with_cg_with_overlap.outputs[0].generation_logits,
301+
atol=0.05,
302+
rtol=0.05)

0 commit comments

Comments
 (0)