@@ -565,6 +565,7 @@ def is_nvfp4_output_kernel_available(
 @dataclass(kw_only=True)
 class TrtllmAttentionMetadata(AttentionMetadata):
     workspace: Optional[torch.Tensor] = None
+    cuda_graph_workspace: Optional[torch.Tensor] = None
 
     # TrtllmAttention needs to know the beam width to access the cache indirection buffer
     # when beam search is enabled.
@@ -680,6 +681,14 @@ def _post_init_with_buffers(self, buffers) -> None:
                 device='cuda',
                 dtype=torch.int8,
             )
+
+        if self.cuda_graph_workspace is None:
+            self.cuda_graph_workspace = torch.empty(
+                (0, ),
+                device='cuda',
+                dtype=torch.int8,
+            )
+
         if self.kv_cache_manager is not None:
             self.kv_cache_block_offsets = self.get_empty(
                 buffers,
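
The hunk above lazily allocates `cuda_graph_workspace` as a zero-size int8 CUDA tensor, mirroring how `workspace` is initialized in the context lines just before it, so downstream code always receives a valid tensor object rather than None. A minimal sketch of this placeholder-then-grow pattern follows; it assumes a CUDA-capable machine, and the `grow_workspace` helper is purely illustrative, not a TensorRT-LLM API:

import torch

def grow_workspace(ws: torch.Tensor, required_bytes: int) -> torch.Tensor:
    # Hypothetical helper: reallocate only when the current placeholder is too
    # small. A zero-size tensor is a valid starting point and avoids having to
    # pass None around.
    if ws.numel() < required_bytes:
        ws = torch.empty((required_bytes, ), device='cuda', dtype=torch.int8)
    return ws

workspace = torch.empty((0, ), device='cuda', dtype=torch.int8)
workspace = grow_workspace(workspace, 1 << 20)  # grow to 1 MiB on first use
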
@@ -1317,8 +1326,9 @@ def forward(
             host_kv_cache_pool_pointers=metadata.host_kv_cache_pool_pointers,
             host_kv_cache_pool_mapping=metadata.host_kv_cache_pool_mapping,
             block_ids_per_seq=metadata.block_ids_per_seq,
-            workspace=metadata.
-            workspace,  # re-enable it, if pass None to it, fp8 mla will encounter invalid cuda free issue.
+            # Always pass a real workspace tensor: passing None makes FP8 MLA hit an invalid CUDA free.
+            workspace=metadata.workspace
+            if not metadata.is_cuda_graph else metadata.cuda_graph_workspace,
             cache_indirection=metadata.cache_indirection,
             kv_scale_orig_quant=self.kv_scale_orig_quant,
             kv_scale_quant_orig=self.kv_scale_quant_orig,
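
Read in isolation, the new selection logic amounts to the sketch below. It assumes only what the diff shows (an `is_cuda_graph` flag alongside the two workspace fields); `_Metadata` and `select_workspace` are hypothetical names for illustration, not TensorRT-LLM APIs, and the stated motivation is an inference from the change, not something the diff itself documents:

from dataclasses import dataclass
from typing import Optional

import torch

@dataclass(kw_only=True)
class _Metadata:
    # Hypothetical stand-in for the relevant TrtllmAttentionMetadata fields.
    is_cuda_graph: bool = False
    workspace: Optional[torch.Tensor] = None
    cuda_graph_workspace: Optional[torch.Tensor] = None

def select_workspace(metadata: _Metadata) -> Optional[torch.Tensor]:
    # Same expression as in the diff: under CUDA graph capture/replay, use the
    # dedicated workspace, presumably so its address stays stable for the
    # lifetime of the captured graph, independent of the regular workspace.
    return (metadata.workspace
            if not metadata.is_cuda_graph else metadata.cuda_graph_workspace)
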