Add wrapper class of tensor

HuiGao-NV · HuiGao-NV · commit 2191b8f270d3 · 2025-10-28T08:24:45.000Z
Support sharing more buffers

Signed-off-by: Hui Gao <huig@nvidia.com>
diff --git a/tensorrt_llm/_torch/attention_backend/flashinfer.py b/tensorrt_llm/_torch/attention_backend/flashinfer.py
@@ -124,14 +124,20 @@ def positions(self) -> torch.Tensor:
 
     def __post_init__(self) -> None:
         super().__post_init__()
+        self._post_init_with_buffers(self.cuda_graph_buffers)
+
+    def _post_init_with_buffers(self, buffers) -> None:
+        capture_graph = torch.cuda.is_current_stream_capturing()
 
         if self.workspace_buffer is None:
             # Note: even though flashinfer only recommends 128 MB, we have to push it
             # a bit higher to cover all possible CUDA graph cases. If it's too small,
             # warmup will crash.
-            self.workspace_buffer = torch.empty(320 * 1024 * 1024,
-                                                dtype=torch.uint8,
-                                                device="cuda")
+            self.workspace_buffer = self.get_empty(
+                buffers, (320 * 1024 * 1024, ),
+                dtype=torch.uint8,
+                cache_name="workspace_buffer",
+                capture_graph=capture_graph)
 
         self.paged_kv_indptr_decode = torch.empty((self.max_num_requests + 1, ),
                                                   device='cuda',
@@ -163,9 +169,11 @@ def __post_init__(self) -> None:
 
         if self.kv_cache_manager is not None:
             max_num_pages = self.kv_cache_manager.blocks_in_primary_pool
-            self._paged_kv_indices = torch.empty((max_num_pages, ),
-                                                 device='cuda',
-                                                 dtype=torch.int)
+            self._paged_kv_indices = self.get_empty(
+                buffers, (max_num_pages, ),
+                dtype=torch.int,
+                cache_name="_paged_kv_indices",
+                capture_graph=capture_graph)
 
     def create_cuda_graph_metadata(self,
                                    max_batch_size: int,
diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py
@@ -349,6 +349,47 @@ def update_for_spec_dec(self) -> None:
         Hook to be called during forward when using spec-dec one-model mode.
         """
 
+    @staticmethod
+    def get_empty(buffers,
+                  tensor_shape: list[int],
+                  dtype: torch.dtype,
+                  cache_name: str,
+                  capture_graph: bool = False) -> torch.Tensor:
+        """
+        Finds a compatible, reusable buffer from a cache or creates a new one.
+
+        This function searches for a pre-allocated tensor (buffer) that can be
+        reused for an operation involving a tensor with the shape of `tensor_shape`.
+
+        The compatibility rules are: The buffer's total elements must be >= tensor_shape's.
+
+        If a compatible buffer is found, it's returned immediately. Otherwise, a new
+        buffer is allocated on the 'cuda' device with the given properties of 'tensor_shape' and 'dtype'.
+
+        Args:
+            tensor_shape: The required shape.
+            dtype: The required dtype.
+            cache_name: The key for the specific list of buffers to search in.
+        Returns:
+            An existing compatible buffer or a newly created one.
+        """
+        if buffers is None:
+            return torch.zeros(tensor_shape, device='cuda', dtype=dtype)
+
+        return buffers.get_buffer(tensor_shape, dtype, cache_name,
+                                  capture_graph)
+
+    @staticmethod
+    def get_empty_like(buffers,
+                       like_tensor: torch.Tensor,
+                       cache_name: str,
+                       capture_graph: bool = False) -> torch.Tensor:
+        return AttentionMetadata.get_empty(buffers,
+                                           like_tensor.shape,
+                                           dtype=like_tensor.dtype,
+                                           cache_name=cache_name,
+                                           capture_graph=capture_graph)
+
 
 class PositionalEmbedder(Protocol):
     """
diff --git a/tensorrt_llm/_torch/attention_backend/sparse/dsa.py b/tensorrt_llm/_torch/attention_backend/sparse/dsa.py
@@ -286,18 +286,11 @@ def __post_init__(self):
 
         capture_graph = torch.cuda.is_current_stream_capturing()
 
-        def get_empty(tensor_shape: list[int], dtype: torch.dtype,
-                      cache_name: str) -> torch.Tensor:
-            if self.cuda_graph_buffers is None:
-                return torch.zeros(tensor_shape, device='cuda', dtype=dtype)
-            return self.cuda_graph_buffers.get_buffer(tensor_shape, dtype,
-                                                      cache_name, capture_graph)
-
         self.indexer_k_cache_block_offsets = get_empty(
             [self.max_num_sequences, self.kv_cache_manager.max_blocks_per_seq],
             cache_name="indexer_k_cache_block_offsets",
             dtype=torch.int32,
-        )
+            capture_graph=capture_graph)
         self.host_indexer_k_cache_block_offsets = torch.zeros_like(
             self.indexer_k_cache_block_offsets,
             device='cpu',
@@ -310,17 +303,16 @@ def get_empty(tensor_shape: list[int], dtype: torch.dtype,
                 (self.max_num_requests + 1, ),
                 cache_name="ctx_cached_token_indptr",
                 dtype=torch.int64,
-            )
+                capture_graph=capture_graph)
             self.host_ctx_cached_token_indptr = torch.zeros_like(
                 self.ctx_cached_token_indptr,
                 device='cpu',
                 pin_memory=True,
             )
-            self.ctx_kv_indptr = get_empty(
-                (self.max_num_requests + 1, ),
-                cache_name="ctx_kv_indptr",
-                dtype=torch.int64,
-            )
+            self.ctx_kv_indptr = get_empty((self.max_num_requests + 1, ),
+                                           cache_name="ctx_kv_indptr",
+                                           dtype=torch.int64,
+                                           capture_graph=capture_graph)
             self.host_ctx_kv_indptr = torch.zeros_like(
                 self.ctx_kv_indptr,
                 device='cpu',
@@ -331,71 +323,65 @@ def get_empty(tensor_shape: list[int], dtype: torch.dtype,
             (self.max_num_requests + 1, ),
             cache_name="gen_cached_token_indptr",
             dtype=torch.int64,
-        )
+            capture_graph=capture_graph)
         self.host_gen_cached_token_indptr = torch.zeros_like(
             self.gen_cached_token_indptr,
             device='cpu',
             pin_memory=True,
         )
-        self.gen_kv_indptr = get_empty(
-            (self.max_num_requests + 1, ),
-            cache_name="gen_kv_indptr",
-            dtype=torch.int64,
-        )
+        self.gen_kv_indptr = get_empty((self.max_num_requests + 1, ),
+                                       cache_name="gen_kv_indptr",
+                                       dtype=torch.int64,
+                                       capture_graph=capture_graph)
         self.host_gen_kv_indptr = torch.zeros_like(
             self.gen_kv_indptr,
             device='cpu',
             pin_memory=True,
         )
         # Indexer metadata
         # Separate slot mappings for non-interleaved layout (flat byte indices)
-        self.slot_mapping_fp8 = get_empty(
-            (self.max_num_tokens, ),
-            cache_name="slot_mapping_fp8",
-            dtype=torch.int64,
-        )
+        self.slot_mapping_fp8 = get_empty((self.max_num_tokens, ),
+                                          cache_name="slot_mapping_fp8",
+                                          dtype=torch.int64,
+                                          capture_graph=capture_graph)
         self.host_slot_mapping_fp8 = torch.zeros_like(
             self.slot_mapping_fp8,
             device='cpu',
             pin_memory=True,
         )
-        self.slot_mapping_scale = get_empty(
-            (self.max_num_tokens, ),
-            cache_name="slot_mapping_scale",
-            dtype=torch.int64,
-        )
+        self.slot_mapping_scale = get_empty((self.max_num_tokens, ),
+                                            cache_name="slot_mapping_scale",
+                                            dtype=torch.int64,
+                                            capture_graph=capture_graph)
         self.host_slot_mapping_scale = torch.zeros_like(
             self.slot_mapping_scale,
             device='cpu',
             pin_memory=True,
         )
         # Per-token request index buffer for topk_indices conversion
-        self.req_idx_per_token = get_empty(
-            (self.max_num_tokens, ),
-            cache_name="req_idx_per_token",
-            dtype=torch.int32,
-        )
+        self.req_idx_per_token = get_empty((self.max_num_tokens, ),
+                                           cache_name="req_idx_per_token",
+                                           dtype=torch.int32,
+                                           capture_graph=capture_graph)
         # Block table for topk_indices conversion (shared for context and generation)
         self.block_table = get_empty(
             (self.max_num_requests, self.kv_cache_manager.max_blocks_per_seq),
             cache_name="block_table",
             dtype=torch.int32,
-        )
+            capture_graph=capture_graph)
         self.scheduler_metadata_buffer = get_empty(
             (self.num_sms + 1, 2),
             cache_name="scheduler_metadata_buffer",
             dtype=torch.int32,
-        )
-        self.cu_seqlen_ks = get_empty(
-            (self.max_num_tokens, ),
-            cache_name="cu_seqlen_ks",
-            dtype=torch.int32,
-        )
-        self.cu_seqlen_ke = get_empty(
-            (self.max_num_tokens, ),
-            cache_name="cu_seqlen_ke",
-            dtype=torch.int32,
-        )
+            capture_graph=capture_graph)
+        self.cu_seqlen_ks = get_empty((self.max_num_tokens, ),
+                                      cache_name="cu_seqlen_ks",
+                                      dtype=torch.int32,
+                                      capture_graph=capture_graph)
+        self.cu_seqlen_ke = get_empty((self.max_num_tokens, ),
+                                      cache_name="cu_seqlen_ke",
+                                      dtype=torch.int32,
+                                      capture_graph=capture_graph)
 
     def prepare(self):
         super().prepare()
diff --git a/tensorrt_llm/_torch/attention_backend/sparse/rocket.py b/tensorrt_llm/_torch/attention_backend/sparse/rocket.py
@@ -35,14 +35,17 @@ def __post_init__(self):
         if self.sparse_attention_config is None:
             raise ValueError("Sparse attention config is not set")
         self.prompt_budget = self.sparse_attention_config.prompt_budget
-        self.kt_cache_block_offsets = torch.empty(
-            [
+
+        capture_graph = torch.cuda.is_current_stream_capturing()
+        self.kt_cache_block_offsets = self.get_empty(
+            self.cuda_graph_buffers, [
                 self.max_num_sequences,
                 self.kv_cache_manager.max_kt_blocks_per_seq
             ],
             dtype=torch.int32,
-            device='cuda',
-        )
+            cache_name="kt_cache_block_offsets",
+            capture_graph=capture_graph)
+
         self.host_kt_cache_block_offsets = torch.zeros_like(
             self.kt_cache_block_offsets,
             device='cpu',
diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py
@@ -641,50 +641,20 @@ def _post_init_with_buffers(self, buffers) -> None:
 
         capture_graph = torch.cuda.is_current_stream_capturing()
 
-        def get_empty(tensor_shape: list[int], dtype: torch.dtype,
-                      cache_name: str) -> torch.Tensor:
-            """
-            Finds a compatible, reusable buffer from a cache or creates a new one.
-
-            This function searches for a pre-allocated tensor (buffer) that can be
-            reused for an operation involving a tensor with the shape of `tensor_shape`.
-
-            The compatibility rules are: The buffer's total elements must be >= tensor_shape's.
-
-            If a compatible buffer is found, it's returned immediately. Otherwise, a new
-            buffer is allocated on the 'cuda' device with the give properties of 'tensor_shape' and 'dtype'.
-
-            Args:
-                tensor_shape: The required shape.
-                dtype: The required dtype.
-                cache_name: The key for the specific list of buffers to search in.
-            Returns:
-                An existing compatible buffer or a newly created one.
-            """
-            if buffers is None:
-                return torch.zeros(tensor_shape, device='cuda', dtype=dtype)
-
-            return buffers.get_buffer(tensor_shape, dtype, cache_name,
-                                      capture_graph)
-
-        def get_empty_like(like_tensor: torch.Tensor,
-                           cache_name: str) -> torch.Tensor:
-            return get_empty(like_tensor.shape,
-                             cache_name=cache_name,
-                             dtype=like_tensor.dtype)
-
-        self.prompt_lens_cuda = get_empty(
-            (self.max_num_sequences, ),
-            cache_name="prompt_lens_cuda",
-            dtype=torch.int,
-        )
+        self.prompt_lens_cuda = self.get_empty(buffers,
+                                               (self.max_num_sequences, ),
+                                               cache_name="prompt_lens_cuda",
+                                               dtype=torch.int,
+                                               capture_graph=capture_graph)
         self.prompt_lens_cpu = torch.empty_like(
             self.prompt_lens_cuda,
             device='cpu',
             pin_memory=True,
         )
-        self.kv_lens_cuda = get_empty_like(self.prompt_lens_cuda,
-                                           cache_name="kv_lens_cuda")
+        self.kv_lens_cuda = self.get_empty_like(buffers,
+                                                self.prompt_lens_cuda,
+                                                cache_name="kv_lens_cuda",
+                                                capture_graph=capture_graph)
         self.kv_lens = torch.empty_like(self.kv_lens_cuda,
                                         device='cpu',
                                         pin_memory=True)
@@ -699,14 +669,14 @@ def get_empty_like(like_tensor: torch.Tensor,
                 dtype=torch.int8,
             )
         if self.kv_cache_manager is not None:
-            self.kv_cache_block_offsets = get_empty(
-                [
+            self.kv_cache_block_offsets = self.get_empty(
+                buffers, [
                     self.kv_cache_manager.num_pools, self.max_num_sequences, 2,
                     self.kv_cache_manager.max_blocks_per_seq
                 ],
                 cache_name="kv_cache_block_offsets",
                 dtype=torch.int32,
-            )
+                capture_graph=capture_graph)
             self.host_kv_cache_block_offsets = torch.empty_like(
                 self.kv_cache_block_offsets,
                 device='cpu',
@@ -715,50 +685,50 @@ def get_empty_like(like_tensor: torch.Tensor,
             self.block_ids_per_seq = None
             self.kv_block_ids_per_seq = None
             if self.enable_flash_mla:
-                self.block_ids_per_seq = get_empty(
-                    [
+                self.block_ids_per_seq = self.get_empty(
+                    buffers, [
                         self.kv_cache_manager.max_batch_size,
                         self.kv_cache_manager.max_blocks_per_seq
                     ],
                     cache_name="block_ids_per_seq",
                     dtype=torch.int32,
-                )
-                self.kv_block_ids_per_seq = get_empty(
-                    [
+                    capture_graph=capture_graph)
+                self.kv_block_ids_per_seq = self.get_empty(
+                    buffers, [
                         self.kv_cache_manager.max_batch_size,
                         self.kv_cache_manager.max_blocks_per_seq
                     ],
                     cache_name="kv_block_ids_per_seq",
                     dtype=torch.int32,
-                )
+                    capture_graph=capture_graph)
             if self.enable_context_mla_with_cached_kv:
                 # for kv cache reuse/chunked context in MLA
-                self.ctx_cached_token_indptr = get_empty(
-                    (self.max_num_requests + 1, ),
+                self.ctx_cached_token_indptr = self.get_empty(
+                    buffers, (self.max_num_requests + 1, ),
                     cache_name="ctx_cached_token_indptr",
                     dtype=torch.int64,
-                )
+                    capture_graph=capture_graph)
                 self.host_ctx_cached_token_indptr = torch.zeros_like(
                     self.ctx_cached_token_indptr,
                     device='cpu',
                     pin_memory=True,
                 )
-                self.ctx_uncached_token_indptr = get_empty(
-                    (self.max_num_requests + 1, ),
+                self.ctx_uncached_token_indptr = self.get_empty(
+                    buffers, (self.max_num_requests + 1, ),
                     cache_name="ctx_uncached_token_indptr",
                     dtype=torch.int64,
-                )
+                    capture_graph=capture_graph)
                 self.host_ctx_uncached_token_indptr = torch.zeros_like(
                     self.ctx_uncached_token_indptr,
                     device='cpu',
                     pin_memory=True,
                 )
                 # context full seqlens include cached tokens and uncached tokens
-                self.ctx_kv_indptr = get_empty(
-                    (self.max_num_requests + 1, ),
+                self.ctx_kv_indptr = self.get_empty(
+                    buffers, (self.max_num_requests + 1, ),
                     cache_name="ctx_kv_indptr",
                     dtype=torch.int64,
-                )
+                    capture_graph=capture_graph)
                 self.host_ctx_kv_indptr = torch.zeros_like(
                     self.ctx_kv_indptr,
                     device='cpu',
diff --git a/tensorrt_llm/_torch/memory_buffer_utils.py b/tensorrt_llm/_torch/memory_buffer_utils.py
diff --git a/tensorrt_llm/_torch/modules/fused_moe/ops/moe_op_deepgemm.py b/tensorrt_llm/_torch/modules/fused_moe/ops/moe_op_deepgemm.py