Skip to content

Commit 876dd68

Browse files
committed
Reuse tensor
Signed-off-by: Hui Gao <huig@nvidia.com>
1 parent e56397d commit 876dd68

File tree

4 files changed

+54
-28
lines changed

4 files changed

+54
-28
lines changed

tensorrt_llm/_torch/compilation/backend.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
class Backend:
2525

2626
_custom_pass_instances: List[PatternMatcherPass] = None
27+
_graph_pool: torch.cuda.MemPool = None
2728
_graph_pool_handle: tuple[int, int] = None
2829

2930
# Following classes are used to let weakref ref the stream and eventlist objects.
@@ -60,7 +61,8 @@ def __init__(
6061
inductor_config.enable_auto_functionalized_v2 = False
6162

6263
if Backend._graph_pool_handle is None:
63-
Backend._graph_pool_handle = torch.cuda.graph_pool_handle()
64+
Backend._graph_pool = torch.cuda.MemPool()
65+
Backend._graph_pool_handle = Backend._graph_pool.id
6466

6567
self.match_count = []
6668

tensorrt_llm/_torch/memory_buffer_utils.py

Lines changed: 37 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -79,40 +79,55 @@ def get_buffer(self, tensor_shape: list[int], dtype: torch.dtype,
7979
best_fit_block = block
8080
smallest_sufficient_size = block.buffer.numel()
8181

82+
for block in list(candidate_blocks):
83+
if not block.is_reserved:
84+
if best_fit_block is not None:
85+
if block is not best_fit_block:
86+
# Need to call del BufferBlock.buffer, otherwise memory isn't
87+
# released and OOM may happen.
88+
del block.buffer
89+
candidate_blocks.remove(block)
90+
else:
91+
del block.buffer
92+
candidate_blocks.remove(block)
93+
8294
if best_fit_block is not None:
8395
if reserve_buffer:
96+
# A suitable buffer was found, so reuse it.
8497
best_fit_block.is_reserved = True
85-
# A suitable buffer was found, so reuse it.
86-
return self._view_as(best_fit_block.buffer, tensor_shape, dtype)
87-
88-
for block in list(candidate_blocks):
89-
if not block.is_reserved:
90-
# Need to call del BufferBlock.buffer, otherwise memory isn't
91-
# released and OOM may happen.
92-
buffer_size = block.buffer.numel()
93-
del block.buffer
94-
if buffer_size >= 1024 * 1024 * 1024:
95-
torch.cuda.empty_cache()
96-
candidate_blocks.remove(block)
98+
return self._view_as(best_fit_block.buffer, tensor_shape, dtype)
99+
else:
100+
# TODO: reuse tensors from both the graph pool and the normal pool.
101+
if best_fit_block.is_reserved:
102+
return self._view_as(best_fit_block.buffer, tensor_shape,
103+
dtype)
104+
else:
105+
del best_fit_block.buffer
106+
candidate_blocks.remove(best_fit_block)
107+
108+
def _create_buffer():
109+
return torch.zeros((required_memory_size, ),
110+
device='cuda',
111+
dtype=torch.uint8)
97112

98113
# No suitable buffer was found, so allocate a new one.
99114
# The new buffer is created with uint8 to represent raw bytes.
100115
new_buffer_tensor = None
101116
try:
102-
with torch.cuda.memory.use_mem_pool(get_shared_pool()):
103-
new_buffer_tensor = torch.empty((required_memory_size, ),
104-
device='cuda',
105-
dtype=torch.uint8)
117+
new_buffer_tensor = _create_buffer()
106118
except Exception as ex:
107-
# Need to check if this is an OOM exception
119+
# Need to check if this is an OOM exception
108120
logger.debug(
109121
f"Exception happened to create tensor from given memory pool: {str(ex)}"
110122
)
111-
# if exception happens during allocating memory from shared pool, retry
112-
# to allocate from default pool
113-
new_buffer_tensor = torch.empty((required_memory_size, ),
114-
device='cuda',
115-
dtype=torch.uint8)
123+
# If an exception occurs while allocating memory from the default pool, retry
124+
# the allocation from the shared pool, to reduce fragmentation in the shared pool.
125+
mem_pool = get_shared_pool()
126+
if mem_pool is not None:
127+
with torch.cuda.memory.use_mem_pool(mem_pool):
128+
new_buffer_tensor = _create_buffer()
129+
else:
130+
raise ex
116131

117132
new_block = BufferBlock(buffer=new_buffer_tensor,
118133
is_reserved=reserve_buffer)

tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ class CUDAGraphRunnerConfig:
6868
max_beam_width: int
6969
max_num_tokens: int
7070
spec_config: Optional[DecodingBaseConfig]
71-
cuda_graph_mem_pool: Any
71+
cuda_graph_mem_pool: torch.cuda.MemPool
7272
use_mrope: bool
7373
original_max_draft_len: int
7474
original_max_total_draft_tokens: int
@@ -107,7 +107,9 @@ def __init__(self, config: CUDAGraphRunnerConfig):
107107
self.graph_outputs: Dict[KeyType,
108108
Callable[[], Optional[torch.Tensor]]] = {}
109109
self.graph_metadata: Dict[KeyType, Dict[str, Any]] = {}
110-
self.memory_pool = config.cuda_graph_mem_pool
110+
self.memory_pool = config.cuda_graph_mem_pool if config.cuda_graph_mem_pool else torch.cuda.MemPool(
111+
)
112+
self.memory_pool_handle = self.memory_pool.id
111113
self.padding_dummy_request: Optional["Request"] = None
112114

113115
self.shared_static_tensors: Dict[str, torch.Tensor] = {}
@@ -343,6 +345,10 @@ def _setup_spec_decoding_and_forward(key: KeyType, forward_fn: Callable,
343345
capture_inputs['attn_metadata'].use_spec_decoding = True
344346
return forward_fn(capture_inputs)
345347

348+
if self.memory_pool_handle is None or self.memory_pool is None:
349+
self.memory_pool = torch.cuda.MemPool()
350+
self.memory_pool_handle = self.memory_pool.id
351+
346352
# We have to do warm up runs to initialize PyTorch's
347353
# internal states according to the docs:
348354
# https://pytorch.org/docs/stable/notes/cuda.html#cuda-graph-semantics
@@ -355,15 +361,14 @@ def _setup_spec_decoding_and_forward(key: KeyType, forward_fn: Callable,
355361
if postprocess_fn is not None:
356362
postprocess_fn(capture_inputs)
357363

358-
with torch.cuda.graph(graph, pool=self.memory_pool):
364+
with torch.cuda.graph(graph, pool=self.memory_pool_handle):
359365
output = _setup_spec_decoding_and_forward(
360366
key, forward_fn, capture_inputs)
361367
if postprocess_fn is not None:
362368
postprocess_fn(capture_inputs)
363369

364370
self.graphs[key] = graph
365371
self.graph_outputs[key] = make_weak_ref(output)
366-
self.memory_pool = graph.pool()
367372

368373
def replay(self, key: KeyType,
369374
current_inputs: Dict[str, Any]) -> Optional[torch.Tensor]:
@@ -503,6 +508,8 @@ def clear(self):
503508
self.graph_outputs.clear()
504509
self.graph_metadata.clear()
505510
self.padding_dummy_request = None
511+
del self.memory_pool_handle
512+
self.memory_pool_handle = None
506513
del self.memory_pool
507514
self.memory_pool = None
508515
torch.cuda.empty_cache()

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,9 @@ def __init__(
360360
# the model engine.
361361
self.attn_metadata = None
362362
self.iter_states = {}
363-
self._cuda_graph_mem_pool = self._torch_compile_backend._graph_pool_handle if self._torch_compile_enabled else None
363+
364+
self._cuda_graph_mem_pool = self._torch_compile_backend._graph_pool if self._torch_compile_enabled else None
365+
self._cuda_graph_mem_pool_handle = self._cuda_graph_mem_pool.id if self._cuda_graph_mem_pool else None
364366

365367
self._cuda_graph_padding_enabled = cuda_graph_padding_enabled
366368

0 commit comments

Comments
 (0)