Use correct memory pool

HuiGao-NV · HuiGao-NV · commit 729bf8fefa2b · 2025-11-17T01:27:49.000Z
Signed-off-by: Hui Gao <huig@nvidia.com>
diff --git a/tensorrt_llm/_torch/compilation/backend.py b/tensorrt_llm/_torch/compilation/backend.py
@@ -23,6 +23,7 @@
 class Backend:
 
     _custom_pass_instances: List[PatternMatcherPass] = None
+    _graph_pool: torch.cuda.MemPool = None
     _graph_pool_handle: tuple[int, int] = None
 
     # Following classes are used to let weakref ref the stream and eventlist objects.
@@ -58,7 +59,8 @@ def __init__(
         inductor_config.enable_auto_functionalized_v2 = False
 
         if Backend._graph_pool_handle is None:
-            Backend._graph_pool_handle = torch.cuda.graph_pool_handle()
+            Backend._graph_pool = torch.cuda.MemPool()
+            Backend._graph_pool_handle = Backend._graph_pool.id
 
         self.match_count = []
 
diff --git a/tensorrt_llm/_torch/memory_buffer_utils.py b/tensorrt_llm/_torch/memory_buffer_utils.py
@@ -57,27 +57,28 @@ def get_buffer(self, tensor_shape: list[int], dtype: torch.dtype,
 
         candidate_blocks = self.buffers.get(buffer_name, [])
 
-        # Find the best-fit available buffer.
-        best_fit_block: Optional[BufferBlock] = None
-        smallest_sufficient_size = float('inf')
-        for block in candidate_blocks:
-            # Skip buffers that are too small.
-            if block.buffer.numel() < required_memory_size:
-                continue
-
-            # Find the smallest buffer that is still large enough (best-fit).
-            if block.buffer.numel() < smallest_sufficient_size:
-                # Use reserved block if find one.
-                if best_fit_block is not None and best_fit_block.is_reserved and not block.is_reserved:
+        if reserve_buffer:
+            # Find the best-fit available buffer.
+            best_fit_block: Optional[BufferBlock] = None
+            smallest_sufficient_size = float('inf')
+            for block in candidate_blocks:
+                # Skip buffers that are too small.
+                if block.buffer.numel() < required_memory_size:
                     continue
 
-                best_fit_block = block
-                smallest_sufficient_size = block.buffer.numel()
+                # Find the smallest buffer that is still large enough (best-fit).
+                if block.buffer.numel() < smallest_sufficient_size:
+                    # Use reserved block if find one.
+                    if best_fit_block is not None and best_fit_block.is_reserved and not block.is_reserved:
+                        continue
 
-        if reserve_buffer and best_fit_block is not None:
-            # A suitable buffer was found, so reuse it.
-            best_fit_block.is_reserved = True
-            return self._view_as(best_fit_block.buffer, tensor_shape, dtype)
+                    best_fit_block = block
+                    smallest_sufficient_size = block.buffer.numel()
+
+            if best_fit_block is not None:
+                # A suitable buffer was found, so reuse it.
+                best_fit_block.is_reserved = True
+                return self._view_as(best_fit_block.buffer, tensor_shape, dtype)
 
         for block in list(candidate_blocks):
             if not block.is_reserved:
@@ -88,22 +89,27 @@ def get_buffer(self, tensor_shape: list[int], dtype: torch.dtype,
 
         # No suitable buffer was found, so allocate a new one.
         # The new buffer is created with uint8 to represent raw bytes.
+        def _create_buffer():
+            return torch.zeros((required_memory_size, ),
+                               device='cuda',
+                               dtype=torch.uint8)
+
         new_buffer_tensor = None
         try:
-            with torch.cuda.memory.use_mem_pool(get_shared_pool()):
-                new_buffer_tensor = torch.zeros((required_memory_size, ),
-                                                device='cuda',
-                                                dtype=torch.uint8)
+            mem_pool = get_shared_pool()
+            if mem_pool is not None:
+                with torch.cuda.memory.use_mem_pool(mem_pool):
+                    new_buffer_tensor = _create_buffer()
+            else:
+                new_buffer_tensor = _create_buffer()
         except Exception as ex:
             # Need to check if this is an OOM exception
             logger.debug(
                 f"Exception happened to create tensor from given memory pool: {str(ex)}"
             )
             # if exception happens during allocating memory from shared pool, retry
             # to allocate from default pool
-            new_buffer_tensor = torch.zeros((required_memory_size, ),
-                                            device='cuda',
-                                            dtype=torch.uint8)
+            new_buffer_tensor = _create_buffer()
 
         new_block = BufferBlock(buffer=new_buffer_tensor,
                                 is_reserved=reserve_buffer)
diff --git a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
@@ -61,7 +61,7 @@ class CUDAGraphRunnerConfig:
     max_beam_width: int
     max_num_tokens: int
     spec_config: Optional[DecodingBaseConfig]
-    cuda_graph_mem_pool: Any
+    cuda_graph_mem_pool: torch.cuda.MemPool
     use_mrope: bool
     original_max_draft_len: int
     original_max_total_draft_tokens: int
@@ -98,7 +98,9 @@ def __init__(self, config: CUDAGraphRunnerConfig):
         self.graph_outputs: Dict[Tuple[int, int, int],
                                  Callable[[], Optional[torch.Tensor]]] = {}
         self.graph_metadata: Dict[Tuple[int, int, int], Dict[str, Any]] = {}
-        self.memory_pool = config.cuda_graph_mem_pool
+        self.memory_pool = config.cuda_graph_mem_pool if config.cuda_graph_mem_pool else torch.cuda.MemPool(
+        )
+        self.memory_pool_handle = self.memory_pool.id
         self.padding_dummy_request: Optional["Request"] = None
 
         self.shared_static_tensors: Dict[str, torch.Tensor] = {}
@@ -293,15 +295,14 @@ def _setup_spec_decoding_and_forward(key: Tuple[int, int, int],
                 if postprocess_fn is not None:
                     postprocess_fn(capture_inputs)
 
-            with torch.cuda.graph(graph, pool=self.memory_pool):
+            with torch.cuda.graph(graph, pool=self.memory_pool_handle):
                 output = _setup_spec_decoding_and_forward(
                     key, forward_fn, capture_inputs)
             if postprocess_fn is not None:
                 postprocess_fn(capture_inputs)
 
         self.graphs[key] = graph
         self.graph_outputs[key] = make_weak_ref(output)
-        self.memory_pool = graph.pool()
 
     def replay(self, key: Tuple[int, int, int],
                current_inputs: Dict[str, Any]) -> Optional[torch.Tensor]:
@@ -427,6 +428,6 @@ def clear(self):
         self.graph_outputs.clear()
         self.graph_metadata.clear()
         self.padding_dummy_request = None
-        del self.memory_pool
-        self.memory_pool = None
+        del self.memory_pool_handle
+        self.memory_pool_handle = None
         torch.cuda.empty_cache()
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -340,7 +340,8 @@ def __init__(
         # the model engine.
         self.attn_metadata = None
         self.iter_states = {}
-        self._cuda_graph_mem_pool = self._torch_compile_backend._graph_pool_handle if self._torch_compile_enabled else None
+        self._cuda_graph_mem_pool = self._torch_compile_backend._graph_pool if self._torch_compile_enabled else None
+        self._cuda_graph_mem_pool_handle = self._cuda_graph_mem_pool.id if self._cuda_graph_mem_pool else None
 
         self._cuda_graph_padding_enabled = cuda_graph_padding_enabled
 
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -43,7 +43,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestQwen3NextThinking::test_auto_dtype[tp4ep4]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] ISOLATION
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8]
@@ -187,7 +187,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True]
   - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]
   - accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] ISOLATION
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -374,8 +374,6 @@ accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4
 accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5640697)
 accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697)
 accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype SKIP (https://nvbugs/5644187)
-accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (https://nvbugs/5644632)
-accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP (https://nvbugs/5644632)
 test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image] SKIP (https://nvbugs/5644190)
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560)
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560)