Commit 6437756

fix: handle OOMs during KV cache estimation (NVIDIA#4690)
Signed-off-by: ixlmar <[email protected]>
Parent: 1c3091c

3 files changed: +33 −25 lines


tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 23 additions & 18 deletions
```diff
@@ -20,6 +20,7 @@
 from ..speculative import get_spec_decoder
 from .config_utils import is_mla, is_nemotron_hybrid
 from .kv_cache_transceiver import AttentionTypeCpp, create_kv_cache_transceiver
+from .llm_request import ExecutorResponse
 from .model_engine import (DRAFT_KV_CACHE_MANAGER_KEY, KV_CACHE_MANAGER_KEY,
                            PyTorchModelEngine)
 from .py_executor import PyExecutor
@@ -203,16 +204,29 @@ def estimate_max_tokens(self, py_executor: PyExecutor) -> None:
         req_ids = py_executor.dist.broadcast(req_ids, root=0)
         py_executor.is_warmup = True
         py_executor.start_worker()
-        py_executor.await_responses(req_ids)
+        try:
+            responses = py_executor.await_responses(req_ids)
+            for response_or_list in responses:
+                response_list = [response_or_list] if isinstance(
+                    response_or_list, ExecutorResponse) else response_or_list
+                for response in response_list:
+                    if response.has_error():
+                        raise RuntimeError(response.error_msg)
+
+            torch_peak_memory = torch.cuda.memory_stats(
+            )["allocated_bytes.all.peak"]
+
+            # Clear the caching allocator before measuring the current memory usage
+            torch.cuda.empty_cache()
+            end, total_gpu_memory = torch.cuda.mem_get_info()
+            torch_used_bytes = torch.cuda.memory_stats(
+            )["allocated_bytes.all.current"]
+        finally:
+            py_executor.shutdown()
+            py_executor.is_warmup = False
+            py_executor.enable_iter_perf_stats = origin_iter_stats
+            py_executor.set_gather_responses(False)
 
-        torch_peak_memory = torch.cuda.memory_stats(
-        )["allocated_bytes.all.peak"]
-
-        # Clear the caching allocator before measuring the current memory usage
-        torch.cuda.empty_cache()
-        end, total_gpu_memory = torch.cuda.mem_get_info()
-        torch_used_bytes = torch.cuda.memory_stats(
-        )["allocated_bytes.all.current"]
         total_used_bytes = total_gpu_memory - end
         activation_bytes = torch_peak_memory - model_bytes
         extra_cost = max(total_used_bytes - torch_used_bytes, 0)
@@ -235,15 +249,6 @@ def estimate_max_tokens(self, py_executor: PyExecutor) -> None:
                                        self._max_kv_tokens_in)
 
         logger.info(f"Estimated max tokens in KV cache : {kv_cache_max_tokens}")
-
-        py_executor.resource_manager.resource_managers.get(
-            "kv_cache_manager").shutdown()
-
-        py_executor.shutdown()
-        py_executor.is_warmup = False
-        py_executor.set_gather_responses(False)
-        py_executor.enable_iter_perf_stats = origin_iter_stats
-
         executor_config.kv_cache_config.max_tokens = kv_cache_max_tokens
 
     def _create_kv_cache_manager(
```
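The change to estimate_max_tokens follows a simple pattern: the warm-up responses are checked for errors and the peak/current memory readings are taken inside a try block, while the finally block always shuts the temporary executor down and restores its warm-up state. Below is a minimal sketch of that pattern in isolation; the Executor and Response classes are hypothetical stand-ins (not the TensorRT-LLM types), assuming await_responses may return either a single response or a list per request id:

```python
from dataclasses import dataclass
from typing import List, Optional, Union


@dataclass
class Response:
    error_msg: Optional[str] = None

    def has_error(self) -> bool:
        return self.error_msg is not None


class Executor:
    def await_responses(self, req_ids: List[int]) -> List[Union[Response, List[Response]]]:
        # Placeholder: a real executor may return one response or a list per request id.
        return [Response() for _ in req_ids]

    def shutdown(self) -> None:
        print("executor shut down")


def run_estimation(executor: Executor, req_ids: List[int]) -> None:
    try:
        responses = executor.await_responses(req_ids)
        for response_or_list in responses:
            response_list = (response_or_list if isinstance(response_or_list, list)
                             else [response_or_list])
            for response in response_list:
                if response.has_error():
                    # Surface warm-up failures (e.g. a CUDA OOM) instead of
                    # continuing with meaningless memory measurements.
                    raise RuntimeError(response.error_msg)
        # Peak/current memory would be measured here, while the warm-up state is live.
    finally:
        # Runs on success and on failure, so a failed estimation does not leave
        # the temporary executor (and its warm-up flags) behind.
        executor.shutdown()


run_estimation(Executor(), [0, 1, 2])
```

The measurements taken inside the try block feed the unchanged accounting shown in the diff context: total_used_bytes = total_gpu_memory - end, activation_bytes = torch_peak_memory - model_bytes, and extra_cost = max(total_used_bytes - torch_used_bytes, 0).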

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 4 additions & 6 deletions
```diff
@@ -280,6 +280,8 @@ def _event_loop_wrapper(self):
             logger.error(f"Error in event loop: {e}")
             logger.error(traceback.format_exc())
             raise e
+        finally:
+            self._executor_loop_cleanup()
 
     def start_worker(self):
         self.worker_lock.acquire()
@@ -833,7 +835,6 @@ def _executor_loop_pp(self):
                         self._process_iter_stats(finished_requests,
                                                  self.active_requests,
                                                  previous_batch)
-        self._executor_loop_cleanup()
 
     def _executor_loop(self):
         torch.cuda.set_device(self.device_id)
@@ -959,8 +960,6 @@ def _executor_loop(self):
                             iter_stats=iter_stats,
                             iter_start_time=iter_start_time))
 
-        self._executor_loop_cleanup()
-
     def _prepare_draft_requests(self):
         try:
             # Set draft tokens here to make the KV cache manager
@@ -1108,8 +1107,6 @@ def _executor_loop_overlap(self):
             if self.kv_cache_transceiver and self.ctx_in_transmission_requests:
                 self._terminate_ctx_finished_requests()
 
-        self._executor_loop_cleanup()
-
     def _process_previous_batch(self):
         self._update_requests(self.previous_batch.sample_state)
 
@@ -1634,7 +1631,8 @@ def _update_request_states_tp(self, scheduled_requests: ScheduledRequests):
                 self.active_requests.remove(request)
 
         for request in scheduled_requests.context_requests:
-            request.move_to_next_context_chunk()
+            if request.state != LlmRequestState.GENERATION_COMPLETE:  # skip failed requests
+                request.move_to_next_context_chunk()
             if request.get_context_remaining_length() == 0:
                 request.state = LlmRequestState.GENERATION_IN_PROGRESS
```
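The py_executor.py changes move _executor_loop_cleanup() out of the tail of each executor loop and into the finally clause of _event_loop_wrapper, so cleanup also runs when a loop exits via an exception (such as an OOM hit during KV cache estimation); the last hunk additionally guards move_to_next_context_chunk() so requests already in GENERATION_COMPLETE state (i.e. failed requests) are not advanced. A minimal sketch of the wrapper pattern, with simplified stand-in names and print() in place of the logger:

```python
import traceback


class ExecutorLike:
    """Stand-in for PyExecutor; only the wrapper/cleanup structure is shown."""

    def _executor_loop(self) -> None:
        # Stand-in for _executor_loop / _executor_loop_pp / _executor_loop_overlap.
        raise RuntimeError("simulated failure inside the event loop")

    def _executor_loop_cleanup(self) -> None:
        print("cleanup ran")

    def _event_loop_wrapper(self) -> None:
        try:
            self._executor_loop()
        except Exception as e:
            print(f"Error in event loop: {e}")
            print(traceback.format_exc())
            raise
        finally:
            # Cleanup used to be the last statement of each loop body, which an
            # exception would skip; the finally clause covers both exit paths.
            self._executor_loop_cleanup()


try:
    ExecutorLike()._event_loop_wrapper()
except RuntimeError:
    pass  # cleanup has already run by the time the error propagates
```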

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
```diff
@@ -31,6 +31,7 @@ class _ExecutorCreationStage(enum.Enum):
     SAMPLER = "Sampler"
     INIT_KV_CACHE = "Initial KV cache (temporary for KV cache size estimation)"
     INIT_EXTRA_RESOURCES = "Additional executor resources (temporary for KV cache size estimation)"
+    MODEL_EXTRA = "Model resources created during usage"
     EXTRA_RESOURCES = "Additional executor resources"
     KV_CACHE = "KV cache"
     MODEL_ENGINE_MAIN = "Model"
@@ -86,6 +87,8 @@ def _maybe_explain_if_oom(self, e: Exception, *,
                 "reduce max_num_tokens",
             _ExecutorCreationStage.EXTRA_RESOURCES:
                 "reduce max_num_tokens",
+            _ExecutorCreationStage.MODEL_EXTRA:
+                "reduce max_num_tokens",
         }
 
         msg = "\n".join([
@@ -334,7 +337,9 @@ def create_py_executor(executor_config: ExecutorConfig,
 
     if estimating_kv_cache:
         assert kv_cache_creator is not None
-        kv_cache_creator.estimate_max_tokens(py_executor)
+        with mem_monitor.observe_creation_stage(
+                _ExecutorCreationStage.MODEL_EXTRA):
+            kv_cache_creator.estimate_max_tokens(py_executor)
     kv_cache_creator.teardown_managers(resources)
     del py_executor  # free before constructing new
```
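The py_executor_creator.py change registers a new MODEL_EXTRA creation stage and runs estimate_max_tokens under mem_monitor.observe_creation_stage, so an OOM during estimation is reported against "Model resources created during usage" with the advice to reduce max_num_tokens. The sketch below shows stage-aware OOM reporting of that general shape; it is illustrative only, and the names and the MemoryError handling are assumptions rather than the actual memory-monitor implementation:

```python
import contextlib
import enum


class CreationStage(enum.Enum):
    # Mirrors the new MODEL_EXTRA member added above (other stages omitted).
    MODEL_EXTRA = "Model resources created during usage"


_OOM_ADVICE = {
    CreationStage.MODEL_EXTRA: "reduce max_num_tokens",
}


@contextlib.contextmanager
def observe_creation_stage(stage: CreationStage):
    """Tag a creation stage so an OOM raised inside it gets stage-specific advice."""
    try:
        yield
    except MemoryError as e:  # the real monitor handles CUDA OOMs
        advice = _OOM_ADVICE.get(stage, "reduce memory usage")
        raise MemoryError(
            f"Executor creation failed during stage '{stage.value}'; "
            f"consider: {advice}") from e


# Usage mirroring the patched call site:
with observe_creation_stage(CreationStage.MODEL_EXTRA):
    pass  # e.g. kv_cache_creator.estimate_max_tokens(py_executor)
```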
