
Commit dd29063

[feat] Add llm args to tune python gc threshold (NVIDIA#5141)
Signed-off-by: Yilin Fan <206948969+nv-yilinf@users.noreply.github.com>
1 parent 03f1a6a commit dd29063

File tree: 10 files changed (+95, -33 lines)

tensorrt_llm/_torch/pyexecutor/_util.py (17 additions & 14 deletions)

```diff
@@ -384,7 +384,8 @@ def create_py_executor_instance(
         draft_model_engine,
         start_worker,
         sampler,
-        lora_config: Optional[LoraConfig] = None) -> PyExecutor:
+        lora_config: Optional[LoraConfig] = None,
+        garbage_collection_gen0_threshold: Optional[int] = None) -> PyExecutor:
     kv_cache_manager = resources.get(KV_CACHE_MANAGER_KEY, None)
 
     spec_config = model_engine.spec_config
@@ -496,19 +497,21 @@ def create_py_executor_instance(
     kv_cache_transceiver = create_kv_cache_transceiver(
         mapping, kv_cache_manager, attention_type, cache_transceiver_config)
 
-    return PyExecutor(resource_manager,
-                      scheduler,
-                      model_engine=model_engine,
-                      sampler=sampler,
-                      dist=dist,
-                      disable_overlap_scheduler=pytorch_backend_config.
-                      disable_overlap_scheduler,
-                      max_batch_size=executor_config.max_batch_size,
-                      max_draft_tokens=spec_config.max_draft_tokens
-                      if spec_config is not None else 0,
-                      kv_cache_transceiver=kv_cache_transceiver,
-                      draft_model_engine=draft_model_engine,
-                      start_worker=start_worker)
+    return PyExecutor(
+        resource_manager,
+        scheduler,
+        model_engine=model_engine,
+        sampler=sampler,
+        dist=dist,
+        disable_overlap_scheduler=pytorch_backend_config.
+        disable_overlap_scheduler,
+        max_batch_size=executor_config.max_batch_size,
+        max_draft_tokens=spec_config.max_draft_tokens
+        if spec_config is not None else 0,
+        kv_cache_transceiver=kv_cache_transceiver,
+        draft_model_engine=draft_model_engine,
+        start_worker=start_worker,
+        garbage_collection_gen0_threshold=garbage_collection_gen0_threshold)
 
 
 def instantiate_sampler(model_engine: PyTorchModelEngine,
```

tensorrt_llm/_torch/pyexecutor/py_executor.py (8 additions & 3 deletions)

```diff
@@ -16,8 +16,8 @@
 
 import torch
 
-from tensorrt_llm._utils import (global_mpi_rank, is_trace_enabled, nvtx_range,
-                                 trace_func)
+from tensorrt_llm._utils import (customized_gc_thresholds, global_mpi_rank,
+                                 is_trace_enabled, nvtx_range, trace_func)
 from tensorrt_llm.bindings.executor import (DisServingRequestStats,
                                             FinishReason, InflightBatchingStats,
                                             IterationStats, KvCacheStats,
@@ -171,6 +171,7 @@ def __init__(self,
                  max_draft_tokens: int = 0,
                  kv_cache_transceiver: KvCacheTransceiver = None,
                  draft_model_engine: Optional[ModelEngine] = None,
+                 garbage_collection_gen0_threshold: Optional[int] = None,
                  start_worker: bool = True):
         super(PyExecutor, self).__init__()
         self.device_id = torch.cuda.current_device()
@@ -268,14 +269,18 @@ def __init__(self,
                 "Drafting is not supported for selected executor loop. "
                 "Please disable disagg/pipeline parallelism/overlap scheduler.")
 
+        self.garbage_collection_gen0_threshold = garbage_collection_gen0_threshold
+
         self.worker_started = False
         self.worker_lock = threading.Lock()
         if start_worker:
             self.start_worker()
 
     def _event_loop_wrapper(self):
         try:
-            self.event_loop()
+            with customized_gc_thresholds(
+                    self.garbage_collection_gen0_threshold):
+                self.event_loop()
         except Exception as e:
             logger.error(f"Error in event loop: {e}")
             logger.error(traceback.format_exc())
```
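A note on scope: CPython GC thresholds are process-wide, so the context manager used above bounds the change in time (while the event loop runs), not per thread. A minimal standalone sketch of the pattern; `gen0_threshold`, `event_loop`, and `loop_wrapper` are illustrative names, not from the codebase:

```python
import gc
import threading
from contextlib import contextmanager
from typing import Optional

DEFAULT_THRESHOLDS = gc.get_threshold()  # typically (700, 10, 10)


@contextmanager
def gen0_threshold(threshold: Optional[int] = None):
    """Temporarily raise the gen0 GC threshold; restore on exit."""
    try:
        if threshold:
            # One argument changes only gen0; gen1/gen2 keep their values.
            gc.set_threshold(threshold)
        yield
    finally:
        if threshold:
            gc.set_threshold(*DEFAULT_THRESHOLDS)


def event_loop():
    pass  # stand-in for the long-running scheduling loop


def loop_wrapper(threshold: Optional[int]):
    # Mirrors _event_loop_wrapper: the custom threshold is active only
    # while the loop runs, and is restored even if the loop raises.
    with gen0_threshold(threshold):
        event_loop()


worker = threading.Thread(target=loop_wrapper, args=(20000, ), daemon=True)
worker.start()
worker.join()
```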

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py (9 additions & 6 deletions)

```diff
@@ -176,10 +176,12 @@ def _get_mapping(executor_config: ExecutorConfig) -> Mapping:
     return mapping
 
 
-def create_py_executor(executor_config: ExecutorConfig,
-                       checkpoint_dir: str = None,
-                       engine_dir: str = None,
-                       lora_config: Optional[LoraConfig] = None) -> PyExecutor:
+def create_py_executor(
+        executor_config: ExecutorConfig,
+        checkpoint_dir: str = None,
+        engine_dir: str = None,
+        lora_config: Optional[LoraConfig] = None,
+        garbage_collection_gen0_threshold: Optional[int] = None) -> PyExecutor:
     _mangle_executor_config(executor_config)
     pytorch_backend_config = executor_config.pytorch_backend_config
 
@@ -334,7 +336,7 @@ def create_py_executor(executor_config: ExecutorConfig,
     py_executor = create_py_executor_instance(
         dist, resources, mapping, pytorch_backend_config, executor_config,
         ctx_chunk_config, model_engine, draft_model_engine, False, sampler,
-        lora_config)
+        lora_config, garbage_collection_gen0_threshold)
 
     if estimating_kv_cache:
         assert kv_cache_creator is not None
@@ -365,7 +367,8 @@ def create_py_executor(executor_config: ExecutorConfig,
         py_executor = create_py_executor_instance(
             dist, resources, mapping, pytorch_backend_config,
             executor_config, ctx_chunk_config, model_engine,
-            draft_model_engine, False, sampler, lora_config)
+            draft_model_engine, False, sampler, lora_config,
+            garbage_collection_gen0_threshold)
 
     py_executor.start_worker()
     return py_executor
```

tensorrt_llm/_utils.py (20 additions & 0 deletions)

```diff
@@ -781,6 +781,26 @@ def __getitem__(self, index):
         return self.objs[index]
 
 
+PYTHON_DEFAULT_GC_THRESHOLDS = gc.get_threshold()
+
+
+@contextmanager
+def customized_gc_thresholds(gen0_threshold: Optional[int] = None):
+    try:
+        if gen0_threshold:
+            gc.set_threshold(gen0_threshold)
+            logger.debug(
+                f'Set Python GC threshold to customized value: {gen0_threshold}'
+            )
+        yield
+    finally:
+        if gen0_threshold:
+            gc.set_threshold(*PYTHON_DEFAULT_GC_THRESHOLDS)
+            logger.debug(
+                f'Reset Python GC thresholds to default value: {PYTHON_DEFAULT_GC_THRESHOLDS}'
+            )
+
+
 @contextmanager
 def _null_context_manager():
     yield
```
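Two details worth noting: a falsy `gen0_threshold` makes the context manager a no-op, and the `finally` block restores all three thresholds to the values captured at import time (not the values in effect on entry, which only matters if calls nest). A quick sketch of the semantics, assuming stock CPython defaults:

```python
import gc
from tensorrt_llm._utils import customized_gc_thresholds

print(gc.get_threshold())        # (700, 10, 10) on stock CPython
with customized_gc_thresholds(20000):
    # gc.set_threshold with a single argument changes only gen0;
    # the gen1/gen2 thresholds keep their previous values.
    print(gc.get_threshold())    # (20000, 10, 10)
print(gc.get_threshold())        # restored to the import-time defaults
```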

tensorrt_llm/executor/executor.py (13 additions & 4 deletions)

```diff
@@ -350,6 +350,7 @@ def create(
         postproc_worker_config: Optional[PostprocWorkerConfig] = None,
         is_llm_executor: Optional[bool] = None,
         lora_config: Optional[LoraConfig] = None,
+        garbage_collection_gen0_threshold: Optional[int] = None,
     ) -> Union["GenerationExecutorProxy", "GenerationExecutorWorker"]:
         # local imports to avoid cyclic importing
         from .proxy import GenerationExecutorProxy
@@ -393,7 +394,9 @@ def create(
                 model_world_size=model_world_size,
                 mpi_session=mpi_session,
                 postproc_worker_config=postproc_worker_config,
-                is_llm_executor=is_llm_executor)
+                is_llm_executor=is_llm_executor,
+                garbage_collection_gen0_threshold=
+                garbage_collection_gen0_threshold)
 
         # WAR: For the performance of gathering logits, we use single process worker
         # for TP1 to avoid the large overhead of IPC.
@@ -404,7 +407,9 @@ def create(
                 "Using single process worker for TP1, this may hurt streaming generation performance."
             )
             return GenerationExecutorWorker(**worker_kwargs,
-                                            is_llm_executor=is_llm_executor)
+                                            is_llm_executor=is_llm_executor,
+                                            garbage_collection_gen0_threshold=
+                                            garbage_collection_gen0_threshold)
 
         # For single-gpu case:
         # Partition the workload to multiple process for streaming performance.
@@ -416,7 +421,9 @@ def create(
                 model_world_size=model_world_size,
                 mpi_session=None,  # use mpi4py
                 postproc_worker_config=postproc_worker_config,
-                is_llm_executor=is_llm_executor)
+                is_llm_executor=is_llm_executor,
+                garbage_collection_gen0_threshold=
+                garbage_collection_gen0_threshold)
         else:
             ctx = multiprocessing.get_context("spawn")
             # The ProcessPoolExecutorSession is used to support Windows, as mpi4py cannot.
@@ -427,7 +434,9 @@ def create(
                 model_world_size=model_world_size,
                 mpi_session=mpi_session,
                 postproc_worker_config=postproc_worker_config,
-                is_llm_executor=is_llm_executor)
+                is_llm_executor=is_llm_executor,
+                garbage_collection_gen0_threshold=
+                garbage_collection_gen0_threshold)
 
     def wait_first_completed(
             self, futures: List[GenerationResult]
```

tensorrt_llm/executor/proxy.py (10 additions & 4 deletions)

```diff
@@ -11,7 +11,7 @@
 
 from tensorrt_llm.logger import logger
 
-from .._utils import mpi_rank, nvtx_range_debug
+from .._utils import customized_gc_thresholds, mpi_rank, nvtx_range_debug
 from ..llmapi.mpi_session import (MpiCommSession, MpiPoolSession, MpiSession,
                                   RemoteMpiCommSessionClient)
 from ..llmapi.tracer import enable_llm_tracer, get_tracer, global_tracer
@@ -44,6 +44,7 @@ def __init__(
         worker_cls: type = GenerationExecutorWorker,
         postproc_worker_config: Optional[PostprocWorkerConfig] = None,
         is_llm_executor: Optional[bool] = None,
+        garbage_collection_gen0_threshold: Optional[int] = None,
     ) -> None:
         postproc_worker_config = postproc_worker_config or PostprocWorkerConfig(
         )
@@ -86,10 +87,14 @@ def __init__(
 
         self.model_world_size = model_world_size
 
+        self.garbage_collection_gen0_threshold = garbage_collection_gen0_threshold
+
         worker_kwargs = dict(**worker_kwargs,
                              worker_queues=self._setup_queues(),
                              postproc_worker_config=postproc_worker_config,
-                             is_llm_executor=False)
+                             is_llm_executor=False,
+                             garbage_collection_gen0_threshold=self.
+                             garbage_collection_gen0_threshold)
 
         if "log_level" not in worker_kwargs:
             worker_kwargs["log_level"] = logger.level
@@ -152,8 +157,9 @@ def abort_request(self, request_id: int) -> None:
     def dispatch_result_task(self) -> bool:
         # TODO[chunweiy]: convert the dispatch_result_task to async, that should
         # benefit from zmq.asyncio.Context
-        if (res := self.result_queue.get()) is None:
-            return False  # shutdown the thread
+        with customized_gc_thresholds(self.garbage_collection_gen0_threshold):
+            if (res := self.result_queue.get()) is None:
+                return False  # shutdown the thread
 
         async_queues = []
         event_loop = None
```

tensorrt_llm/executor/worker.py (6 additions & 1 deletion)

```diff
@@ -58,6 +58,7 @@ def __init__(
         postproc_worker_config: Optional[PostprocWorkerConfig] = None,
         is_llm_executor: Optional[bool] = None,
         lora_config: Optional[LoraConfig] = None,
+        garbage_collection_gen0_threshold: Optional[int] = None,
     ) -> None:
         postproc_config = postproc_worker_config or PostprocWorkerConfig()
         super().__init__(
@@ -125,6 +126,8 @@ def _create_engine():
                 create_py_executor
             create_executor = create_py_executor
             args["lora_config"] = lora_config
+            args[
+                "garbage_collection_gen0_threshold"] = garbage_collection_gen0_threshold
         elif executor_config.backend == "_autodeploy":
             from tensorrt_llm._torch.auto_deploy.shim.ad_executor import \
                 create_autodeploy_executor
@@ -595,6 +598,7 @@ def worker_main(
         is_llm_executor: Optional[
             bool] = True,  # whether it's the main executor instance
         lora_config: Optional[LoraConfig] = None,
+        garbage_collection_gen0_threshold: Optional[int] = None,
 ) -> None:
     mpi_comm().barrier()
     print_colored_debug(f"Worker {mpi_rank()} entering worker_main...\n",
@@ -720,7 +724,8 @@ def notify_proxy_threads_to_quit():
             batched_logits_processor,
             postproc_worker_config=postproc_worker_config,
             is_llm_executor=is_llm_executor,
-            lora_config=lora_config)
+            lora_config=lora_config,
+            garbage_collection_gen0_threshold=garbage_collection_gen0_threshold)
     except Exception as e:
         logger.error(f"Failed to initialize executor on rank {mpi_rank()}: {e}")
         logger.error(traceback.format_exc())
```

tensorrt_llm/llmapi/llm.py (3 additions & 1 deletion)

```diff
@@ -708,7 +708,9 @@ def _build_model(self):
                 postprocess_tokenizer_dir=self.args.postprocess_tokenizer_dir,
             ),
             is_llm_executor=True,
-            lora_config=self.args.lora_config)
+            lora_config=self.args.lora_config,
+            garbage_collection_gen0_threshold=self.args.
+            garbage_collection_gen0_threshold)
 
     @property
     def _on_trt_backend(self) -> bool:
```

tensorrt_llm/llmapi/llm_args.py (6 additions & 0 deletions)

```diff
@@ -941,6 +941,12 @@ class BaseLlmArgs(BaseModel):
         default=None,
         description="The parser to separate reasoning content from output.")
 
+    garbage_collection_gen0_threshold: int = Field(
+        default=20000,
+        description=
+        "Threshold for Python garbage collection of generation 0 objects. "
+        "Lower values trigger more frequent garbage collection.")
+
     # TODO[Superjomn]: To deprecate this config.
     decoding_config: Optional[object] = Field(
         default=None,
```
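The default of 20000 sits far above CPython's usual gen0 threshold of 700, so allocation-heavy request paths trigger far fewer gen0 collections, at the cost of letting short-lived garbage accumulate a little longer. A hypothetical micro-benchmark sketch of that trade-off (not from the PR; numbers vary by workload and interpreter):

```python
import gc
import time


def churn(n: int = 500_000) -> None:
    # Allocate short-lived container objects that feed generation 0.
    for _ in range(n):
        _ = [object()]


for gen0 in (700, 20000):
    gc.set_threshold(gen0)
    gc.collect()
    before = gc.get_stats()[0]["collections"]
    start = time.perf_counter()
    churn()
    elapsed = time.perf_counter() - start
    runs = gc.get_stats()[0]["collections"] - before
    print(f"gen0={gen0}: {runs} gen0 collections in {elapsed:.3f}s")
```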

tests/unittest/api_stability/references_committed/llm.yaml (3 additions & 0 deletions)

```diff
@@ -105,6 +105,9 @@ methods:
     kv_cache_config:
       annotation: tensorrt_llm.llmapi.llm_args.KvCacheConfig
       default: null
+    garbage_collection_gen0_threshold:
+      annotation: int
+      default: 20000
     return_annotation: None
   generate:
     parameters:
```
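End to end, the argument flows from `BaseLlmArgs` through `GenerationExecutor.create` into both the proxy's result-dispatch thread and the `PyExecutor` event loop. A hypothetical usage sketch (the model path is a placeholder, and this assumes the `LLM` constructor forwards `BaseLlmArgs` fields as keyword arguments, which the `llm.py` change above relies on):

```python
from tensorrt_llm import LLM

# Raise the gen0 threshold on the hot paths; lower values collect more often.
llm = LLM(model="/path/to/model",  # placeholder checkpoint path
          garbage_collection_gen0_threshold=20000)
```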
