Skip to content

Commit ed297d7

Browse files
authored
[None][chore] Optimize perf for the RPC executor and add some profile utilities to llm-api (#8415)
Signed-off-by: Superjomn <[email protected]>
1 parent 6a63177 commit ed297d7

File tree

8 files changed

+325
-193
lines changed

8 files changed

+325
-193
lines changed

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 0 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
import dataclasses
22
import datetime
33
import functools
4-
import gc
54
import os
65
import pickle # nosec B403
76
import threading
87
import time
98
import traceback
10-
import weakref
119
from contextlib import contextmanager
1210
from typing import Dict, Iterable, List, Optional, Tuple, Union
1311

@@ -59,10 +57,6 @@
5957
# Format: "start1-stop1,start2-stop2,..." or single iterations "iter1,iter2,..."
6058
PROFILE_START_STOP_ENV_VAR_NAME = "TLLM_PROFILE_START_STOP"
6159

62-
# Environment variable to enable garbage collection profiling.
63-
# Set to "1" to enable recording of garbage collection events during profiling.
64-
PROFILE_RECORD_GC_ENV_VAR_NAME = "TLLM_PROFILE_RECORD_GC"
65-
6660
# Environment variable to enable PyTorch profiler tracing.
6761
# Set to a path to save detailed tracing of PyTorch operations.
6862
PROFILE_TRACE_ENV_VAR_NAME = "TLLM_TORCH_PROFILE_TRACE"
@@ -97,40 +91,6 @@ def _load_iteration_indexes(env_var: str):
9791
return frozenset(starts), frozenset(stops)
9892

9993

100-
class _GCNvtxHandle:
    # Opaque handle returned by _gc_nvtx_watcher(); keeping a reference to it
    # keeps the GC callback installed (see weakref.finalize below).
    pass


def _gc_nvtx_watcher():
    """Install an NVTX range around every Python garbage-collection pass.

    Enabled only when the environment variable named by
    PROFILE_RECORD_GC_ENV_VAR_NAME is set to a non-empty value (note: any
    non-empty value enables it, not just "1").

    Returns:
        _GCNvtxHandle or None: a handle whose lifetime controls the callback
        (when it is garbage-collected, the GC callback is removed), or None
        when the feature is disabled.
    """
    enabled = os.environ.get(PROFILE_RECORD_GC_ENV_VAR_NAME, None)
    if not enabled:
        return None

    # NVTX range id of the GC pass currently in progress, or None when no
    # GC pass is active. CPython invokes gc callbacks with "start"/"stop"
    # phases around each collection, so start/stop pair up strictly.
    range_id: Optional[int] = None

    def gc_callback(phase, _):
        nonlocal range_id
        if phase == "start":
            assert range_id is None, "Unexpected state in GC callback: another GC while last GC not finished?"
            range_id = torch.cuda.nvtx.range_start("Python GC")
        elif phase == "stop":
            assert range_id is not None, "Unexpected state in GC callback: no active GC but got GC finished?"
            torch.cuda.nvtx.range_end(range_id)
            range_id = None

    gc.callbacks.append(gc_callback)

    def gc_cleanup(callback):
        # Best-effort removal: the callback may already be gone.
        try:
            gc.callbacks.remove(callback)
        except ValueError:
            pass

    # Tie callback removal to the handle's lifetime rather than requiring an
    # explicit uninstall call.
    handle = _GCNvtxHandle()
    weakref.finalize(handle, gc_cleanup, gc_callback)
    return handle
132-
133-
13494
@dataclasses.dataclass
13595
class BatchState:
13696
sample_state: SampleState
@@ -178,7 +138,6 @@ def __init__(self,
178138
# profile config
179139
self.profile_start_iters, self.profile_stop_iters = _load_iteration_indexes(
180140
PROFILE_START_STOP_ENV_VAR_NAME)
181-
self.gc_nvtx_watcher_handle = _gc_nvtx_watcher()
182141

183142
# related modules
184143
self.resource_manager = resource_manager

tensorrt_llm/_utils.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -918,6 +918,18 @@ def nvtx_range_debug(msg: str,
918918
return _null_context_manager()
919919

920920

921+
def nvtx_mark_debug(msg: str,
                    color: str = "grey",
                    domain: str = "TensorRT-LLM",
                    category: Optional[str] = None) -> None:
    """
    Creates an NVTX marker for debugging purposes.

    The marker is emitted only when NVTX debugging is switched on through
    the TLLM_LLMAPI_ENABLE_NVTX or TLLM_NVTX_DEBUG environment variable
    (either set to "1"); otherwise this is a no-op.
    """
    debug_enabled = any(
        os.getenv(flag, "0") == "1"
        for flag in ("TLLM_LLMAPI_ENABLE_NVTX", "TLLM_NVTX_DEBUG"))
    if debug_enabled:
        nvtx_mark(msg, color=color, domain=domain, category=category)
931+
932+
921933
def nvtx_mark(msg: str,
922934
color: str = "grey",
923935
domain: str = "TensorRT-LLM",
@@ -1195,3 +1207,71 @@ def is_device_integrated() -> bool:
11951207
if not torch.cuda.is_available():
11961208
return False
11971209
return torch.cuda.get_device_properties().is_integrated
1210+
1211+
1212+
# Environment variable to enable garbage collection profiling.
# Set to "1" to enable recording of garbage collection events during profiling.
PROFILE_RECORD_GC_ENV_VAR_NAME = "TLLM_PROFILE_RECORD_GC"


class _GCNvtxHandle:
    """Handle object for GC NVTX watcher to keep it alive."""


# Singleton for the GC NVTX watcher handle.
_gc_watcher_handle: Optional[_GCNvtxHandle] = None


def _setup_gc_nvtx_profiling() -> Optional[_GCNvtxHandle]:
    """
    Set up NVTX range markers for Python garbage collection events (singleton).
    This helps in profiling to visualize when GC occurs during execution.

    This function is called automatically at module import time. The environment
    variable TLLM_PROFILE_RECORD_GC must be set before importing this module.

    This is an internal function and should not be called directly by users.

    Returns:
        _GCNvtxHandle or None: A handle object that keeps the GC callback alive,
        or None if GC profiling is not enabled.
    """
    global _gc_watcher_handle

    # Already installed: hand back the existing singleton.
    if _gc_watcher_handle is not None:
        return _gc_watcher_handle

    # NOTE: truthiness check — any non-empty value enables recording,
    # not only "1".
    if not os.environ.get(PROFILE_RECORD_GC_ENV_VAR_NAME, None):
        return None

    # NVTX range id of the in-flight GC pass; None between collections.
    active_range: Optional[int] = None

    def _on_gc_event(phase, _info):
        nonlocal active_range
        if phase == "start":
            assert active_range is None, "Unexpected state in GC callback: another GC while last GC not finished?"
            active_range = torch.cuda.nvtx.range_start("Python GC")
        elif phase == "stop":
            assert active_range is not None, "Unexpected state in GC callback: no active GC but got GC finished?"
            torch.cuda.nvtx.range_end(active_range)
            active_range = None

    gc.callbacks.append(_on_gc_event)

    def _remove_gc_callback(callback):
        # Best-effort removal: ignore if the callback is already gone.
        try:
            gc.callbacks.remove(callback)
        except ValueError:
            pass

    # The callback stays installed for as long as the handle lives; when the
    # handle is collected, finalize() unhooks the callback.
    watcher = _GCNvtxHandle()
    weakref.finalize(watcher, _remove_gc_callback, _on_gc_event)

    _gc_watcher_handle = watcher
    return watcher


# Initialize GC NVTX profiling singleton at module import time
_setup_gc_nvtx_profiling()

tensorrt_llm/bench/benchmark/utils/general.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from __future__ import annotations
22

33
import json
4-
from importlib.metadata import version
54
from pathlib import Path
65
from random import choices, shuffle
76
from typing import Dict, List, Tuple, Union
@@ -170,7 +169,7 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
170169

171170
backend = params.get("backend", "pytorch")
172171
return {
173-
"sw_version": version("tensorrt_llm"),
172+
"sw_version": "1.2",
174173
"model_path": model_path,
175174
"settings_config": {
176175
"max_batch_size": int(max_batch_size),

tensorrt_llm/executor/executor.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,7 @@ def _create_ray_executor(
368368
is_llm_executor: bool,
369369
tp_size: int,
370370
):
371+
logger.warning(f"Orchestrator is creating Ray executor")
371372
from .ray_executor import RayExecutor
372373

373374
return RayExecutor(worker_kwargs,
@@ -386,6 +387,7 @@ def _create_rpc_executor(
386387
):
387388
"""Create RPC-based executor (GenerationExecutorRpcProxy)."""
388389
from .rpc_proxy import GenerationExecutorRpcProxy
390+
logger.warning(f"Orchestrator is creating RPC executor")
389391
return GenerationExecutorRpcProxy(
390392
worker_kwargs,
391393
model_world_size=model_world_size,
@@ -408,6 +410,7 @@ def _create_ipc_executor(
408410
use_worker: If True, creates GenerationExecutorWorker (single process).
409411
If False, creates GenerationExecutorProxy (multi-process with IPC).
410412
"""
413+
logger.warning(f"Orchestrator is creating IPC executor")
411414
if use_worker:
412415
from .worker import GenerationExecutorWorker
413416
return GenerationExecutorWorker(**worker_kwargs,

0 commit comments

Comments
 (0)