Skip to content

Commit 66a3a74

Browse files
nv-yilinfdominicshanshan
authored andcommitted
[None][feat] Add request timing breakdown option in benchmark_serving (NVIDIA#8128)
Signed-off-by: nv-yilinf <206948969+nv-yilinf@users.noreply.github.com>
1 parent cca8057 commit 66a3a74

File tree

8 files changed

+1353
-0
lines changed

8 files changed

+1353
-0
lines changed

tensorrt_llm/serve/scripts/benchmark_serving.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
SampleRequest, ShareGPTDataset, SonnetDataset, VisionArenaDataset)
4646
from tensorrt_llm.serve.scripts.benchmark_utils import (
4747
convert_to_pytorch_benchmark_format, write_to_json)
48+
from tensorrt_llm.serve.scripts.time_breakdown import RequestTimeBreakdown
4849
# isort: on
4950

5051
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -598,6 +599,34 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
598599
write_to_json(pt_file, pt_records)
599600

600601

602+
async def fetch_perf_metrics(base_url: str) -> dict:
    """
    Fetch performance metrics from the /perf_metrics endpoint.

    Args:
        base_url: The base URL of the server

    Returns:
        Dictionary containing the performance metrics, or an empty dict on
        any failure (non-200 status or request error). Errors are printed,
        never raised, so benchmarking can continue best-effort.
    """
    endpoint = f"{base_url}/perf_metrics"

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        try:
            async with session.get(endpoint) as response:
                # Non-200: report and fall through to the empty-dict result.
                if response.status != 200:
                    print(
                        f"Failed to fetch performance metrics. Status: {response.status}"
                    )
                    return {}
                return await response.json()
        except Exception as e:
            # Best-effort endpoint: swallow and report rather than abort.
            print(f"Error fetching performance metrics: {e}")
            return {}
628+
629+
601630
def main(args: argparse.Namespace):
602631
print(args)
603632
random.seed(args.seed)
@@ -877,6 +906,55 @@ def create_dataset_and_sample(dataset_name: str):
877906
json.dump(result_json, outfile)
878907
save_to_pytorch_benchmark_format(args, result_json, file_name)
879908

909+
# Save per-request breakdown if requested
910+
if args.save_request_time_breakdown:
911+
print("Fetching request performance metrics...")
912+
perf_metrics = asyncio.run(fetch_perf_metrics(base_url))
913+
914+
if perf_metrics:
915+
# Generate filename for perf metrics
916+
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
917+
base_model_id = model_id.split("/")[-1]
918+
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
919+
if args.max_concurrency is not None else "")
920+
perf_filename = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}-perf_metrics.json"
921+
922+
if args.result_dir:
923+
perf_filename = os.path.join(args.result_dir, perf_filename)
924+
925+
# Save perf metrics to JSON file
926+
with open(perf_filename, "w", encoding='utf-8') as outfile:
927+
try:
928+
json.dump(perf_metrics, outfile, indent=2)
929+
except Exception as e:
930+
print(f"Failed to save perf metrics: {e}")
931+
932+
print(f"Request performance metrics saved to: {perf_filename}")
933+
934+
# Create timing diagram from the saved JSON file
935+
try:
936+
analyzer = RequestTimeBreakdown()
937+
938+
print("Creating time diagram from request time breakdown...")
939+
timing_data = analyzer.parse_json_file(perf_filename)
940+
941+
if timing_data:
942+
# Generate HTML filename for the timing diagram
943+
diagram_filename = f"{os.path.splitext(perf_filename)[0]}-time_diagram.html"
944+
analyzer.create_timing_diagram(timing_data,
945+
diagram_filename)
946+
947+
print(f"Time diagram saved to: {diagram_filename}")
948+
else:
949+
print(
950+
"No time data found in request time breakdown - skipping diagram creation."
951+
)
952+
except Exception as e:
953+
print(f"Failed to create time diagram: {e}")
954+
print("Performance metrics were still saved successfully.")
955+
else:
956+
print("Failed to fetch per-request performance metrics.")
957+
880958

881959
if __name__ == "__main__":
882960
parser = FlexibleArgumentParser(
@@ -1260,6 +1338,13 @@ def create_dataset_and_sample(dataset_name: str):
12601338
help="Skip initial test run with a single prompt.",
12611339
)
12621340

1341+
# Opt-in flag for the per-request timing breakdown report.
# NOTE: the endpoint is /perf_metrics (plural) — matches fetch_perf_metrics().
parser.add_argument(
    "--save-request-time-breakdown",
    action="store_true",
    help=
    "After benchmarking, call the /perf_metrics endpoint, save the result as JSON, and create an interactive time breakdown diagram.",
)
1347+
12631348
args = parser.parse_args()
12641349

12651350
main(args)
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# Time Breakdown Tool
2+
3+
A standalone tool for analyzing and visualizing TensorRT-LLM server request time breakdown.
4+
5+
## Overview
6+
7+
The Time Breakdown tool analyzes performance metrics from TensorRT-LLM servers and creates interactive visualizations showing how time is spent processing each request. It supports both aggregated and disaggregated server configurations.
8+
9+
10+
The tool generates:
11+
12+
1. **Interactive HTML Diagram**: A stacked bar chart showing timing breakdown per request with hover tooltips
13+
2. **Statistics**: Median times for each timing segment (optional)
14+
15+
### Example Visualization
16+
17+
![Request Time Breakdown Example](images/request_time_breakdown_example.png)
18+
19+
*Example of the interactive time diagram showing request time breakdown across different processing stages.*
20+
21+
### Timing Metrics
22+
23+
The tool aims to track detailed timing segments throughout the request lifecycle (currently we only track timing segments related to TTFT (Time-To-First-Token); full lifecycle tracking will be added soon):
24+
25+
#### Context/Prefill Stage Metrics
26+
27+
1. **Context Preprocessing** (`ctx_preprocessing`)
28+
- **Time Period**: `server_arrival_time` → `arrival_time`
29+
- **Description**: Python overhead & initialization when the context server receives the request
30+
- **Includes**: Request parsing, pre-processing (e.g., tokenization) before queuing
31+
32+
2. **Context Queue** (`ctx_queue`)
33+
- **Time Period**: `arrival_time` → `first_scheduled_time`
34+
- **Description**: Time spent waiting in queue and resource allocation
35+
- **Includes**: Queueing delay, memory allocation, scheduling wait time
36+
37+
3. **Context Processing** (`ctx_processing`)
38+
- **Time Period**: `first_scheduled_time` → `first_token_time`
39+
- **Description**: Actual prefill computation time
40+
- **Includes**: Model forward pass for the context/prompt tokens
41+
42+
4. **Context Postprocessing** (`ctx_postprocessing`)
43+
- **Time Period**: `first_token_time` → `server_first_token_time`
44+
- **Description**: Time to prepare and send the first token response
45+
- **Includes**: Response preparation, serialization, network overhead
46+
47+
#### Generation/Decode Stage Metrics (Disaggregated Mode Only)
48+
49+
5. **Generation Preprocessing** (`gen_preprocessing`)
50+
- **Time Period**: `gen_server_arrival_time` → `gen_arrival_time`
51+
- **Description**: Python overhead & initialization when generation server receives the request
52+
- **Includes**: Request parsing, KV cache transfer preparation
53+
54+
6. **Generation Queue** (`gen_queue`)
55+
- **Time Period**: `gen_arrival_time` → `gen_first_scheduled_time`
56+
- **Description**: Time spent in queue and resource allocation, including KV cache transfer
57+
- **Includes**:
58+
Queueing delay, KV cache transfer, memory allocation for generation
59+
60+
7. **Generation First Token Postprocessing** (`gen_postprocessing`)
61+
- **Time Period**: `gen_first_scheduled_time` → `gen_server_first_token_time`
62+
- **Description**: Time to generate and send first token from generation server
63+
- **Includes**: Token generation, response preparation
64+
65+
#### Disaggregation Server Metrics
66+
67+
8. **Disaggregation Preprocessing** (`disagg_preprocessing`)
68+
- **Time Period**: `disagg_server_arrival_time` → `ctx_server_arrival_time`
69+
- **Description**: Routing overhead from disagg server to context server
70+
- **Includes**: Request forwarding, network latency
71+
72+
9. **Disaggregation Postprocessing** (`disagg_postprocessing`)
73+
- **Time Period**: `gen_server_first_token_time` → `disagg_server_first_token_time`
74+
- **Description**: Routing overhead from generation server back through disagg server
75+
- **Includes**: Response forwarding, aggregation
76+
## Input Format
77+
78+
The tool expects a JSON file containing an array of request performance metrics (unit: seconds).
79+
80+
### Aggregated Format
81+
82+
```json
83+
[
84+
{
85+
"request_id": 0,
86+
"perf_metrics": {
87+
"timing_metrics": {
88+
"server_arrival_time": 1.000,
89+
"arrival_time": 1.002,
90+
"first_scheduled_time": 1.005,
91+
"first_token_time": 1.025,
92+
"server_first_token_time": 1.027
93+
}
94+
}
95+
}
96+
]
97+
```
98+
99+
### Disaggregated Format
100+
101+
```json
102+
[
103+
{
104+
"ctx_perf_metrics": {
105+
"request_id": 3,
106+
"perf_metrics": {
107+
"timing_metrics": {
108+
"server_arrival_time": 2.000,
109+
"arrival_time": 2.003,
110+
"first_scheduled_time": 2.008,
111+
"first_token_time": 2.035,
112+
"server_first_token_time": 2.038
113+
}
114+
}
115+
},
116+
"gen_perf_metrics": {
117+
"perf_metrics": {
118+
"timing_metrics": {
119+
"server_arrival_time": 2.050,
120+
"arrival_time": 2.052,
121+
"first_scheduled_time": 2.055,
122+
"first_token_time": 2.080,
123+
"server_first_token_time": 2.083
124+
}
125+
}
126+
},
127+
"disagg_server_arrival_time": 1.995,
128+
"disagg_server_first_token_time": 2.090
129+
}
130+
]
131+
```
132+
## Usage
133+
134+
### Integration with Benchmark Serving
135+
Step 1:
136+
Set
137+
```
138+
return_perf_metrics: True
139+
perf_metrics_max_requests: <INTEGER>
140+
```
141+
in the `extra-llm-api-config.yaml`. If you are running disaggregated serving, you should add configs for all servers (disagg, context and generation server).
142+
143+
Step 2:
144+
Add `--save-request-time-breakdown` when running `benchmark_serving.py`
145+
```
146+
python -m tensorrt_llm.serve.scripts.benchmark_serving \
147+
--model ${model_name} \
148+
--dataset-name random \
149+
--ignore-eos \
150+
--num-prompts 1000 \
151+
--random-input-len 1024 \
152+
--random-output-len 2048 \
153+
--random-ids \
154+
--max-concurrency 64 \
155+
--save-result \
156+
--result-dir <RESULT_DIR> \
157+
--percentile-metrics "ttft,tpot,itl,e2e" \
158+
--save-request-time-breakdown
159+
```
160+
You will be able to find the interactive time diagram in `<RESULT_DIR>`.
161+
### As a CLI Tool
162+
Step 1:
163+
Query the perf_metrics.json using the `/perf_metrics` endpoint of the trtllm server (in case of disaggregated serving, you only need to query the disagg server). Make sure the servers have `perf_metrics_max_requests` and `return_perf_metrics` configured.
164+
```
165+
curl -o perf_metrics.json <HOST>:<PORT>/perf_metrics
166+
```
167+
Step 2:
168+
Process the `perf_metrics.json` with `time_breakdown.py`
169+
```bash
170+
# Basic usage - analyze and create time diagram
171+
python time_breakdown.py perf_metrics.json
172+
173+
# Specify custom output file
174+
python time_breakdown.py perf_metrics.json -o my_time_diagram.html
175+
176+
# Show statistics only (no diagram)
177+
python time_breakdown.py perf_metrics.json --stats-only
178+
179+
# Create diagram and show statistics
180+
python time_breakdown.py perf_metrics.json --show-stats
181+
```
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
"""
Time Breakdown Analysis Tool

This module provides tools for analyzing and visualizing request time breakdown
from TensorRT-LLM server performance metrics.
"""

# Re-export the package's public names from the implementation module so
# callers can import them directly from the package root.
from .time_breakdown import (RequestDataParser, RequestTimeBreakdown,
                             TimingMetric, TimingMetricsConfig, main)

# Explicit public API; `main` is included so the CLI entry point is reachable
# via `from ... import *` as well as `python -m`.
__all__ = [
    'TimingMetric',
    'TimingMetricsConfig',
    'RequestDataParser',
    'RequestTimeBreakdown',
    'main',
]
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
"""
Entry point for running time_breakdown as a module.

Usage:
    python -m tensorrt_llm.serve.scripts.time_breakdown perf_metrics.json [options]
"""

from .time_breakdown import main

# Delegate argument parsing and execution to the package's CLI main().
if __name__ == '__main__':
    main()
127 KB
Loading

0 commit comments

Comments
 (0)