Skip to content

Commit 66a3a74

Browse files
nv-yilinfdominicshanshan
authored andcommitted
[None][feat] Add request timing breakdown option in benchmark_serving (NVIDIA#8128)
Signed-off-by: nv-yilinf <206948969+nv-yilinf@users.noreply.github.com>
1 parent cca8057 commit 66a3a74

File tree

8 files changed

+1353
-0
lines changed

8 files changed

+1353
-0
lines changed

tensorrt_llm/serve/scripts/benchmark_serving.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
SampleRequest, ShareGPTDataset, SonnetDataset, VisionArenaDataset)
4646
from tensorrt_llm.serve.scripts.benchmark_utils import (
4747
convert_to_pytorch_benchmark_format, write_to_json)
48+
from tensorrt_llm.serve.scripts.time_breakdown import RequestTimeBreakdown
4849
# isort: on
4950

5051
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -598,6 +599,34 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
598599
write_to_json(pt_file, pt_records)
599600

600601

602+
async def fetch_perf_metrics(base_url: str) -> dict:
    """
    Fetch performance metrics from the /perf_metrics endpoint.

    Args:
        base_url: The base URL of the server

    Returns:
        Dictionary containing the performance metrics, or an empty dict on
        any failure (non-200 status or request error). Errors are printed,
        never raised, so benchmarking can continue best-effort.
    """
    endpoint = f"{base_url}/perf_metrics"

    async with aiohttp.ClientSession(trust_env=True,
                                     timeout=AIOHTTP_TIMEOUT) as session:
        try:
            async with session.get(endpoint) as response:
                # Non-200: report and fall through to the empty-dict result.
                if response.status != 200:
                    print(
                        f"Failed to fetch performance metrics. Status: {response.status}"
                    )
                    return {}
                return await response.json()
        except Exception as e:
            # Best-effort endpoint: swallow and report rather than abort.
            print(f"Error fetching performance metrics: {e}")
            return {}
628+
629+
601630
def main(args: argparse.Namespace):
602631
print(args)
603632
random.seed(args.seed)
@@ -877,6 +906,55 @@ def create_dataset_and_sample(dataset_name: str):
877906
json.dump(result_json, outfile)
878907
save_to_pytorch_benchmark_format(args, result_json, file_name)
879908

909+
# Save per-request breakdown if requested
910+
if args.save_request_time_breakdown:
911+
print("Fetching request performance metrics...")
912+
perf_metrics = asyncio.run(fetch_perf_metrics(base_url))
913+
914+
if perf_metrics:
915+
# Generate filename for perf metrics
916+
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
917+
base_model_id = model_id.split("/")[-1]
918+
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
919+
if args.max_concurrency is not None else "")
920+
perf_filename = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}-perf_metrics.json"
921+
922+
if args.result_dir:
923+
perf_filename = os.path.join(args.result_dir, perf_filename)
924+
925+
# Save perf metrics to JSON file
926+
with open(perf_filename, "w", encoding='utf-8') as outfile:
927+
try:
928+
json.dump(perf_metrics, outfile, indent=2)
929+
except Exception as e:
930+
print(f"Failed to save perf metrics: {e}")
931+
932+
print(f"Request performance metrics saved to: {perf_filename}")
933+
934+
# Create timing diagram from the saved JSON file
935+
try:
936+
analyzer = RequestTimeBreakdown()
937+
938+
print("Creating time diagram from request time breakdown...")
939+
timing_data = analyzer.parse_json_file(perf_filename)
940+
941+
if timing_data:
942+
# Generate HTML filename for the timing diagram
943+
diagram_filename = f"{os.path.splitext(perf_filename)[0]}-time_diagram.html"
944+
analyzer.create_timing_diagram(timing_data,
945+
diagram_filename)
946+
947+
print(f"Time diagram saved to: {diagram_filename}")
948+
else:
949+
print(
950+
"No time data found in request time breakdown - skipping diagram creation."
951+
)
952+
except Exception as e:
953+
print(f"Failed to create time diagram: {e}")
954+
print("Performance metrics were still saved successfully.")
955+
else:
956+
print("Failed to fetch per-request performance metrics.")
957+
880958

881959
if __name__ == "__main__":
882960
parser = FlexibleArgumentParser(
@@ -1260,6 +1338,13 @@ def create_dataset_and_sample(dataset_name: str):
12601338
help="Skip initial test run with a single prompt.",
12611339
)
12621340

1341+
# Opt-in flag for the per-request timing breakdown report.
# NOTE: the endpoint is /perf_metrics (plural) — matches fetch_perf_metrics().
parser.add_argument(
    "--save-request-time-breakdown",
    action="store_true",
    help=
    "After benchmarking, call the /perf_metrics endpoint, save the result as JSON, and create an interactive time breakdown diagram.",
)
1347+
12631348
args = parser.parse_args()
12641349

12651350
main(args)
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# Time Breakdown Tool
2+
3+
A standalone tool for analyzing and visualizing TensorRT-LLM server request time breakdown.
4+
5+
## Overview
6+
7+
The Time Breakdown tool analyzes performance metrics from TensorRT-LLM servers and creates interactive visualizations showing how time is spent processing each request. It supports both aggregated and disaggregated server configurations.
8+
9+
10+
The tool generates:
11+
12+
1. **Interactive HTML Diagram**: A stacked bar chart showing timing breakdown per request with hover tooltips
13+
2. **Statistics**: Median times for each timing segment (optional)
14+
15+
### Example Visualization
16+
17+
![Request Time Breakdown Example](images/request_time_breakdown_example.png)
18+
19+
*Example of the interactive time diagram showing request time breakdown across different processing stages.*
20+
21+
### Timing Metrics
22+
23+
The tool aims to track detailed timing segments throughout the request lifecycle (currently we only track timing segments related to TTFT (Time-To-First-Token); full lifecycle tracking will be added soon):
24+
25+
#### Context/Prefill Stage Metrics
26+
27+
1. **Context Preprocessing** (`ctx_preprocessing`)
28+
- **Time Period**: `server_arrival_time` → `arrival_time`
29+
- **Description**: Python overhead & initialization when the context server receives the request
30+
- **Includes**: Request parsing, pre-processing (e.g., tokenization) before queuing
31+
32+
2. **Context Queue** (`ctx_queue`)
33+
- **Time Period**: `arrival_time` → `first_scheduled_time`
34+
- **Description**: Time spent waiting in queue and resource allocation
35+
- **Includes**: Queueing delay, memory allocation, scheduling wait time
36+
37+
3. **Context Processing** (`ctx_processing`)
38+
- **Time Period**: `first_scheduled_time` → `first_token_time`
39+
- **Description**: Actual prefill computation time
40+
- **Includes**: Model forward pass for the context/prompt tokens
41+
42+
4. **Context Postprocessing** (`ctx_postprocessing`)
43+
- **Time Period**: `first_token_time` → `server_first_token_time`
44+
- **Description**: Time to prepare and send the first token response
45+
- **Includes**: Response preparation, serialization, network overhead
46+
47+
#### Generation/Decode Stage Metrics (Disaggregated Mode Only)
48+
49+
5. **Generation Preprocessing** (`gen_preprocessing`)
50+
- **Time Period**: `gen_server_arrival_time` → `gen_arrival_time`
51+
- **Description**: Python overhead & initialization when generation server receives the request
52+
- **Includes**: Request parsing, KV cache transfer preparation
53+
54+
6. **Generation Queue** (`gen_queue`)
55+
- **Time Period**: `gen_arrival_time` → `gen_first_scheduled_time`
56+
- **Description**: Time spent in queue and resource allocation, including KV cache transfer
57+
- **Includes**:
58+
Queueing delay, KV cache transfer, memory allocation for generation
59+
60+
7. **Generation First Token Postprocessing** (`gen_postprocessing`)
61+
- **Time Period**: `gen_first_scheduled_time` → `gen_server_first_token_time`
62+
- **Description**: Time to generate and send first token from generation server
63+
- **Includes**: Token generation, response preparation
64+
65+
#### Disaggregation Server Metrics
66+
67+
8. **Disaggregation Preprocessing** (`disagg_preprocessing`)
68+
- **Time Period**: `disagg_server_arrival_time` → `ctx_server_arrival_time`
69+
- **Description**: Routing overhead from disagg server to context server
70+
- **Includes**: Request forwarding, network latency
71+
72+
9. **Disaggregation Postprocessing** (`disagg_postprocessing`)
73+
- **Time Period**: `gen_server_first_token_time` → `disagg_server_first_token_time`
74+
- **Description**: Routing overhead from generation server back through disagg server
75+
- **Includes**: Response forwarding, aggregation
76+
## Input Format
77+
78+
The tool expects a JSON file containing an array of request performance metrics (unit: seconds).
79+
80+
### Aggregated Format
81+
82+
```json
83+
[
84+
{
85+
"request_id": 0,
86+
"perf_metrics": {
87+
"timing_metrics": {
88+
"server_arrival_time": 1.000,
89+
"arrival_time": 1.002,
90+
"first_scheduled_time": 1.005,
91+
"first_token_time": 1.025,
92+
"server_first_token_time": 1.027
93+
}
94+
}
95+
}
96+
]
97+
```
98+
99+
### Disaggregated Format
100+
101+
```json
102+
[
103+
{
104+
"ctx_perf_metrics": {
105+
"request_id": 3,
106+
"perf_metrics": {
107+
"timing_metrics": {
108+
"server_arrival_time": 2.000,
109+
"arrival_time": 2.003,
110+
"first_scheduled_time": 2.008,
111+
"first_token_time": 2.035,
112+
"server_first_token_time": 2.038
113+
}
114+
}
115+
},
116+
"gen_perf_metrics": {
117+
"perf_metrics": {
118+
"timing_metrics": {
119+
"server_arrival_time": 2.050,
120+
"arrival_time": 2.052,
121+
"first_scheduled_time": 2.055,
122+
"first_token_time": 2.080,
123+
"server_first_token_time": 2.083
124+
}
125+
}
126+
},
127+
"disagg_server_arrival_time": 1.995,
128+
"disagg_server_first_token_time": 2.090
129+
}
130+
]
131+
```
132+
## Usage
133+
134+
### Integration with Benchmark Serving
135+
Step 1:
136+
Set
137+
```
138+
return_perf_metrics: True
139+
perf_metrics_max_requests: <INTEGER>
140+
```
141+
in the `extra-llm-api-config.yaml`. If you are running disaggregated serving, you should add configs for all servers (disagg, context and generation server).
142+
143+
Step 2:
144+
Add `--save-request-time-breakdown` when running `benchmark_serving.py`
145+
```
146+
python -m tensorrt_llm.serve.scripts.benchmark_serving \
147+
--model ${model_name} \
148+
--dataset-name random \
149+
--ignore-eos \
150+
--num-prompts 1000 \
151+
--random-input-len 1024 \
152+
--random-output-len 2048 \
153+
--random-ids \
154+
--max-concurrency 64 \
155+
--save-result \
156+
--result-dir <RESULT_DIR> \
157+
--percentile-metrics "ttft,tpot,itl,e2e" \
158+
--save-request-time-breakdown
159+
```
160+
You will be able to find the interactive time diagram in `<RESULT_DIR>`.
161+
### As a CLI Tool
162+
Step 1:
163+
Query the perf_metrics.json using the `/perf_metrics` endpoint of the trtllm server (in case of disaggregated serving, you only need to query the disagg server). Make sure the servers have `perf_metrics_max_requests` and `return_perf_metrics` configured.
164+
```
165+
curl -o perf_metrics.json <HOST>:<PORT>/perf_metrics
166+
```
167+
Step 2:
168+
Process the `perf_metrics.json` with `time_breakdown.py`
169+
```bash
170+
# Basic usage - analyze and create time diagram
171+
python time_breakdown.py perf_metrics.json
172+
173+
# Specify custom output file
174+
python time_breakdown.py perf_metrics.json -o my_time_diagram.html
175+
176+
# Show statistics only (no diagram)
177+
python time_breakdown.py perf_metrics.json --stats-only
178+
179+
# Create diagram and show statistics
180+
python time_breakdown.py perf_metrics.json --show-stats
181+
```
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
"""
Time Breakdown Analysis Tool

This module provides tools for analyzing and visualizing request time breakdown
from TensorRT-LLM server performance metrics.
"""

# Re-export the package's public names from the implementation module so
# callers can import them directly from the package root.
from .time_breakdown import (RequestDataParser, RequestTimeBreakdown,
                             TimingMetric, TimingMetricsConfig, main)

# Explicit public API; `main` is included so the CLI entry point is reachable
# via `from ... import *` as well as `python -m`.
__all__ = [
    'TimingMetric',
    'TimingMetricsConfig',
    'RequestDataParser',
    'RequestTimeBreakdown',
    'main',
]
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
"""
Entry point for running time_breakdown as a module.

Usage:
    python -m tensorrt_llm.serve.scripts.time_breakdown perf_metrics.json [options]
"""

from .time_breakdown import main

# Delegate argument parsing and execution to the package's CLI main().
if __name__ == '__main__':
    main()
127 KB
Loading

0 commit comments

Comments
 (0)