
Commit 6cf1c3f

[TRTLLM-8260][feat] Add Server-Client Perf Test in pytest for B200 and B300 (NVIDIA#7985)
Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
1 parent 50149ac commit 6cf1c3f

File tree: 12 files changed (+1786, -1606 lines)


jenkins/L0_Test.groovy

Lines changed: 3 additions & 0 deletions
@@ -2655,6 +2655,9 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8", "l0_dgx_b200", 1, 1, 8],
         "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 1, 4, 1, true],
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
+        // Perf sanity post merge test
+        "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
+        "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "perf_sanity_l0_dgx_b300", 1, 1, 4],
     ]
     fullSet += x86SlurmTestConfigs.keySet()
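Note: the five-element entries above seem to follow the same shape as the neighboring x86SlurmTestConfigs entries. A hedged reading in Python (field meanings inferred from context, not stated in this hunk; the authoritative interpretation is whatever L0_Test.groovy does with them):

```python
from typing import NamedTuple

class StageConfig(NamedTuple):
    platform: str     # Jenkins agent label, e.g. "b200-x4"
    test_list: str    # test-db list name, e.g. "perf_sanity_l0_dgx_b200"
    split_id: int     # which shard of the list this stage runs
    split_count: int  # total number of shards
    gpu_count: int    # GPUs requested for the stage

cfg = StageConfig("b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4)
```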

tests/integration/defs/perf/test_perf.py

Lines changed: 501 additions & 15 deletions
Large diffs are not rendered by default.

tests/integration/defs/perf/utils.py

Lines changed: 58 additions & 0 deletions
@@ -90,9 +90,20 @@ class PerfMetricType(str, Enum):
     set up special threshold criteria for each type of metrics (like >50MB for engine size increase, etc.).
     """
     INFERENCE_TIME = "INFERENCE_TIME"
+    MEDIAN_INFERENCE_TIME = "MEDIAN_INFERENCE_TIME"
+    P99_INFERENCE_TIME = "P99_INFERENCE_TIME"
+    INTER_TOKEN_TIME = "INTER_TOKEN_TIME"
+    MEDIAN_INTER_TOKEN_TIME = "MEDIAN_INTER_TOKEN_TIME"
+    P99_INTER_TOKEN_TIME = "P99_INTER_TOKEN_TIME"
     FIRST_TOKEN_TIME = "FIRST_TOKEN_TIME"
+    MEDIAN_FIRST_TOKEN_TIME = "MEDIAN_FIRST_TOKEN_TIME"
+    P99_FIRST_TOKEN_TIME = "P99_FIRST_TOKEN_TIME"
     OUTPUT_TOKEN_TIME = "OUTPUT_TOKEN_TIME"
+    MEDIAN_OUTPUT_TOKEN_TIME = "MEDIAN_OUTPUT_TOKEN_TIME"
+    P99_OUTPUT_TOKEN_TIME = "P99_OUTPUT_TOKEN_TIME"
     TOKEN_THROUGHPUT = "TOKEN_THROUGHPUT"
+    TOTAL_TOKEN_THROUGHPUT = "TOTAL_TOKEN_THROUGHPUT"
+    USER_THROUGHPUT = "USER_THROUGHPUT"
     BUILD_TIME = "BUILD_TIME"
     BUILD_PEAK_CPU_MEMORY = "BUILD_PEAK_CPU_MEMORY"
     BUILD_PEAK_GPU_MEMORY = "BUILD_PEAK_GPU_MEMORY"
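The new MEDIAN_*/P99_* members name percentile reductions over per-request measurements. A rough illustration with invented numbers, assuming numpy is available (the real aggregation lives in test_perf.py, whose diff is not rendered above):

```python
import numpy as np

# Invented per-request first-token latencies, in milliseconds.
ttft_ms = np.array([12.1, 11.8, 13.4, 55.2, 12.0, 12.3])

median_ttft = np.median(ttft_ms)       # would feed MEDIAN_FIRST_TOKEN_TIME
p99_ttft = np.percentile(ttft_ms, 99)  # would feed P99_FIRST_TOKEN_TIME
```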
@@ -311,6 +322,53 @@ def get_cmd_str(self, cmd_idx) -> List[str]:
         return cmd_str


+class PerfServerClientBenchmarkCmds(NamedTuple):
+    server_cmds: List[str]
+    client_cmds: List[List[str]]
+    names: List[str]
+    working_dir: str
+
+    def wait_for_endpoint_ready(self, url: str, timeout: int = 5400):
+        start = time.monotonic()
+        while time.monotonic() - start < timeout:
+            try:
+                time.sleep(10)
+                if requests.get(url).status_code == 200:
+                    print(f"endpoint {url} is ready")
+                    return
+            except Exception as err:
+                print(f"endpoint {url} is not ready, with exception: {err}")
+        print_error(
+            f"Endpoint {url} did not become ready within {timeout} seconds")
+
+    def run_cmd(self, cmd_idx: int, venv) -> str:
+        output = ""
+        server_file_path = os.path.join(
+            self.working_dir, f"trtllm-serve.{self.names[cmd_idx]}.log")
+        client_file_path = os.path.join(
+            self.working_dir, f"trtllm-benchmark.{self.names[cmd_idx]}.log")
+        try:
+            with (  # Start server process
+                    open(server_file_path, 'w') as server_ctx,
+                    popen(self.server_cmds[cmd_idx],
+                          stdout=server_ctx,
+                          stderr=subprocess.STDOUT,
+                          env=venv._new_env,
+                          shell=True) as server_proc):
+                self.wait_for_endpoint_ready(
+                    "http://localhost:8000/v1/models",
+                    timeout=5400)  # 90 minutes for large models
+                output += subprocess.check_output(self.client_cmds[cmd_idx],
+                                                  env=venv._new_env).decode()
+        finally:
+            server_proc.terminate()
+            server_proc.wait()
+        return output
+
+    def get_cmd_str(self, cmd_idx) -> List[str]:
+        return ["server-benchmark tests, please check config files"]
+
+
 class PerfDisaggScriptTestCmds(NamedTuple):
     ctx_cmd: str
     gen_cmd: str
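A minimal usage sketch for the new NamedTuple; the server command, client argv, names, paths, and the _FakeVenv stand-in are all hypothetical illustrations, not taken from the commit:

```python
import os

class _FakeVenv:
    # run_cmd only reads venv._new_env, so a plain environment dict suffices.
    _new_env = dict(os.environ)

cmds = PerfServerClientBenchmarkCmds(
    # One shell string per test case, launched with shell=True.
    server_cmds=["trtllm-serve /models/deepseek_r1_0528_fp4 --tp_size 4"],
    # One argv list per test case, run via subprocess.check_output.
    client_cmds=[["python", "benchmark_client.py", "--concurrency", "1"]],
    names=["r1_fp4_dep4.con1_iter1_1024_1024"],  # embedded in the log file names
    working_dir="/tmp/perf_logs",
)

# Starts the server, polls http://localhost:8000/v1/models until it returns
# HTTP 200 (or the timeout lapses), runs the client, then tears the server down.
print(cmds.run_cmd(0, _FakeVenv()))
```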
(New file; path not rendered in this capture. The YAML below defines the perf_sanity_l0_dgx_b200 test list.)

Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
+version: 0.0.1
+perf_sanity_l0_dgx_b200:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*b200*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+    terms:
+      stage: pre_merge
+      backend: pytorch
+      orchestrator: mpi
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b200-r1_fp4_dep4:con1_iter1_1024_1024]
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b300-r1_fp4_dep4:con1_iter1_1024_1024]
+
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*b200*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+    terms:
+      stage: post_merge
+      backend: pytorch
+      orchestrator: mpi
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b200-r1_fp4_dep4:con1_iter1_1024_1024]
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b300-r1_fp4_dep4:con1_iter1_1024_1024]
(New file; path not rendered in this capture. The YAML below defines the perf_sanity_l0_dgx_b300 test list.)

Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
+version: 0.0.1
+perf_sanity_l0_dgx_b300:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*gb110*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+    terms:
+      stage: pre_merge
+      backend: pytorch
+      orchestrator: mpi
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b200-r1_fp4_dep4:con1_iter1_1024_1024]
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b300-r1_fp4_dep4:con1_iter1_1024_1024]
+
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*gb110*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+    terms:
+      stage: post_merge
+      backend: pytorch
+      orchestrator: mpi
+  tests:
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b200-r1_fp4_dep4:con1_iter1_1024_1024]
+  - perf/test_perf.py::test_perf[perf_sanity-l0_dgx_b300-r1_fp4_dep4:con1_iter1_1024_1024]
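The bracketed test ids in both lists appear to encode `perf_sanity-<config>-<server config>:<client config>`, matching the server and client names documented in the README below. A hedged parsing sketch (the authoritative parsing lives in test_perf.py):

```python
# Inferred id layout, illustrative only.
test_id = "perf_sanity-l0_dgx_b200-r1_fp4_dep4:con1_iter1_1024_1024"
prefix, pair = test_id.rsplit("-", 1)       # "perf_sanity-l0_dgx_b200", "r1_fp4_dep4:con1_iter1_1024_1024"
server_name, client_name = pair.split(":")  # "r1_fp4_dep4", "con1_iter1_1024_1024"
```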

tests/scripts/perf-sanity/README.md

Lines changed: 76 additions & 80 deletions
@@ -6,7 +6,6 @@ Benchmarking scripts for TensorRT-LLM serving performance tests with configurati
 
 - Run performance benchmarks across multiple model configurations
 - Manage test cases through YAML configuration files
-- Generate comprehensive CSV reports with complete test case coverage
 - Support selective execution of specific test cases
 
 ## Scripts Overview
@@ -16,123 +15,120 @@ Benchmarking scripts for TensorRT-LLM serving performance tests with configurati
 
 **Structure**:
 ```yaml
-test_cases:
-  - id: 1
-    model: "70B-FP8"
-    gpus: 1
-    tp: 1
-    ep: 1
-    attn_backend: "TRTLLM"
-    moe_backend: ""
-    enable_attention_dp: false
-    free_gpu_mem_fraction: 0.9
-    max_batch_size: 512
-    isl: 1024
-    osl: 1024
-    max_num_tokens: 16384
+server_configs:
+  - name: "r1_fp4_dep4"
+    model_name: "deepseek_r1_0528_fp4"
+    tp: 4
+    ep: 4
+    pp: 1
+    attention_backend: "TRTLLM"
+    moe_backend: "CUTLASS"
+    moe_max_num_tokens: ""
+    enable_attention_dp: true
+    enable_chunked_prefill: false
+    max_num_tokens: 2176
+    disable_overlap_scheduler: false
+    kv_cache_dtype: "fp8"
+    enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    max_batch_size: 256
+    enable_padding: true
+    client_configs:
+      - name: "con1_iter1_1024_1024"
+        concurrency: 1
+        iterations: 1
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.0
+      - name: "con8_iter1_1024_1024"
+        concurrency: 8
+        iterations: 1
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.0
+
+  - name: "r1_fp4_tep4"
+    model_name: "deepseek_r1_0528_fp4"
+    tp: 4
+    ep: 4
+    pp: 1
+    attention_backend: "TRTLLM"
+    moe_backend: "CUTLASS"
     moe_max_num_tokens: ""
-  concurrency_iterations:
-    - [1, 10]
-    - [8, 10]
-    - [64, 5]
-    - [512, 2]
+    enable_attention_dp: false
+    enable_chunked_prefill: false
+    max_num_tokens: 2176
+    disable_overlap_scheduler: false
+    kv_cache_dtype: "fp8"
+    enable_block_reuse: false
+    free_gpu_memory_fraction: 0.8
+    max_batch_size: 256
+    enable_padding: true
+    client_configs:
+      - name: "con1_iter1_1024_1024"
+        concurrency: 1
+        iterations: 1
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.0
+      - name: "con8_iter1_1024_1024"
+        concurrency: 8
+        iterations: 1
+        isl: 1024
+        osl: 1024
+        random_range_ratio: 0.0
 ```
 
-**Configuration Fields**:
-- `id`: Unique identifier for the test case
-- `model`: Model name (e.g., "70B-FP8", "Scout-FP4")
-- `gpus`: Number of GPUs to use
-- `tp`: Tensor parallelism size
-- `ep`: Expert parallelism size
-- `attn_backend`: Attention backend ("TRTLLM", "FLASHINFER")
-- `moe_backend`: MoE backend ("DEEPGEMM", "TRTLLM", "CUTLASS", "")
-- `enable_attention_dp`: Enable attention data parallelism
-- `free_gpu_mem_fraction`: GPU memory fraction to reserve
-- `max_batch_size`: Maximum batch size
-- `isl`: Input sequence length
-- `osl`: Output sequence length
-- `max_num_tokens`: Maximum number of tokens
-- `moe_max_num_tokens`: Maximum number of tokens for MoE
-- `concurrency_iterations`: List of [concurrency, iteration] pairs
-
-
 ### 2. `run_benchmark_serve.py` - Main Benchmark Runner
 **Purpose**: Executes performance benchmarks based on YAML configuration files.
 
 **Usage**:
 ```bash
-python run_benchmark_serve.py --output_folder <output_folder> --config_file <config_file> [--skip <skip_pattern>] [--select <select_pattern>]
+python run_benchmark_serve.py --log_folder <log_folder> --config_file <config_file> [--select <select_pattern>] [--timeout 5400]
 ```
 
 **Arguments**:
-- `--output_folder`: Directory to store benchmark results (required)
+- `--log_folder`: Directory to store benchmark logs (required)
 - `--config_file`: Path to YAML configuration file (required)
-- `--skip`: Skip pattern for specific test cases/concurrencies (optional, default: no skipping)
-- `--select`: Select pattern for specific test cases/concurrencies (optional, default: all test cases)
+- `--select`: Select pattern naming specific server and client configs (optional, default: all test cases)
+- `--timeout`: Timeout for server setup (optional, default: 3600 seconds)
 
 **Examples**:
 ```bash
-# Run all test cases
-python run_benchmark_serve.py --output_folder results --config_file benchmark_config.yaml --skip default --select default
-
-# Skip specific test cases
-python run_benchmark_serve.py --output_folder results --config_file benchmark_config.yaml --skip "2-1,4"
-
-# Run specific concurrencies from specific test cases
-python run_benchmark_serve.py --output_folder results --config_file benchmark_config.yaml --select "1,2-3"
+# Select specific server:client combinations
+python run_benchmark_serve.py --log_folder ./results --config_file benchmark_config.yaml --select "r1_fp4_dep4:con8_iter1_1024_1024,r1_fp4_tep4:con1_iter1_1024_1024"
 
 ```
 
-**Skip Pattern**:
-Format: `"test_case1,test_case2,test_case3"` or `"test_case1-concurrency1,test_case2-concurrency3"`
-- `"2,4"`: Skip test cases 2 and 4 entirely
-- `"2-1,4-2"`: Skip test case 2's 1st concurrency and test case 4's 2nd concurrency
-- `"default"` or empty: No skipping (default)
-
-**Select Pattern**:
-Format: `"test_case1,test_case2,test_case3"` or `"test_case1-concurrency1,test_case2-concurrency3"`
-- `"1,3,5"`: Run only test cases 1, 3, and 5 (all concurrencies)
-- `"1-1,2-3"`: Run test case 1's 1st concurrency and test case 2's 3rd concurrency
-- `"default"` or empty: Run all test cases (default)
-
-
 ### 3. `parse_benchmark_results.py` - Results Parser
-**Purpose**: Parses benchmark log files and generates comprehensive CSV reports with all test cases from the configuration file.
-
-**Usage**:
-```bash
-python parse_benchmark_results.py --input_folder <input_folder> --output_csv <output_csv> --config_file <config_file>
-```
+**Purpose**: Parses benchmark logs and prints the measured performance.
 
 **Arguments**:
-- `input_folder`: Folder containing benchmark log files (serve.*.log) (required)
-- `output_csv`: Output CSV filename for the results table (required)
-- `config_file`: Path to benchmark_config.yaml file (required)
+- `--log_folder`: Directory containing benchmark logs (required)
 
-**Examples**:
+**Usage**:
 ```bash
-python parse_benchmark_results.py --config_file ./benchmark_logs --output_csv results.csv --input_folder ./benchmark_config.yaml
-
+python parse_benchmark_results.py --log_folder <log_folder>
 ```
 
+
 ### 4. `benchmark-serve.sh` - SLURM Job Script
 **Usage**:
 ```bash
-sbatch benchmark-serve.sh [IMAGE] [bench_dir] [output_dir] [select_pattern] [skip_pattern]
+sbatch benchmark-serve.sh [IMAGE] [bench_dir] [log_folder] [select_pattern]
 ```
 
 **Parameters**:
 - `IMAGE`: Docker image (default: tensorrt-llm-staging/release:main-x86_64)
 - `bench_dir`: Directory containing config file and benchmark scripts (default: current directory)
-- `output_dir`: Directory containing output logs and csv. (default: current directory)
+- `log_folder`: Directory for output logs and CSV files (default: current directory)
 - `select_pattern`: Select pattern (default: default - all test cases)
-- `skip_pattern`: Skip pattern (default: default - no skipping)
 
 **Examples**:
 ```bash
 
 bench_dir="/path/to/benchmark/scripts"
-output_dir="/path/to/store/output/files"
-sbatch --reservation=RES--COM-3970 --qos=reservation -D ${output_dir} ${bench_dir}/benchmark-serve.sh urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release:main-x86_64 ${bench_dir} ${output_dir} "1-1" ""
+log_folder="/path/to/store/output/files"
+sbatch --reservation=RES--COM-3970 --qos=reservation -D ${log_folder} ${bench_dir}/benchmark-serve.sh urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/release:main-x86_64 ${bench_dir} ${log_folder} "r1_fp4_dep4:con8_iter1_1024_1024,r1_fp4_tep4:con1_iter1_1024_1024"
 
 ```
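To make the new layout concrete, here is a minimal sketch of pairing `server_configs`/`client_configs` with a `--select` value. This is not the real run_benchmark_serve.py; it assumes PyYAML and a hypothetical benchmark_config.yaml in the working directory:

```python
import yaml  # PyYAML, assumed available

with open("benchmark_config.yaml") as f:
    config = yaml.safe_load(f)

# --select pairs "server:client" names, comma separated.
select = "r1_fp4_dep4:con8_iter1_1024_1024,r1_fp4_tep4:con1_iter1_1024_1024"
wanted = {tuple(item.split(":")) for item in select.split(",")}

for server in config["server_configs"]:
    for client in server["client_configs"]:
        if (server["name"], client["name"]) in wanted:
            print(f"run {server['name']} with {client['name']} "
                  f"(concurrency={client['concurrency']}, "
                  f"isl={client['isl']}, osl={client['osl']})")
```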
