Closed
Changes from all commits
443 commits
f1b286b
[TPU] Update ptxla nightly version to 20250724 (#21555)
Jul 26, 2025
7ae75fa
[Feature] Add support for MoE models in the calibration-free RTN-base…
sakogan Jul 26, 2025
62965de
[Model] Ultravox: Support Llama 4 and Gemma 3 backends (#17818)
farzadab Jul 26, 2025
97349fe
[Docs] add offline serving multi-modal video input expamle Qwen2.5-VL…
david6666666 Jul 26, 2025
a55c950
Correctly kill vLLM processes after finishing serving benchmarks (#21…
huydhn Jul 26, 2025
2f6e6b3
[Bugfix] Fix isinstance check for tensor types in _load_prompt_embeds…
Mitix-EPI Jul 26, 2025
7728dd7
[TPU][Test] Divide TPU v1 Test into 2 parts. (#21431)
QiliangCui Jul 26, 2025
875af38
Support Intern-S1 (#21628)
lvhan028 Jul 26, 2025
05c1126
[Misc] remove unused try-except in pooling config check (#21618)
reidliu41 Jul 26, 2025
e98def4
[Take 2] Correctly kill vLLM processes after benchmarks (#21646)
huydhn Jul 26, 2025
9d19728
Migrate AriaImagePixelInputs to TensorSchema for shape validation (#2…
bbeckca Jul 26, 2025
de10ff0
Migrate AyaVisionImagePixelInputs to TensorSchema for shape validatio…
bbeckca Jul 26, 2025
f27fdfc
[Bugfix] Investigate Qwen2-VL failing test (#21527)
Isotr0py Jul 26, 2025
1cd6eab
Support encoder-only models without KV-Cache (#21270)
maxdebayser Jul 26, 2025
c215f5c
[Bug] Fix `has_flashinfer_moe` Import Error when it is not installed …
yewentao256 Jul 26, 2025
a40a850
[Misc] Improve memory profiling debug message (#21429)
yeqcharlotte Jul 26, 2025
97d6c30
[BugFix] Fix shared storage connector load kv only load attention lay…
david6666666 Jul 26, 2025
56e544f
[Refactor] Remove `moe_align_block_size_triton` (#21335)
yewentao256 Jul 26, 2025
9094d11
[Bugfix][Apple Silicon] fix missing symbols when build from source on…
zhouyeju Jul 26, 2025
e7c4f9e
[CI/Build][Doc] Move existing benchmark scripts in CI/document/exampl…
yeqcharlotte Jul 26, 2025
de509ae
[NVIDIA] Explicitly disable shuffled weights for flashinfer blockscal…
kaixih Jul 26, 2025
6c66f28
Remove xformers requirement for Mistral-format Pixtral and Mistral3 (…
wenchen76 Jul 26, 2025
c657369
support `torch.compile` for bailing moe (#21664)
jinzhen-lin Jul 26, 2025
ccf27cc
Migrate Blip2ImagePixelInputs and Blip2ImageEmbeddingInputs to Tensor…
bbeckca Jul 27, 2025
0b8caf9
Migrate DeepseekVL2ImageInputs to TensorSchema (#21658)
bbeckca Jul 27, 2025
3339cba
Migrate FuyuImagePatchInputs to TensorSchema (#21662)
bbeckca Jul 27, 2025
20950b2
Migrate ChameleonImagePixelInputs to TensorSchema (#21657)
bbeckca Jul 27, 2025
eed2f46
[VLM] Support HF format Phi-4-MM model (#17121)
Isotr0py Jul 27, 2025
971948b
Handle non-serializable objects in vllm bench (#21665)
huydhn Jul 27, 2025
01a395e
[CI/Build][Doc] Clean up more docs that point to old bench scripts (#…
yeqcharlotte Jul 27, 2025
a8936e5
Refactor: Remove numpy dependency from LoggingStatLogger (#20529)
skyloevil Jul 27, 2025
1cbf951
[Misc] add default value for file pattern arg (#21659)
andyxning Jul 27, 2025
5f8c9a4
Migrate Florence2ImagePixelInputs to TensorSchema (#21663)
bbeckca Jul 27, 2025
3d847a3
[VLM] Add video support for Intern-S1 (#21671)
Isotr0py Jul 27, 2025
bda9d05
[Refactor] Refactor MOE NVFP4 Code Base: ModelOpt + Compressed Tensor…
yewentao256 Jul 27, 2025
57c22e5
Fix CUDA permute/unpermute for use with DeepGemm Moe (#17934)
CalebDu Jul 27, 2025
a9b2a1d
[Misc] Refactor vllm config str (#21666)
andyxning Jul 27, 2025
8f605ee
[Attention] Make CutlassMLA the default backend for SM100 (blackwell)…
alexm-redhat Jul 27, 2025
86ae693
[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (…
DarkLight1337 Jul 28, 2025
82acf21
Fix typo for limit-mm-per-prompt in docs (#21697)
joa-stdn Jul 28, 2025
93269bb
Fix GLM tool parser (#21668)
zRzRzRzRzRzRzR Jul 28, 2025
04ff4be
[Misc] Add fused_moe configs for Qwen3-Coder-480B-A35B-Instruct-FP8 …
jeejeelee Jul 28, 2025
15a72ac
[V1] Exception Handling when Loading KV Cache from Remote Store (#21534)
liuyumoye Jul 28, 2025
c7ffe93
[Model] Support TP/PP/mamba2 kernel for PLaMo2 (#19674)
Alnusjaponica Jul 28, 2025
e626d28
[FEAT] [ROCm] [AITER]: Add AITER HIP block quant kernel (#21242)
tjtanaa Jul 28, 2025
d8937de
Migrate Gemma3ImagePixelInputs to TensorSchema (#21676)
bbeckca Jul 28, 2025
88e46c7
Migrate Glm4vImageInputs, Glm4vVideoInputs to TensorSchema (#21678)
bbeckca Jul 28, 2025
304dcdf
Migrate GLMVImagePixelInputs to TensorSchema (#21679)
bbeckca Jul 28, 2025
75856bc
Migrate GraniteSpeechAudioInputs to TensorSchema (#21682)
bbeckca Jul 28, 2025
3ea57a5
Migrate Idefics3ImagePixelInputs and Idefics3ImageEmbeddingInputs to …
bbeckca Jul 28, 2025
7656cf4
[Bugfix] [issue-21565] Fix the incompatibility issue with stream and …
hsliuustc0106 Jul 28, 2025
18cc33d
[bugfix] fix profile impact benchmark results (#21507)
lengrongfu Jul 28, 2025
139a97e
[Bugfix] Fix shape checking for Fuyu (#21709)
DarkLight1337 Jul 28, 2025
150d9e6
[Bugfix] fix max-file-size type from str to int (#21675)
andyxning Jul 28, 2025
139a7f0
[BugFix] Fix ChunkedLocalAttention when the hybrid kv-cache is disabl…
LucasWilkinson Jul 28, 2025
a6c0502
[v1][mamba] Added mamba_type into MambaSpec (#21715)
Josephasafg Jul 28, 2025
d128d0d
Migrate KeyeImageInputs and KeyeVideoInputs to TensorSchema (#21686)
bbeckca Jul 28, 2025
a4ed731
[Model] Prioritize Transformers fallback over suffix matching (#21719)
DarkLight1337 Jul 28, 2025
2cc5711
[feature] add log non default args in LLM (#21680)
lengrongfu Jul 28, 2025
1b769dc
[Bugfix] Fix Ernie4_5_MoeForCausalLM shared experts (#21717)
jeejeelee Jul 28, 2025
65e8466
[Bugfix] Fix environment variable setting in CPU Dockerfile (#21730)
bigPYJ1151 Jul 28, 2025
0ae970e
[Bugfix] Fix glm4.1v video_grid_thw tensor shape scheme (#21744)
Isotr0py Jul 28, 2025
63fe3a7
[PD] let p2p nccl toy proxy handle /chat/completions (#21734)
chaunceyjiang Jul 28, 2025
656c24f
[`Ernie 4.5`] Name Change for Base 0.3B Model (#21735)
vasqu Jul 28, 2025
9ace2ea
[Bugfix] Improve JSON extraction in LlamaToolParser (#19024)
key4ng Jul 28, 2025
1395dd9
[Docs] Add revision date to rendered docs (#21752)
hmellor Jul 28, 2025
bccc43c
[Bugfix]check health for engine core process exiting unexpectedly (#2…
wuhang2014 Jul 28, 2025
31084b3
[Bugfix][CI/Build] Update peft version in test requirement (#21729)
Isotr0py Jul 28, 2025
34a20c4
[Logs] Change flashinfer sampler logs to once (#21759)
mgoin Jul 28, 2025
0e18a5d
[Misc] Reduce logs for model resolution (#21765)
DarkLight1337 Jul 28, 2025
25708d3
[Bugfix] Mistral crashes on tool with no description (#21167)
HugoMichard Jul 28, 2025
04fe61a
[CI/Build] Fix plugin tests (#21758)
DarkLight1337 Jul 28, 2025
ec261b0
[XPU] IPEX-optimized Punica Wrapper on XPU (#21703)
chaojun-zhang Jul 28, 2025
e17a4d3
[Bugfix] Fix granite speech shape validation (#21762)
DarkLight1337 Jul 28, 2025
7d44c69
[P/D] Log warnings related to prefill KV expiry (#21753)
njhill Jul 28, 2025
94b71ae
Use `metavar` to list the choices for a CLI arg when custom values ar…
hmellor Jul 28, 2025
01c753e
update flashinfer to v0.2.9rc2 (#21701)
weireweire Jul 28, 2025
b361f14
[AMD][BugFix] Fix omission of wvSplitK kernel for small batch sizes …
rasmith Jul 28, 2025
e0e58f9
[Bug] Enforce contiguous input for `dynamic_scaled_fp8_quant` and `st…
yewentao256 Jul 28, 2025
9ba1c88
[AMD][CI/Build] Fix the AMD issue caused by inappropriate of symbol e…
houseroad Jul 28, 2025
b18b417
Revert "[V1] Exception Handling when Loading KV Cache from Remote Sto…
KuntaiDu Jul 28, 2025
c6f36cf
[Bugfix] DeepGEMM is not enabled on B200 due to `_lazy_init()` (#21472)
smarterclayton Jul 28, 2025
89ac266
[Feat]: Add support for Dynamic Quant 4 bit CPU kleidiai kernels (#17…
nikhil-arm Jul 28, 2025
8aa1485
[Perf] Disable chunked local attention by default with llama4 (#21761)
LucasWilkinson Jul 28, 2025
c6c9122
[Kernel] SM90 CUTLASS FP8 GEMM: add support for swap AB + kernel tuni…
LyrisZhong Jul 28, 2025
947e982
[Docs] Minimize spacing for supported_hardware.md table (#21779)
mgoin Jul 29, 2025
48b763d
[Refactor] Merge Compressed Tensor FP8 `CompressedTensorsW8A8Fp8MoEMe…
yewentao256 Jul 29, 2025
afa2607
[CI] Parallelize Kernels MoE Test (#21764)
mgoin Jul 29, 2025
e18f085
skip fusedmoe layer for start_load_kv (#21378)
calvin0327 Jul 29, 2025
12a223e
[AMD][CI/Build][Bugfix] Guarding CUDA specific functions by ifndef RO…
gshtras Jul 29, 2025
f1e2c09
Migrate InternVLImageInputs and InternVLVideoInputs to TensorSchema (…
bbeckca Jul 29, 2025
7234fe2
[Misc] Rework process titles (#21780)
njhill Jul 29, 2025
a248025
[Doc] Link to RFC for pooling optimizations (#21806)
DarkLight1337 Jul 29, 2025
a4528f0
[Model]: Fused MoE for nomic-embed-text-v2-moe (#18321)
Isotr0py Jul 29, 2025
37efc63
[V0 deprecation] Guided decoding (#21347)
rzabarazesh Jul 29, 2025
61a6905
[Model] Refactor JambaForCausalLM (#21394)
jeejeelee Jul 29, 2025
2470419
[Docs] Fix the outdated URL for installing from vLLM binaries (#21523)
yankay Jul 29, 2025
755fa8b
[KVCache] Make KVCacheSpec hashable (#21791)
heheda12345 Jul 29, 2025
ab71413
[Doc] Update compatibility matrix for pooling and multimodal models (…
DarkLight1337 Jul 29, 2025
04e3850
[Bugfix] VLLM_V1 supports passing other compilation levels (#19340)
zou3519 Jul 29, 2025
f693b06
[Docs] Merge design docs for a V1 only future (#21832)
hmellor Jul 29, 2025
759b87e
[TPU] Add an optimization doc on TPU (#21155)
bvrockwell Jul 29, 2025
ad341c5
[Bugfix]fix mixed bits and visual language model quantization in Auto…
wenhuach21 Jul 29, 2025
58b11b2
[Bugfix] Fix workspace buffer None issue for Flashinfer TRTLLM Backen…
elvischenv Jul 29, 2025
37f86d9
[Docs] use `uv` in GPU installation docs (#20277)
davidxia Jul 29, 2025
f03e9cf
[Doc] Add FusedMoE Modular Kernel Documentation (#21623)
varun-sundar-rabindranath Jul 29, 2025
7b49cb1
[Doc] update Contributing page's testing section (#18272)
davidxia Jul 29, 2025
a33ea28
Add `flashinfer_python` to CUDA wheel requirements (#21389)
mgoin Jul 29, 2025
a1873db
docker: docker-aware precompiled wheel support (#21127)
dougbtv Jul 29, 2025
176bbce
Revert "[AMD][CI/Build] Fix the AMD issue caused by inappropriate of …
gshtras Jul 29, 2025
9266d98
[BugFix] Fix interleaved sliding window not set for Gemma3n (#21863)
sarckk Jul 29, 2025
0d0cc9e
[ci] add b200 test placeholder (#21866)
simon-mo Jul 30, 2025
452b2a3
[ci] mark blackwell test optional for now (#21878)
simon-mo Jul 30, 2025
0e36abf
[Bugfix] Correct max tokens for non-contiguous embeds (#21798)
milesial Jul 30, 2025
555e722
[v1][attention] Support Hybrid Allocator + FlashInfer (#21412)
heheda12345 Jul 30, 2025
ba5c5e5
[Docs] Switch to better markdown linting pre-commit hook (#21851)
hmellor Jul 30, 2025
76080cf
[DOC] Fix path of v1 related figures (#21868)
heheda12345 Jul 30, 2025
fb58e3a
[Docs] Update docker.md with HF_TOKEN, new model, and podman fix (#21…
mgoin Jul 30, 2025
b917da4
Expose PyTorch profiler configuration to environment variables (#21803)
Csrayz Jul 30, 2025
fdde182
[Bugfix] Fix shape mismatch assertion error when loading Gemma3n mode…
sydarb Jul 30, 2025
b7b23da
[Bugfix] Fix comment typo of get_num_common_prefix_blocks() (#21827)
MingzhenHan Jul 30, 2025
44bc46d
[Bugfix] Actually disable processing cache when API server is scaled …
DarkLight1337 Jul 30, 2025
1b0a155
[Perf] Using `__nv_fp8_e4m3` instead of `c10::e4m3` for `per_token_gr…
yewentao256 Jul 30, 2025
65f311c
[Frontend] Add LLM.reward specific to reward models (#21720)
noooop Jul 30, 2025
05cbbe2
[XPU] use `ZE_AFFINITY_MASK` for device select on xpu (#21815)
jikunshang Jul 30, 2025
e3bc17c
Add @sighingnow as maintainer of qwen's related files. (#21895)
sighingnow Jul 30, 2025
16f3250
[CI/Build] Fix pre-commit failure in docs (#21897)
DarkLight1337 Jul 30, 2025
4cd7fe6
[Docs] Expand introduction to Ray in Multi-node deployment section (#…
crypdick Jul 30, 2025
6f8d261
Update vLLM Benchmark Suite for Xeon based on 0.9.2 release (#21486)
louie-tsai Jul 30, 2025
2ca5f82
[Misc] Remove redundant config definitions (#21891)
DarkLight1337 Jul 30, 2025
02f82fe
[Doc] Update Intern-S1 info (#21908)
jeejeelee Jul 30, 2025
30ef30e
[CI] rollback lint-and-deploy pipeline using amd machine (#21912)
kebe7jun Jul 30, 2025
5477952
[Tests] Fixing bug inside MultiModalProfiler. (#21842)
shenoyvvarun Jul 30, 2025
fc91da5
[Model] Remove DSV2 unused code (#21903)
jeejeelee Jul 30, 2025
533db09
[benchmark] add max-concurrency in result table (#21095)
panpan0000 Jul 30, 2025
5bbaf49
[Doc] Update partial support (#21916)
DarkLight1337 Jul 30, 2025
5c8fe38
[Docs] Fix the example code of streaming chat completions in reasonin…
hsliuustc0106 Jul 30, 2025
1398636
Add @patrickvonplaten as maintainer of mistral's related files. (#21928)
patrickvonplaten Jul 30, 2025
b876860
[Hardware][CPU] Build fix for ARM without BF16 (#21848)
ericcurtin Jul 30, 2025
d979dd6
[Feature][EPLB] Add eplb support for Qwen3 (#20815)
aladerran Jul 30, 2025
fcfd1eb
[Doc] Remove vLLM prefix and add citation for PagedAttention (#21910)
DarkLight1337 Jul 30, 2025
da3e0bd
[Bugfix] we should use metavar is not choices (#21902)
lengrongfu Jul 30, 2025
bf668b5
[Feature] Support multiple api keys in server (#18548)
Yanpas Jul 30, 2025
e91d3c9
[misc] skip p2p check by default (#21904)
youkaichao Jul 30, 2025
0271c2f
[Test] Add Benchmark and Unit Test for `per_token_group_quant` (#21860)
yewentao256 Jul 30, 2025
0e40b26
[CI/Build] Only run markdownlint in CI (#21892)
DarkLight1337 Jul 30, 2025
36ede45
Reduce time wasted in GitHub Actions using `concurrency` (#21919)
hmellor Jul 30, 2025
8f4a1c9
[Misc] Improve code readability of KVCacheManager (#21673)
tanruixiang Jul 30, 2025
ff08e51
[NVIDIA] Fix Llama4 Scout FP4 functionality issues (#21499)
nvpohanh Jul 30, 2025
88edf59
[Docs] Reduce the size of the built docs (#21920)
hmellor Jul 30, 2025
6e599ee
[Bugfix] Fix OOM tests in initialization test (#21921)
Isotr0py Jul 30, 2025
366f6b3
[Bugfix] Fix multi-api server not working for text models (#21933)
DarkLight1337 Jul 30, 2025
ad51030
Override attention metadata for fast prefill in some KV sharing setup…
sarckk Jul 30, 2025
5c765ae
[Bugfix] Fix TypeError in scheduler when comparing mixed request_id t…
chi2liu Jul 30, 2025
004203e
[CI/Build] Fix registry tests (#21934)
DarkLight1337 Jul 30, 2025
4904e53
[Bugfix] SharedStorage Connector for V1 PD multimodal (#21611)
fake0fan Jul 30, 2025
f413523
feat(distributed): add `get_required_kvcache_layout` class method to …
wxsms Jul 30, 2025
8f0d516
[TPU] Support Pathways in vLLM (#21417)
wenxindongwork Jul 30, 2025
56bd537
[Misc] Support more collective_rpc return types (#21845)
njhill Jul 30, 2025
b9b753e
For VLLM_USE_PRECOMPILED, only compiled .so files should be extracted…
dougbtv Jul 30, 2025
f12d925
[Misc] Use dracut on CentOS and skip clone if repo exists for EP kern…
minosfuture Jul 30, 2025
287f527
[Feature] Add async tensor parallelism for scaled mm (#20155)
cascade812 Jul 30, 2025
601f856
[Bugfix] Fix None value handling in trace span creation for cancelled…
br4mm Jul 30, 2025
ca9e2be
[Core] Move EngineCoreRequest to Request conversion out of EngineCore…
linzebing Jul 30, 2025
9cb497b
[Example] Add `async_llm_streaming.py` example for AsyncLLM streaming…
mgoin Jul 31, 2025
ec02e53
[Bugfix] Relax lang pin for voxtral (#21833)
sanchit-gandhi Jul 31, 2025
6144545
[UX] Rename CUTLASS_MLA_VLLM_V1 to CUTLASS_MLA (#21966)
mgoin Jul 31, 2025
0f7919f
[Misc] Expand SUPPORTED_HIDDEN_SIZES for DeepEP low-latency kernels …
jeejeelee Jul 31, 2025
055bd39
[CI Bugfix] Fix CI OOM for `test_shared_storage_connector_hashes` (#2…
mgoin Jul 31, 2025
3e36fcb
[Bugfix]: fix metadata file copy in test_sharded_state_loader (#21830)
andyxning Jul 31, 2025
9532a6d
[Deprecation] Remove deprecated args and methods (#21907)
DarkLight1337 Jul 31, 2025
d2aab33
[CI/Build] get rid of unused VLLM_FA_CMAKE_GPU_ARCHES (#21599)
dtrifiro Jul 31, 2025
2836dd7
[Model][CI] Let more pooling models support v1 (#21747)
noooop Jul 31, 2025
5daffe7
[BugFix] Fix case where `collective_rpc` returns `None` (#22006)
njhill Jul 31, 2025
207b750
[NVIDIA] Add SM100 Flashinfer MoE per tensor scale fp8 backend (#21458)
amirkl94 Jul 31, 2025
9484641
[Model] Add step3 vl (#21998)
Oliver-ss Jul 31, 2025
7349d52
[ez] Remove a trailing space from compilation/decorators.py (#22028)
zhxchen17 Jul 31, 2025
58bb902
fix(setup): improve precompiled wheel setup for Docker builds (#22025)
dougbtv Jul 31, 2025
0780bb5
Removing amdproduction Tests (#22027)
Alexei-V-Ivanov-AMD Jul 31, 2025
53c21e4
Update torch_xla pin to 20250730 (#21956)
vanbasten23 Jul 31, 2025
9e0726e
[Meta] Official Eagle mm support, first enablement on llama4 (#20788)
morgendave Jul 31, 2025
71470bc
[Misc] Add unit tests for chunked local attention (#21692)
sarckk Jul 31, 2025
2dff2e2
[Bugfix] Fix MTP weight loading (#21941)
benchislett Jul 31, 2025
6e672da
Add FlashInfer allreduce RMSNorm Quant fusion (#21069)
ilmarkov Jul 31, 2025
c3e0e93
[Feature] Add Flashinfer MoE Support for Compressed Tensor NVFP4 (#21…
yewentao256 Jul 31, 2025
e360316
Add DeepGEMM to Dockerfile in vllm-base image (#21533)
MatthewBonanni Aug 1, 2025
0bd409c
Move flashinfer-python to optional extra `vllm[flashinfer]` (#21959)
mgoin Aug 1, 2025
3700642
[Refactor] Remove Duplicate `per_block_cast_to_fp8`, Remove Dependenc…
yewentao256 Aug 1, 2025
ad57f23
[Bugfix] Fix: Fix multi loras with tp >=2 and LRU cache (#20873)
charent Aug 1, 2025
82de9b9
[Misc] Automatically resolve HF processor init kwargs (#22005)
DarkLight1337 Aug 1, 2025
e1a7fe4
[BugFix] fix: aot passes kvcache dtype information (#19750)
mickaelseznec Aug 1, 2025
0f46a78
[Model] [Quantization] Support quantization for Gemma3n (#21974)
kylesayrs Aug 1, 2025
61dcc28
[Doc] Add Voxtral to Supported Models page (#22059)
DarkLight1337 Aug 1, 2025
53d7c39
Update sampling_metadata.py (#21937)
Aviadr-neureality Aug 1, 2025
79731a7
[Doc] Fix a syntax error of example code in structured_outputs.md (#2…
hsliuustc0106 Aug 1, 2025
b4e081c
[Bugfix] Disable multi-modal preprocessor cache for DP (#21896)
DarkLight1337 Aug 1, 2025
e0f63e4
[Core] Avoid repeated len(block_token_ids) check in hash_request_toke…
linzebing Aug 1, 2025
98df153
[Frontend] Align tool_choice="required" behavior with OpenAI when too…
n0gu-furiosa Aug 1, 2025
da31f6a
Revert precompile wheel changes (#22055)
simon-mo Aug 1, 2025
27a145e
[Doc] Add example for Step3-VL (#22061)
Aug 1, 2025
e6680f9
[Bugfix] Add log prefix in non-dp mode engine core (#21889)
wuhang2014 Aug 1, 2025
0f81b31
[Misc] Remove upper bound in openai package version (#22060)
WoosukKwon Aug 1, 2025
4931486
[Doc] Added warning of speculating with draft model (#22047)
david6666666 Aug 1, 2025
28b18cc
[Quantization] Enable BNB support for InternS1 (#21953)
jeejeelee Aug 1, 2025
87c94bc
Revert "Update sampling_metadata.py (#21937)" (#22088)
hmellor Aug 1, 2025
dfbc1f8
[Speculative Decoding] Add `speculators` config support (#21345)
dsikka Aug 1, 2025
26b5f7b
[BUG] [ROCm] Fix import bug on ROCm (#22083)
tjtanaa Aug 1, 2025
fb0e0d4
Fix `get_kwargs` for case where type hint is `list[Union[str, type]]`…
hmellor Aug 1, 2025
f81c1bb
[Bugfix] Check NVIDIA artifactory is accessible before using flashinf…
mgoin Aug 1, 2025
0a6d305
feat(multimodal): Add customizable background color for RGBA to RGB c…
ahengljh Aug 1, 2025
5c54d97
[Bugfix][PD] set max_completion_tokens=1 if req has this value (#21841)
Abirdcfly Aug 1, 2025
a59cd9d
[Refactor] Fix Compile Warning #1444-D (#21462)
yewentao256 Aug 1, 2025
8026a33
[BugFix] Update AttnFusionPass cache key (#21947)
zou3519 Aug 1, 2025
3146519
[BugFix] Don't change title of top-level process (#22032)
njhill Aug 1, 2025
97608dc
[Docs] use `uv` in CPU installation docs (#22089)
davidxia Aug 1, 2025
2d7b09b
Deprecate `--disable-log-requests` and replace with `--enable-log-req…
hmellor Aug 1, 2025
326a1b0
Improve documentation of `ModelConfig.try_get_generation_config` to p…
hmellor Aug 1, 2025
3f8e952
[Bugfix] Fix glm4.1v video inference issue (#22067)
Isotr0py Aug 1, 2025
b879ecd
[Bugfix] fix when skip tokenizer init (#21922)
lengrongfu Aug 1, 2025
d666466
security policy: take 1 (#21119)
sidhpurwala-huzaifa Aug 1, 2025
ac45c44
[Bugfix] [Performance] DeepEPHighThroughput + DeepSeek : Quant before…
varun-sundar-rabindranath Aug 1, 2025
38c8bce
Enable headless models for pooling in the Transformers backend (#21767)
hmellor Aug 1, 2025
8d70599
[Misc] Minor enhancement of benchmark_moe (#22068)
jeejeelee Aug 1, 2025
3277e8f
Fix pre-commit failure for SECURTIY.md (#22102)
mgoin Aug 1, 2025
9659bc7
[compile][startup] Disable C++ compilation of symbolic shapes (#20836)
anijain2305 Aug 1, 2025
d331759
Introduce RayPPCommunicator for ray-based PP (#21660)
ruisearch42 Aug 1, 2025
d84b97a
Add lora test for tp>1 case for TPU. (#21970)
vanbasten23 Aug 1, 2025
881e1af
[BugFix] Harden distributed DP startup (#21538)
njhill Aug 1, 2025
88faa46
[CI] Initial tests for SM100 Blackwell runner (#21877)
mgoin Aug 1, 2025
eefbf4a
[Perf] Optimize `reshape_and_cache_flash` CUDA Kernel (#22036)
yewentao256 Aug 1, 2025
3654847
feat: Add Support GPTQ Quantization MOE on ROCM vllm serve (#21733)
JartX Aug 2, 2025
2332243
[V1][CUDA] Full cudagraph support for FlashInfer (#21367)
fhl2000 Aug 2, 2025
ee2eb6e
[Model] Qwen2.5 VL SiLU-and-Mul (#22066)
vllmellm Aug 2, 2025
5739371
[Misc] `VLLM_TARGET_DEVICE.lower()` (#22101)
NickLucche Aug 2, 2025
a65f46b
[Misc] DeepGemmExperts : Avoid JIT generation in the hot-path (#21955)
varun-sundar-rabindranath Aug 2, 2025
9f9c38c
[Speculators][Speculative Decoding] Add Qwen Eagle3 Support (#21835)
dsikka Aug 2, 2025
8d524ce
[BugFix] Improve internal DP load balancing (#21617)
njhill Aug 2, 2025
6e8d8c4
[Test] Add Unit Test for Batched DeepGEMM (#21559)
yewentao256 Aug 2, 2025
0edaf75
[Attention][DBO] Add support for "splitting" the CommonAttentionMetad…
SageMoore Aug 2, 2025
d3a6f21
[FEAT][ROCm] Enable running Flash Attention as ViT attn backend for Q…
vllmellm Aug 2, 2025
4ac8437
[Misc] Getting and passing ray runtime_env to workers (#22040)
ruisearch42 Aug 2, 2025
8564dc9
Fix test_kv_sharing_fast_prefill flakiness (#22038)
sarckk Aug 2, 2025
c64861d
[Bugfix] Mamba2 remove bugged initial state condition in chunk scan (…
cyang49 Aug 2, 2025
067c34a
docs: remove deprecated disable-log-requests flag (#22113)
Aug 2, 2025
58eee5f
[PERF] Use faster way of decode in tokenizer: avoid useless list-to-l…
vadiklyutiy Aug 2, 2025
25373b6
for glm-4.1V update (#22000)
zRzRzRzRzRzRzR Aug 2, 2025
b690e34
[Model] Mamba2 preallocate SSM output tensor to avoid d2d copy overhe…
cyang49 Aug 2, 2025
f5d0f47
[Frontend] Improve error message for too many mm items (#22114)
DarkLight1337 Aug 2, 2025
4abfd87
[V1] [Hybrid] Validate compatibility of attention backend batch reord…
tdoublep Aug 2, 2025
fa7cef7
feat(compilation): add VLLM_COMPILE_DEPYF env var to control decompil…
vincentzed Jul 31, 2025
16 changes: 10 additions & 6 deletions .buildkite/nightly-benchmarks/README.md
@@ -28,6 +28,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc
## Trigger the benchmark

Performance benchmark will be triggered when:

- A PR is merged into vLLM.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.

@@ -38,6 +39,7 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```

Runtime environment variables:

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
@@ -46,12 +48,14 @@ Runtime environment variables:
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.

Nightly benchmark will be triggered when:

- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.

## Performance benchmark details

See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.

### Latency test

Here is an example of one test inside `latency-tests.json`:
@@ -74,21 +78,21 @@ Here is an example of one test inside `latency-tests.json`:
In this example:

- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
- The `parameters` attribute controls the command line arguments used for `vllm bench latency`. Please use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`

Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.

WARNING: The benchmarking script saves JSON results by itself, so please do not set the `--output-json` parameter in the JSON file.
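
Reconstructed from the argument list above (the example JSON itself is collapsed in this diff view), a single entry in `latency-tests.json` might look like the following sketch; the `test_name` value here is illustrative:

```json
[
    {
        "test_name": "latency_llama8B_tp1",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    }
]
```

`run-performance-benchmarks.sh` converts each underscored key to the corresponding dashed CLI flag when invoking `vllm bench latency`.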

### Throughput test

The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are forwarded to `vllm bench throughput`.

The number from this test is also stable; even a slight change in this number can indicate a large change in performance.

### Serving test

We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:

```json
[
@@ -100,7 +104,6 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@@ -118,8 +121,8 @@ Inside this example:

- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` includes the command line arguments for vLLM server.
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`
- The `client-parameters` includes the command line arguments for `vllm bench serve`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`

The number from this test is less stable than those of the latency and throughput benchmarks (due to randomized ShareGPT dataset sampling inside `benchmark_serving.py`), but a large change in this number (e.g. a 5% change) still indicates a real difference.

@@ -149,6 +152,7 @@ Here is an example using the script to compare result_a and result_b without det

Here is an example using the script to compare result_a and result_b with detailed test names.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`

| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
21 changes: 11 additions & 10 deletions .buildkite/nightly-benchmarks/nightly-annotation.md
@@ -1,3 +1,4 @@
# Nightly benchmark annotation

## Description

@@ -13,15 +14,15 @@ Please download the visualization scripts in the post

- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:

```bash
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:

```bash
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```

And the results will be inside `./benchmarks/results`.
34 changes: 17 additions & 17 deletions .buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -13,25 +13,25 @@ Latest reproduction guide: [github issue link](https://github.com/vllm-project/
## Setup

- Docker images:
- vLLM: `vllm/vllm-openai:v0.6.2`
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- vLLM: `vllm/vllm-openai:v0.6.2`
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
- 8x Nvidia A100 GPUs
- 8x Nvidia A100 GPUs
- Workload:
  - Dataset
    - ShareGPT dataset
    - Prefill-heavy dataset (on average 462 input tokens, 16 output tokens)
    - Decode-heavy dataset (on average 462 input tokens, 256 output tokens)
    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
  - Models: llama-3 8B, llama-3 70B.
    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07 ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
  - Average QPS (queries per second): 2, 4, 8, 16, 32 and inf.
    - Queries are randomly sampled, and arrival patterns are determined via a Poisson process, all with a fixed random seed.
  - Evaluation metrics: throughput (higher is better), TTFT (time to first token, lower is better), ITL (inter-token latency, lower is better).

## Known issues

@@ -1,3 +1,4 @@
# Performance benchmarks descriptions

## Latency tests

@@ -44,6 +44,7 @@
"test_name": "Test name",
"gpu_type": "GPU",
"completed": "# of req.",
"max_concurrency": "Max concurrency",
"request_throughput": "Tput (req/s)",
"total_token_throughput": "Total Token Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)",
@@ -100,7 +101,7 @@ def get_size_with_unit(bytes, suffix="B"):
raw_result = json.loads(f.read())

if "serving" in str(test_file):
# this result is generated via the `vllm bench serve` command

# attach the benchmarking command to raw_result
try:
@@ -120,7 +121,7 @@ def get_size_with_unit(bytes, suffix="B"):
continue

elif "latency" in f.name:
# this result is generated via the `vllm bench latency` command

# attach the benchmarking command to raw_result
try:
@@ -148,7 +149,7 @@ def get_size_with_unit(bytes, suffix="B"):
continue

elif "throughput" in f.name:
# this result is generated via the `vllm bench throughput` command

# attach the benchmarking command to raw_result
try:
40 changes: 21 additions & 19 deletions .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
echo "Container: vllm"
# move to a completely irrelevant directory, to avoid import vllm from current folder
export CURRENT_LLM_SERVING_ENGINE=vllm

return
fi
}
Expand All @@ -95,12 +95,14 @@ json2args() {
}
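The script's `json2args` helper (its jq body is elided in this diff) turns a JSON object of test parameters into command-line flags. A rough Python equivalent of the idea, for illustration only — the real helper is implemented with `jq` and its exact flag formatting may differ:

```python
import json


def json_to_args(params_json: str) -> str:
    """Convert a JSON object of parameters into CLI flags.

    Underscores in keys become dashes, mirroring the typical
    convention for benchmark CLI options.
    """
    params = json.loads(params_json)
    parts = []
    for key, value in params.items():
        parts.append(f"--{key.replace('_', '-')} {value}")
    return " ".join(parts)
```

For example, `json_to_args('{"max_model_len": 4096}')` yields `--max-model-len 4096`.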

kill_gpu_processes() {
pkill -f '[p]ython'
pkill -f '[p]ython3'
pkill -f '[t]ritonserver'
pkill -f '[p]t_main_thread'
pkill -f '[t]ext-generation'
pkill -f '[l]mdeploy'
# vLLM now names its processes with a VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pkill -f '[V]LLM'

while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
@@ -125,7 +127,7 @@ ensure_installed() {
}

run_serving_tests() {
# run serving tests using the `vllm bench serve` command
# $1: a json file specifying serving test cases

local serving_test_file
@@ -225,7 +227,7 @@ run_serving_tests() {

if [[ "$dataset_name" = "sharegpt" ]]; then

client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
@@ -246,7 +248,7 @@
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')

client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
Expand All @@ -265,13 +267,13 @@ run_serving_tests() {
$client_args"

else

echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1

fi



echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
@@ -302,7 +304,7 @@ run_serving_tests() {
}

run_genai_perf_tests() {
# run genai-perf tests

# $1: a json file specifying genai-perf test cases
local genai_perf_test_file
@@ -311,14 +313,14 @@ run_genai_perf_tests() {
# Iterate over genai-perf tests
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')

# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi

# prepend the current serving engine to the test name
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

@@ -369,10 +371,10 @@ run_genai_perf_tests() {
qps=$num_prompts
echo "now qps is $qps"
fi

new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE

if [[ "$backend" == *"vllm"* ]]; then
backend="vllm"
fi
@@ -413,7 +415,7 @@ prepare_dataset() {
do
cat sonnet.txt >> sonnet_4x.txt
done

}

main() {