
Commit 1f85814

Merge pull request ROCm#583 from ROCm/upstream_merge_2025_06_25
Upstream merge 2025 06 25
2 parents c4258f4 + 4ed2d76 commit 1f85814

155 files changed (+4281, -1595 lines)


.buildkite/nightly-benchmarks/nightly-annotation.md

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ Please download the visualization scripts in the post
 - Download `nightly-benchmarks.zip`.
 - In the same folder, run the following code:

-```console
+```bash
 export HF_TOKEN=<your HF token>
 apt update
 apt install -y git

.buildkite/release-pipeline.yaml

Lines changed: 2 additions & 0 deletions
@@ -102,6 +102,7 @@ steps:
 commands:
 - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
 - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
 env:
 DOCKER_BUILDKIT: "1"
@@ -117,6 +118,7 @@ steps:
 commands:
 - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
+- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
 - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
 env:
 DOCKER_BUILDKIT: "1"

.buildkite/scripts/tpu/config_v6e_1.env

Lines changed: 2 additions & 2 deletions
@@ -4,8 +4,8 @@ CONTAINER_NAME=vllm-tpu

 # vllm config
 MODEL=meta-llama/Llama-3.1-8B-Instruct
-MAX_NUM_SEQS=512
-MAX_NUM_BATCHED_TOKENS=512
+MAX_NUM_SEQS=256
+MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist

.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 0 deletions
@@ -615,13 +615,16 @@ steps:
 - vllm/executor/
 - vllm/model_executor/models/
 - tests/distributed/
+- tests/examples/offline_inference/data_parallel.py
 commands:
 - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
 - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
 - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
 - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
 - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
 - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

 - label: Distributed Tests (2 GPUs) # 40min
 mirror_hardwares: [amdexperimental]

benchmarks/README.md

Lines changed: 190 additions & 0 deletions
@@ -269,6 +269,21 @@ python3 vllm/benchmarks/benchmark_serving.py \
   --num-prompts 10
 ```

+### Running With Ramp-Up Request Rate
+
+The benchmark tool also supports ramping up the request rate over the
+duration of the benchmark run. This can be useful for stress testing the
+server or finding the maximum throughput that it can handle, given some latency budget.
+
+Two ramp-up strategies are supported:
+- `linear`: Increases the request rate linearly from a start value to an end value.
+- `exponential`: Increases the request rate exponentially.
+
+The following arguments can be used to control the ramp-up:
+- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
+- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
+- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
+
 ---
 ## Example - Offline Throughput Benchmark
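For reference, a ramp-up run using the flags added in this hunk might look like the sketch below. This command is illustrative only and not part of the commit; the model name, dataset choice, and prompt count are assumptions borrowed from the surrounding README examples, while the three `--ramp-up-*` flags are the ones introduced here.

```bash
# Illustrative sketch (not from this diff): ramp the request rate linearly
# from 1 req/s to 20 req/s over the run, against a server that is already
# serving the (assumed) model below.
python3 vllm/benchmarks/benchmark_serving.py \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --dataset-name sharegpt \
  --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 1000 \
  --ramp-up-strategy linear \
  --ramp-up-start-rps 1 \
  --ramp-up-end-rps 20
```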

@@ -387,3 +402,178 @@ python3 vllm/benchmarks/benchmark_throughput.py \
   --enable-lora \
   --lora-path yard1/llama-2-7b-sql-lora-test
 ```
+
+---
+## Example - Structured Output Benchmark
+
+Benchmark the performance of structured output generation (JSON, grammar, regex).
+
+### Server Setup
+
+```bash
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+```
+
+### JSON Schema Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset json \
+  --structured-output-ratio 1.0 \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+### Grammar-based Generation Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset grammar \
+  --structure-type grammar \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+### Regex-based Generation Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset regex \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+### Choice-based Generation Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset choice \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+### XGrammar Benchmark Dataset
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --dataset xgrammar_bench \
+  --request-rate 10 \
+  --num-prompts 1000
+```
+
+---
+## Example - Long Document QA Throughput Benchmark
+
+Benchmark the performance of long document question-answering with prefix caching.
+
+### Basic Long Document QA Test
+
+```bash
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 16 \
+  --document-length 2000 \
+  --output-len 50 \
+  --repeat-count 5
+```
+
+### Different Repeat Modes
+
+```bash
+# Random mode (default) - shuffle prompts randomly
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 8 \
+  --document-length 3000 \
+  --repeat-count 3 \
+  --repeat-mode random
+
+# Tile mode - repeat entire prompt list in sequence
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 8 \
+  --document-length 3000 \
+  --repeat-count 3 \
+  --repeat-mode tile
+
+# Interleave mode - repeat each prompt consecutively
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-documents 8 \
+  --document-length 3000 \
+  --repeat-count 3 \
+  --repeat-mode interleave
+```
+
+---
+## Example - Prefix Caching Benchmark
+
+Benchmark the efficiency of automatic prefix caching.
+
+### Fixed Prompt with Prefix Caching
+
+```bash
+python3 benchmarks/benchmark_prefix_caching.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --enable-prefix-caching \
+  --num-prompts 1 \
+  --repeat-count 100 \
+  --input-length-range 128:256
+```
+
+### ShareGPT Dataset with Prefix Caching
+
+```bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+python3 benchmarks/benchmark_prefix_caching.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --enable-prefix-caching \
+  --num-prompts 20 \
+  --repeat-count 5 \
+  --input-length-range 128:256
+```
+
+---
+## Example - Request Prioritization Benchmark
+
+Benchmark the performance of request prioritization in vLLM.
+
+### Basic Prioritization Test
+
+```bash
+python3 benchmarks/benchmark_prioritization.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --input-len 128 \
+  --output-len 64 \
+  --num-prompts 100 \
+  --scheduling-policy priority
+```
+
+### Multiple Sequences per Prompt
+
+```bash
+python3 benchmarks/benchmark_prioritization.py \
+  --model meta-llama/Llama-2-7b-chat-hf \
+  --input-len 128 \
+  --output-len 64 \
+  --num-prompts 100 \
+  --scheduling-policy priority \
+  --n 2
+```
benchmarks/auto_tune.sh

Lines changed: 37 additions & 5 deletions
@@ -10,6 +10,7 @@
 # 3. Set variables (ALL REQUIRED)
 # BASE: your directory for vllm repo
 # MODEL: the model served by vllm
+# SYSTEM: the hardware, choice TPU or GPU, for other systems, "get best profile" might not support.
 # TP: ways of tensor parallelism
 # DOWNLOAD_DIR: directory to download and load model weights.
 # INPUT_LEN: request input len
@@ -34,6 +35,7 @@
 TAG=$(date +"%Y_%m_%d_%H_%M")
 BASE=""
 MODEL="meta-llama/Llama-3.1-8B-Instruct"
+SYSTEM="TPU"
 TP=1
 DOWNLOAD_DIR=""
 INPUT_LEN=4000
@@ -45,12 +47,15 @@ NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"

 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
+PROFILE_PATH="$LOG_FOLDER/profile"

 echo "result file: $RESULT"
 echo "model: $MODEL"

 rm -rf $LOG_FOLDER
+rm -rf $PROFILE_PATH
 mkdir -p $LOG_FOLDER
+mkdir -p $PROFILE_PATH

 cd "$BASE/vllm"

@@ -70,10 +75,11 @@ start_server() {
   local max_num_seqs=$2
   local max_num_batched_tokens=$3
   local vllm_log=$4
+  local profile_dir=$5

   pkill -f vllm

-  VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
+  VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
     --disable-log-requests \
     --port 8004 \
     --gpu-memory-utilization $gpu_memory_utilization \
@@ -105,19 +111,37 @@ start_server() {
   fi
 }

+update_best_profile() {
+  local profile_dir=$1
+  local profile_index=$2
+  sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
+  selected_profile_file=
+  if [[ "$SYSTEM" == "TPU" ]]; then
+    selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
+  fi
+  if [[ "$SYSTEM" == "GPU" ]]; then
+    selected_profile_file="${sorted_paths[$profile_index]}"
+  fi
+  rm -f $PROFILE_PATH/*
+  cp $selected_profile_file $PROFILE_PATH
+}
+
 run_benchmark() {
   local max_num_seqs=$1
   local max_num_batched_tokens=$2
   local gpu_memory_utilization=$3
   echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
   local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
+  local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
   echo "vllm_log: $vllm_log"
   echo
   rm -f $vllm_log
+  mkdir -p $profile_dir
   pkill -f vllm
+  local profile_index=0

   echo "starting server..."
-  start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
+  start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
   result=$?
   if [[ "$result" -eq 1 ]]; then
     echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@@ -144,7 +168,8 @@ run_benchmark() {
     --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
     --num-prompts 1000 \
     --random-prefix-len $prefix_len \
-    --port 8004 &> "$bm_log"
+    --port 8004 \
+    --profile &> "$bm_log"
   throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
   e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
   goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
@@ -158,6 +183,7 @@ run_benchmark() {
   # start from request-rate as int(throughput) + 1
   request_rate=$((${throughput%.*} + 1))
   while ((request_rate > 0)); do
+    profile_index=$((profile_index+1))
     # clear prefix cache
     curl -X POST http://0.0.0.0:8004/reset_prefix_cache
     sleep 5
@@ -195,6 +221,12 @@ run_benchmark() {
       best_max_num_seqs=$max_num_seqs
       best_num_batched_tokens=$max_num_batched_tokens
       best_goodput=$goodput
+      if [[ "$SYSTEM" == "TPU" ]]; then
+        update_best_profile "$profile_dir/plugins/profile" $profile_index
+      fi
+      if [[ "$SYSTEM" == "GPU" ]]; then
+        update_best_profile "$profile_dir" $profile_index
+      fi
     fi
   else
     echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
@@ -239,6 +271,6 @@ for num_seqs in "${num_seqs_list[@]}"; do
 done
 done
 echo "finish permutations"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
-echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
+echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
+echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
244276
