Skip to content
This repository was archived by the owner on Sep 4, 2025. It is now read-only.

Commit cdaa1c7

Browse files
committed
Sync with main
Merge remote-tracking branch 'midstream/main' into release
2 parents 5a42fad + 30823b6 commit cdaa1c7

File tree

973 files changed

+59177
-22736
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the searchbox below for content that may be hidden.

973 files changed

+59177
-22736
lines changed

.buildkite/check-wheel-size.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22
import sys
33
import zipfile
44

5-
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
6-
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
5+
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
6+
# Note that we have 400 MiB quota, please use it wisely.
7+
# See https://github.com/pypi/support/issues/3792 .
8+
# Please also sync the value with the one in Dockerfile.
9+
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
710

811

912
def print_top_10_largest_files(zip_file):

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
steps:
22
- label: "Wait for container to be ready"
3+
key: wait-for-container-image
34
agents:
45
queue: A100
56
plugins:
@@ -10,12 +11,11 @@ steps:
1011
command:
1112
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
1213

13-
- wait
14-
1514
- label: "A100"
1615
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
1716
agents:
1817
queue: A100
18+
depends_on: wait-for-container-image
1919
plugins:
2020
- kubernetes:
2121
podSpec:
@@ -49,6 +49,7 @@ steps:
4949
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
5050
agents:
5151
queue: H200
52+
depends_on: wait-for-container-image
5253
plugins:
5354
- docker#v5.12.0:
5455
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@@ -73,7 +74,7 @@ steps:
7374
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
7475
agents:
7576
queue: H100
76-
depends_on: block-h100
77+
depends_on: wait-for-container-image
7778
plugins:
7879
- docker#v5.12.0:
7980
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT

.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ main() {
4343

4444

4545

46-
# The figures should be genereated by a separate process outside the CI/CD pipeline
46+
# The figures should be generated by a separate process outside the CI/CD pipeline
4747

4848
# # generate figures
4949
# python3 -m pip install tabulate pandas matplotlib

.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,104 @@ run_serving_tests() {
301301
kill_gpu_processes
302302
}
303303

304+
run_genai_perf_tests() {
305+
# run genai-perf tests
306+
307+
# $1: a json file specifying genai-perf test cases
308+
local genai_perf_test_file
309+
genai_perf_test_file=$1
310+
311+
# Iterate over genai-perf tests
312+
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
313+
# get the test name, and append the GPU type back to it.
314+
test_name=$(echo "$params" | jq -r '.test_name')
315+
316+
# if TEST_SELECTOR is set, only run the test cases that match the selector
317+
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
318+
echo "Skip test case $test_name."
319+
continue
320+
fi
321+
322+
# prepend the current serving engine to the test name
323+
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
324+
325+
# get common parameters
326+
common_params=$(echo "$params" | jq -r '.common_parameters')
327+
model=$(echo "$common_params" | jq -r '.model')
328+
tp=$(echo "$common_params" | jq -r '.tp')
329+
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
330+
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
331+
port=$(echo "$common_params" | jq -r '.port')
332+
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
333+
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
334+
335+
# get client and server arguments
336+
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
337+
qps_list=$(echo "$params" | jq -r '.qps_list')
338+
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
339+
echo "Running over qps list $qps_list"
340+
341+
# check if there is enough GPU to run the test
342+
if [[ $gpu_count -lt $tp ]]; then
343+
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
344+
continue
345+
fi
346+
347+
if [[ $reuse_server == "true" ]]; then
348+
echo "Reuse previous server for test case $test_name"
349+
else
350+
kill_gpu_processes
351+
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
352+
"$server_params" "$common_params"
353+
fi
354+
355+
if wait_for_server; then
356+
echo ""
357+
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
358+
else
359+
echo ""
360+
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
361+
break
362+
fi
363+
364+
# iterate over different QPS
365+
for qps in $qps_list; do
366+
# remove the surrounding single quote from qps
367+
if [[ "$qps" == *"inf"* ]]; then
368+
echo "qps was $qps"
369+
qps=$num_prompts
370+
echo "now qps is $qps"
371+
fi
372+
373+
new_test_name=$test_name"_qps_"$qps
374+
backend=$CURRENT_LLM_SERVING_ENGINE
375+
376+
if [[ "$backend" == *"vllm"* ]]; then
377+
backend="vllm"
378+
fi
379+
#TODO: add output dir.
380+
client_command="genai-perf profile \
381+
-m $model \
382+
--service-kind openai \
383+
--backend vllm \
384+
--endpoint-type chat \
385+
--streaming \
386+
--url localhost:$port \
387+
--request-rate $qps \
388+
--num-prompts $num_prompts \
389+
"
390+
391+
echo "Client command: $client_command"
392+
393+
eval "$client_command"
394+
395+
#TODO: process/record outputs
396+
done
397+
done
398+
399+
kill_gpu_processes
400+
401+
}
304402

305403
prepare_dataset() {
306404

@@ -328,12 +426,17 @@ main() {
328426

329427
pip install -U transformers
330428

429+
pip install -r requirements-dev.txt
430+
which genai-perf
431+
331432
# check storage
332433
df -h
333434

334435
ensure_installed wget
335436
ensure_installed curl
336437
ensure_installed jq
438+
# genai-perf dependency
439+
ensure_installed libb64-0d
337440

338441
prepare_dataset
339442

@@ -345,6 +448,10 @@ main() {
345448
# run the test
346449
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
347450

451+
# run genai-perf tests
452+
run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
453+
mv artifacts/ $RESULTS_FOLDER/
454+
348455
# upload benchmark results to buildkite
349456
python3 -m pip install tabulate pandas
350457
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
[
2+
{
3+
"test_name": "llama8B_tp1_genai_perf",
4+
"qps_list": [4,8,16,32],
5+
"common_parameters": {
6+
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
7+
"tp": 1,
8+
"port": 8000,
9+
"num_prompts": 500,
10+
"reuse_server": false
11+
},
12+
"vllm_server_parameters": {
13+
"disable_log_stats": "",
14+
"disable_log_requests": "",
15+
"gpu_memory_utilization": 0.9,
16+
"num_scheduler_steps": 10,
17+
"max_num_seqs": 512,
18+
"dtype": "bfloat16"
19+
},
20+
"genai_perf_input_parameters": {
21+
}
22+
}
23+
]

.buildkite/release-pipeline.yaml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ steps:
5656
env:
5757
DOCKER_BUILDKIT: "1"
5858

59+
- input: "Provide Release version here"
60+
fields:
61+
- text: "What is the release version?"
62+
key: "release-version"
63+
5964
- block: "Build CPU release image"
6065
key: block-cpu-release-image-build
6166
depends_on: ~
@@ -66,7 +71,7 @@ steps:
6671
queue: cpu_queue_postmerge
6772
commands:
6873
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
69-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
70-
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
74+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
75+
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
7176
env:
7277
DOCKER_BUILDKIT: "1"

.buildkite/run-cpu-test.sh

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,63 +9,60 @@ CORE_RANGE=${CORE_RANGE:-48-95}
99
NUMA_NODE=${NUMA_NODE:-1}
1010

1111
# Try building the docker image
12-
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
13-
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
12+
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
13+
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
1414

1515
# Setup cleanup
16-
remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
16+
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
1717
trap remove_docker_container EXIT
1818
remove_docker_container
1919

2020
# Run the image, setting --shm-size=4g for tensor parallel.
2121
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
22-
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
22+
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
2323
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
24-
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
24+
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
2525

2626
function cpu_tests() {
2727
set -e
2828
export NUMA_NODE=$2
2929

3030
# offline inference
31-
docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
31+
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
3232
set -e
33-
python3 examples/offline_inference.py"
33+
python3 examples/offline_inference/basic.py"
3434

3535
# Run basic model test
36-
docker exec cpu-test-"$NUMA_NODE" bash -c "
36+
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
3737
set -e
38-
pip install pytest pytest-asyncio \
39-
decord einops librosa peft Pillow sentence-transformers soundfile \
40-
transformers_stream_generator matplotlib datamodel_code_generator
41-
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
38+
pip install -r vllm/requirements-test.txt
4239
pytest -v -s tests/models/decoder_only/language -m cpu_model
4340
pytest -v -s tests/models/embedding/language -m cpu_model
4441
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
4542
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
4643
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
4744

4845
# Run compressed-tensor test
49-
docker exec cpu-test-"$NUMA_NODE" bash -c "
46+
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
5047
set -e
5148
pytest -s -v \
5249
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
5350
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
5451

5552
# Run AWQ test
56-
docker exec cpu-test-"$NUMA_NODE" bash -c "
53+
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
5754
set -e
5855
pytest -s -v \
5956
tests/quantization/test_ipex_quant.py"
6057

6158
# Run chunked-prefill and prefix-cache test
62-
docker exec cpu-test-"$NUMA_NODE" bash -c "
59+
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
6360
set -e
6461
pytest -s -v -k cpu_model \
6562
tests/basic_correctness/test_chunked_prefill.py"
6663

67-
# online inference
68-
docker exec cpu-test-"$NUMA_NODE" bash -c "
64+
# online serving
65+
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
6966
set -e
7067
export VLLM_CPU_KVCACHE_SPACE=10
7168
export VLLM_CPU_OMP_THREADS_BIND=$1
@@ -78,8 +75,14 @@ function cpu_tests() {
7875
--num-prompts 20 \
7976
--endpoint /v1/completions \
8077
--tokenizer facebook/opt-125m"
78+
79+
# Run multi-lora tests
80+
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
81+
set -e
82+
pytest -s -v \
83+
tests/lora/test_qwen2vl.py"
8184
}
8285

83-
# All of CPU tests are expected to be finished less than 25 mins.
86+
# All of CPU tests are expected to be finished less than 40 mins.
8487
export -f cpu_tests
85-
timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
88+
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

.buildkite/run-gh200-test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,5 @@ remove_docker_container
2424

2525
# Run the image and test offline inference
2626
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
27-
python3 examples/offline_inference.py
27+
python3 examples/offline_inference/basic.py
2828
'

.buildkite/run-hpu-test.sh

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,17 @@ set -ex
88
docker build -t hpu-test-env -f Dockerfile.hpu .
99

1010
# Setup cleanup
11+
# certain versions of HPU software stack have a bug that can
12+
# override the exit code of the script, so we need to use
13+
# separate remove_docker_container and remove_docker_container_and_exit
14+
# functions, while other platforms only need one remove_docker_container
15+
# function.
16+
EXITCODE=1
1117
remove_docker_container() { docker rm -f hpu-test || true; }
12-
trap remove_docker_container EXIT
18+
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
19+
trap remove_docker_container_and_exit EXIT
1320
remove_docker_container
1421

1522
# Run the image and launch offline inference
16-
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
23+
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
24+
EXITCODE=$?

0 commit comments

Comments (0)