Skip to content

Commit 59ff746

Browse files
committed
Merge remote-tracking branch 'upstream/release'
2 parents 552eac2 + 5a42fad commit 59ff746

File tree

863 files changed

+59155
-24421
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

863 files changed

+59155
-24421
lines changed

.buildkite/generate_index.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import argparse
2+
import os
3+
4+
template = """<!DOCTYPE html>
5+
<html>
6+
<body>
7+
<h1>Links for vLLM</h1/>
8+
<a href="../{wheel_html_escaped}">{wheel}</a><br/>
9+
</body>
10+
</html>
11+
"""
12+
13+
parser = argparse.ArgumentParser()
14+
parser.add_argument("--wheel", help="The wheel path.", required=True)
15+
args = parser.parse_args()
16+
17+
filename = os.path.basename(args.wheel)
18+
19+
with open("index.html", "w") as f:
20+
print(f"Generated index.html for {args.wheel}")
21+
# cloudfront requires escaping the '+' character
22+
f.write(
23+
template.format(wheel=filename,
24+
wheel_html_escaped=filename.replace("+", "%2B")))

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,19 @@ steps:
99
- image: badouralix/curl-jq
1010
command:
1111
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
12+
1213
- wait
14+
1315
- label: "A100"
16+
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
1417
agents:
1518
queue: A100
1619
plugins:
1720
- kubernetes:
1821
podSpec:
1922
priorityClassName: perf-benchmark
2023
containers:
21-
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
24+
- image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
2225
command:
2326
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
2427
resources:
@@ -41,20 +44,48 @@ steps:
4144
- name: devshm
4245
emptyDir:
4346
medium: Memory
44-
# - label: "H100"
45-
# agents:
46-
# queue: H100
47-
# plugins:
48-
# - docker#v5.11.0:
49-
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
50-
# command:
51-
# - bash
52-
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
53-
# mount-buildkite-agent: true
54-
# propagate-environment: true
55-
# ipc: host
56-
# gpus: all
57-
# environment:
58-
# - VLLM_USAGE_SOURCE
59-
# - HF_TOKEN
6047

48+
- label: "H200"
49+
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
50+
agents:
51+
queue: H200
52+
plugins:
53+
- docker#v5.12.0:
54+
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
55+
command:
56+
- bash
57+
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
58+
mount-buildkite-agent: true
59+
propagate-environment: true
60+
ipc: host
61+
gpus: 4,5,6,7
62+
volumes:
63+
- /data/benchmark-hf-cache:/root/.cache/huggingface
64+
environment:
65+
- VLLM_USAGE_SOURCE
66+
- HF_TOKEN
67+
68+
#- block: "Run H100 Benchmark"
69+
#key: block-h100
70+
#depends_on: ~
71+
72+
- label: "H100"
73+
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
74+
agents:
75+
queue: H100
76+
depends_on: block-h100
77+
plugins:
78+
- docker#v5.12.0:
79+
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
80+
command:
81+
- bash
82+
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
83+
mount-buildkite-agent: true
84+
propagate-environment: true
85+
ipc: host
86+
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
87+
volumes:
88+
- /data/benchmark-hf-cache:/root/.cache/huggingface
89+
environment:
90+
- VLLM_USAGE_SOURCE
91+
- HF_TOKEN

.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,18 @@ def results_to_json(latency, throughput, serving):
157157
throughput_results,
158158
serving_results)
159159

160+
for df in [latency_results, serving_results, throughput_results]:
161+
if df.empty:
162+
continue
163+
164+
# Sort all dataframes by their respective "Test name" columns
165+
df.sort_values(by="Test name", inplace=True)
166+
167+
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
168+
# we want to turn it into "8xGPUTYPE"
169+
df["GPU"] = df["GPU"].apply(
170+
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
171+
160172
# get markdown tables
161173
latency_md_table = tabulate(latency_results,
162174
headers='keys',

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
88
# and we still want to see other benchmarking results even when mixtral crashes.
9+
set -x
910
set -o pipefail
1011

1112
check_gpus() {
@@ -85,11 +86,7 @@ kill_gpu_processes() {
8586

8687
ps -aux
8788
lsof -t -i:8000 | xargs -r kill -9
88-
pkill -f pt_main_thread
89-
# this line doesn't work now
90-
# ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
91-
pkill -f python3
92-
pkill -f /usr/bin/python3
89+
pgrep python3 | xargs -r kill -9
9390

9491

9592
# wait until GPU memory usage smaller than 1GB
@@ -289,7 +286,7 @@ run_serving_tests() {
289286
# run the server
290287
echo "Running test case $test_name"
291288
echo "Server command: $server_command"
292-
eval "$server_command" &
289+
bash -c "$server_command" &
293290
server_pid=$!
294291

295292
# wait until the server is alive
@@ -322,7 +319,7 @@ run_serving_tests() {
322319
echo "Running test case $test_name with qps $qps"
323320
echo "Client command: $client_command"
324321

325-
eval "$client_command"
322+
bash -c "$client_command"
326323

327324
# record the benchmarking commands
328325
jq_output=$(jq -n \

.buildkite/nightly-benchmarks/scripts/wait-for-image.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/sh
2-
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
3-
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
2+
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
3+
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
44

55
TIMEOUT_SECONDS=10
66

.buildkite/release-pipeline.yaml

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
steps:
22
- label: "Build wheel - CUDA 12.1"
33
agents:
4-
queue: cpu_queue
4+
queue: cpu_queue_postmerge
55
commands:
66
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
77
- "mkdir artifacts"
@@ -18,11 +18,55 @@ steps:
1818
- label: "Build wheel - CUDA 11.8"
1919
# depends_on: block-build-cu118-wheel
2020
agents:
21-
queue: cpu_queue
21+
queue: cpu_queue_postmerge
2222
commands:
2323
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
2424
- "mkdir artifacts"
2525
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
2626
- "bash .buildkite/upload-wheels.sh"
2727
env:
2828
DOCKER_BUILDKIT: "1"
29+
30+
- block: "Build release image"
31+
depends_on: ~
32+
key: block-release-image-build
33+
34+
- label: "Build release image"
35+
depends_on: block-release-image-build
36+
agents:
37+
queue: cpu_queue_postmerge
38+
commands:
39+
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
40+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
41+
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
42+
43+
- label: "Build and publish TPU release image"
44+
depends_on: ~
45+
if: build.env("NIGHTLY") == "1"
46+
agents:
47+
queue: tpu_queue_postmerge
48+
commands:
49+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
50+
- "docker push vllm/vllm-tpu:nightly"
51+
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
52+
plugins:
53+
- docker-login#v3.0.0:
54+
username: vllm
55+
password-env: DOCKERHUB_TOKEN
56+
env:
57+
DOCKER_BUILDKIT: "1"
58+
59+
- block: "Build CPU release image"
60+
key: block-cpu-release-image-build
61+
depends_on: ~
62+
63+
- label: "Build and publish CPU release image"
64+
depends_on: block-cpu-release-image-build
65+
agents:
66+
queue: cpu_queue_postmerge
67+
commands:
68+
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
69+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
70+
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
71+
env:
72+
DOCKER_BUILDKIT: "1"

.buildkite/run-amd-test.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,6 @@ if [[ $commands == *" kernels "* ]]; then
8585
--ignore=kernels/test_encoder_decoder_attn.py \
8686
--ignore=kernels/test_flash_attn.py \
8787
--ignore=kernels/test_flashinfer.py \
88-
--ignore=kernels/test_gguf.py \
8988
--ignore=kernels/test_int8_quant.py \
9089
--ignore=kernels/test_machete_gemm.py \
9190
--ignore=kernels/test_mamba_ssm.py \

.buildkite/run-cpu-test-ppc64le.sh

Lines changed: 3 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -4,49 +4,11 @@
44
# It serves a sanity check for compilation and basic model usage.
55
set -ex
66

7-
# Try building the docker image
8-
docker build -t cpu-test -f Dockerfile.ppc64le .
9-
107
# Setup cleanup
11-
remove_docker_container() { docker rm -f cpu-test || true; }
8+
remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
129
trap remove_docker_container EXIT
1310
remove_docker_container
1411

15-
# Run the image, setting --shm-size=4g for tensor parallel.
16-
source /etc/environment
17-
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
18-
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test
19-
20-
function cpu_tests() {
21-
set -e
22-
23-
# Run basic model test
24-
docker exec cpu-test bash -c "
25-
set -e
26-
pip install pytest pytest-asyncio \
27-
decord einops librosa peft Pillow sentence-transformers soundfile \
28-
transformers_stream_generator matplotlib datamodel_code_generator
29-
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
30-
pytest -v -s tests/models/decoder_only/language -m cpu_model
31-
pytest -v -s tests/models/embedding/language -m cpu_model
32-
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
33-
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
34-
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
35-
36-
# online inference
37-
docker exec cpu-test bash -c "
38-
set -e
39-
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
40-
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
41-
python3 benchmarks/benchmark_serving.py \
42-
--backend vllm \
43-
--dataset-name random \
44-
--model facebook/opt-125m \
45-
--num-prompts 20 \
46-
--endpoint /v1/completions \
47-
--tokenizer facebook/opt-125m"
48-
}
12+
# Try building the docker image
13+
docker build -t cpu-test -f Dockerfile.ppc64le .
4914

50-
# All of CPU tests are expected to be finished less than 25 mins.
51-
export -f cpu_tests
52-
timeout 25m bash -c "cpu_tests"

.buildkite/run-cpu-test.sh

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,26 +13,27 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.
1313
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
1414

1515
# Setup cleanup
16-
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
16+
remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
1717
trap remove_docker_container EXIT
1818
remove_docker_container
1919

2020
# Run the image, setting --shm-size=4g for tensor parallel.
2121
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
22-
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
22+
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
2323
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
24-
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
24+
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
2525

2626
function cpu_tests() {
2727
set -e
28+
export NUMA_NODE=$2
2829

2930
# offline inference
30-
docker exec cpu-test-avx2 bash -c "
31+
docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
3132
set -e
3233
python3 examples/offline_inference.py"
3334

3435
# Run basic model test
35-
docker exec cpu-test bash -c "
36+
docker exec cpu-test-"$NUMA_NODE" bash -c "
3637
set -e
3738
pip install pytest pytest-asyncio \
3839
decord einops librosa peft Pillow sentence-transformers soundfile \
@@ -45,20 +46,26 @@ function cpu_tests() {
4546
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
4647

4748
# Run compressed-tensor test
48-
docker exec cpu-test bash -c "
49+
docker exec cpu-test-"$NUMA_NODE" bash -c "
4950
set -e
5051
pytest -s -v \
5152
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
5253
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
5354

5455
# Run AWQ test
55-
docker exec cpu-test bash -c "
56+
docker exec cpu-test-"$NUMA_NODE" bash -c "
5657
set -e
5758
pytest -s -v \
5859
tests/quantization/test_ipex_quant.py"
5960

61+
# Run chunked-prefill and prefix-cache test
62+
docker exec cpu-test-"$NUMA_NODE" bash -c "
63+
set -e
64+
pytest -s -v -k cpu_model \
65+
tests/basic_correctness/test_chunked_prefill.py"
66+
6067
# online inference
61-
docker exec cpu-test bash -c "
68+
docker exec cpu-test-"$NUMA_NODE" bash -c "
6269
set -e
6370
export VLLM_CPU_KVCACHE_SPACE=10
6471
export VLLM_CPU_OMP_THREADS_BIND=$1
@@ -75,4 +82,4 @@ function cpu_tests() {
7582

7683
# All of CPU tests are expected to be finished less than 25 mins.
7784
export -f cpu_tests
78-
timeout 25m bash -c "cpu_tests $CORE_RANGE"
85+
timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

0 commit comments

Comments (0)