Skip to content

Commit ba00047

Browse files
committed
Merge branch 'main' of https://github.com/neuralmagic/vllm into sage/dbo-full-cudagraphs
2 parents b2ed6c3 + 28f350e commit ba00047

File tree

134 files changed

+2334
-1161
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

134 files changed

+2334
-1161
lines changed

.buildkite/nightly-benchmarks/scripts/compare-json-results.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def split_json_by_tp_pp(
218218
"--xaxis",
219219
type=str,
220220
default="# of max concurrency.",
221-
help="column name to use as X Axis in comparision graph",
221+
help="column name to use as X Axis in comparison graph",
222222
)
223223
args = parser.parse_args()
224224

.buildkite/release-pipeline.yaml

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,24 @@
11
steps:
2-
# aarch64 + CUDA builds
3-
- label: "Build arm64 wheel - CUDA 12.8"
4-
id: build-wheel-arm64-cuda-12-8
2+
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
3+
- label: "Build arm64 wheel - CUDA 12.9"
4+
id: build-wheel-arm64-cuda-12-9
55
agents:
66
queue: arm64_cpu_queue_postmerge
77
commands:
88
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
99
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
10-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
10+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
1111
- "mkdir artifacts"
1212
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
1313
- "bash .buildkite/scripts/upload-wheels.sh"
1414
env:
1515
DOCKER_BUILDKIT: "1"
1616

17-
# x86 + CUDA builds
17+
- block: "Build CUDA 12.8 wheel"
18+
key: block-build-cu128-wheel
19+
1820
- label: "Build wheel - CUDA 12.8"
21+
depends_on: block-build-cu128-wheel
1922
id: build-wheel-cuda-12-8
2023
agents:
2124
queue: cpu_queue_postmerge
@@ -44,18 +47,14 @@ steps:
4447
env:
4548
DOCKER_BUILDKIT: "1"
4649

47-
# Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
48-
# However, this block can be uncommented to save some compute hours.
49-
# - block: "Build CUDA 11.8 wheel"
50-
# key: block-build-cu118-wheel
51-
52-
- label: "Build wheel - CUDA 11.8"
53-
# depends_on: block-build-cu118-wheel
54-
id: build-wheel-cuda-11-8
50+
# x86 + CUDA builds
51+
- label: "Build wheel - CUDA 12.9"
52+
depends_on: ~
53+
id: build-wheel-cuda-12-9
5554
agents:
5655
queue: cpu_queue_postmerge
5756
commands:
58-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
57+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
5958
- "mkdir artifacts"
6059
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
6160
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -75,14 +74,15 @@ steps:
7574
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
7675
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
7776

77+
# PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
7878
- label: "Build release image (arm64)"
7979
depends_on: ~
8080
id: build-release-image-arm64
8181
agents:
8282
queue: arm64_cpu_queue_postmerge
8383
commands:
8484
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
85-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
85+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
8686
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
8787

8888
# Add job to create multi-arch manifest
@@ -103,7 +103,7 @@ steps:
103103
- create-multi-arch-manifest
104104
- build-wheel-cuda-12-8
105105
- build-wheel-cuda-12-6
106-
- build-wheel-cuda-11-8
106+
- build-wheel-cuda-12-9
107107
id: annotate-release-workflow
108108
agents:
109109
queue: cpu_queue_postmerge

.buildkite/scripts/upload-wheels.sh

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
5858
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
5959
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
6060

61-
if [[ $normal_wheel == *"cu118"* ]]; then
62-
# if $normal_wheel matches cu118, do not upload the index.html
63-
echo "Skipping index files for cu118 wheels"
64-
elif [[ $normal_wheel == *"cu126"* ]]; then
61+
if [[ $normal_wheel == *"cu126"* ]]; then
6562
# if $normal_wheel matches cu126, do not upload the index.html
6663
echo "Skipping index files for cu126 wheels"
64+
elif [[ $normal_wheel == *"cu128"* ]]; then
65+
# if $normal_wheel matches cu128, do not upload the index.html
66+
echo "Skipping index files for cu128 wheels"
6767
else
68-
# only upload index.html for cu128 wheels (default wheels)
68+
# only upload index.html for cu129 wheels (default wheels) as it
69+
# is available on both x86 and arm64
6970
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
7071
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
7172
fi
@@ -74,14 +75,15 @@ fi
7475
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
7576
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
7677

77-
if [[ $normal_wheel == *"cu118"* ]]; then
78-
# if $normal_wheel matches cu118, do not upload the index.html
79-
echo "Skipping index files for cu118 wheels"
80-
elif [[ $normal_wheel == *"cu126"* ]]; then
78+
if [[ $normal_wheel == *"cu126"* ]]; then
8179
# if $normal_wheel matches cu126, do not upload the index.html
8280
echo "Skipping index files for cu126 wheels"
81+
elif [[ $normal_wheel == *"cu128"* ]]; then
82+
# if $normal_wheel matches cu128, do not upload the index.html
83+
echo "Skipping index files for cu128 wheels"
8384
else
84-
# only upload index.html for cu128 wheels (default wheels)
85+
# only upload index.html for cu129 wheels (default wheels) as it
86+
# is available on both x86 and arm64
8587
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
8688
fi
8789

.buildkite/test-pipeline.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -666,7 +666,7 @@ steps:
666666
# Quantization
667667
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
668668
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
669-
- pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
669+
# - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
670670
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
671671
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
672672
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@@ -676,7 +676,7 @@ steps:
676676
- pytest -v -s tests/compile/test_fusion_all_reduce.py
677677
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
678678
- pytest -v -s tests/kernels/moe/test_flashinfer.py
679-
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
679+
# - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
680680

681681
##### 1 GPU test #####
682682
##### multi gpus test #####

benchmarks/README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,12 @@ become available.
110110

111111
🚧: to be supported
112112

113-
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
113+
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
114+
For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
115+
116+
```bash
117+
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
118+
```
114119

115120
## 🚀 Example - Online Benchmark
116121

benchmarks/benchmark_block_pool.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def invoke_main() -> None:
5757
"--num-iteration",
5858
type=int,
5959
default=1000,
60-
help="Number of iterations to run to stablize final data readings",
60+
help="Number of iterations to run to stabilize final data readings",
6161
)
6262
parser.add_argument(
6363
"--allocate-blocks",

benchmarks/benchmark_ngram_proposer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def invoke_main() -> None:
7777
"--num-iteration",
7878
type=int,
7979
default=100,
80-
help="Number of iterations to run to stablize final data readings",
80+
help="Number of iterations to run to stabilize final data readings",
8181
)
8282
parser.add_argument(
8383
"--num-req", type=int, default=128, help="Number of requests in the batch"

benchmarks/benchmark_serving.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1104,7 +1104,7 @@ def create_argument_parser():
11041104
"--percentile-metrics",
11051105
type=str,
11061106
default="ttft,tpot,itl",
1107-
help="Comma-separated list of selected metrics to report percentils. "
1107+
help="Comma-separated list of selected metrics to report percentiles. "
11081108
"This argument specifies the metrics to report percentiles. "
11091109
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
11101110
'Default value is "ttft,tpot,itl".',

benchmarks/benchmark_serving_structured_output.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -998,7 +998,7 @@ def create_argument_parser():
998998
"--percentile-metrics",
999999
type=str,
10001000
default="ttft,tpot,itl",
1001-
help="Comma-separated list of selected metrics to report percentils. "
1001+
help="Comma-separated list of selected metrics to report percentiles. "
10021002
"This argument specifies the metrics to report percentiles. "
10031003
'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
10041004
'Default value is "ttft,tpot,itl".',

benchmarks/benchmark_throughput.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -719,7 +719,7 @@ def create_argument_parser():
719719
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
720720
)
721721

722-
# hf dtaset
722+
# hf dataset
723723
parser.add_argument(
724724
"--hf-subset", type=str, default=None, help="Subset of the HF dataset."
725725
)

0 commit comments

Comments
 (0)