
Commit ab92741

Merge remote-tracking branch 'upstream/main'

2 parents: 3712649 + 476844d

74 files changed (+2018, −420 lines)

.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 4 additions & 0 deletions

@@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
 
+if [[ $commands == *"pytest -v -s lora"* ]]; then
+  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
+fi
+
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
   commands="${commands} \
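
A note on the idiom used above: `${commands//pattern/replacement}` is bash's global substring replacement, so every matching test invocation in `$commands` gets the environment override prepended, not just the first. A minimal standalone sketch (the sample value of `commands` is illustrative, not from the script):

    commands="pytest -v -s lora && pytest -v -s lora/more"
    # '//' replaces every occurrence; a single '/' would replace only the first
    commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
    echo "$commands"
    # VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora && VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora/more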

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 20 additions & 22 deletions

@@ -8,67 +8,65 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
+export CMAKE_BUILD_PARALLEL_LEVEL=32
+
 # Setup cleanup
 remove_docker_container() {
   set -e;
-  docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
-  docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
+  docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
 }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
-  export BUILDKITE_BUILD_NUMBER=$3
 
   # offline inference
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
     set -e
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
   # Run basic model test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/decoder_only/language -m cpu_model
-    pytest -v -s tests/models/embedding/language -m cpu_model
-    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
-    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
-    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
+    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    pytest -v -s tests/models/language/generation -m cpu_model
+    pytest -v -s tests/models/language/pooling -m cpu_model
+    pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model"
 
   # Run compressed-tensor test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
   # Run AWQ test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v -k cpu_model \
     tests/basic_correctness/test_chunked_prefill.py"
 
   # online serving
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     export VLLM_CPU_KVCACHE_SPACE=10
     export VLLM_CPU_OMP_THREADS_BIND=$1
@@ -83,12 +81,12 @@ function cpu_tests() {
     --tokenizer facebook/opt-125m"
 
   # Run multi-lora tests
-  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/lora/test_qwen2vl.py"
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
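
A note on the last two lines: `export -f cpu_tests` exports the shell function into the environment so the child shell spawned by `timeout 40m bash -c ...` can call it, with the positional arguments arriving as `$1`/`$2` inside `cpu_tests`. A minimal standalone sketch of the mechanism (`greet` is a hypothetical function, for illustration only):

    greet() { echo "hello $1"; }
    export -f greet                      # exported functions are visible to child bash processes
    timeout 10s bash -c "greet world"    # prints: hello world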

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 4 additions & 0 deletions

@@ -155,6 +155,10 @@ run_and_track_test 12 "test_moe_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \
   "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
+run_and_track_test 14 "test_tpu_qkv_linear.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
+run_and_track_test 15 "test_spmd_model_weight_loading.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
 
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then

.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 14 deletions

@@ -98,9 +98,7 @@ steps:
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 
 - label: Core Test # 10min
-  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amdexperimental, amdproduction]
-  amd_gpus: 4 # Just for the sake of queue testing
   fast_check: true
   source_file_dependencies:
   - vllm/core
@@ -114,7 +112,6 @@
   working_dir: "/vllm-workspace/tests"
   fast_check: true
   torch_nightly: true
-  amd_gpus: 2 # Just for the sake of queue testing
   source_file_dependencies:
   - vllm/
   - tests/entrypoints/llm
@@ -207,7 +204,6 @@
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
-  working_dir: "/vllm-workspace/tests" # optional
 
 - label: V1 Test
   mirror_hardwares: [amdexperimental]
@@ -261,7 +257,6 @@
   - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
-  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
@@ -292,9 +287,7 @@
   - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 
 - label: LoRA Test %N # 15min each
-  working_dir: "/vllm-workspace/tests"
-  mirror_hardwares: [amdexperimental]
-  amd_gpus: 8
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
@@ -335,10 +328,8 @@
   commands:
   - pytest -v -s compile/test_full_graph.py
 
-- label: Kernels Test %N # 1h each
-  working_dir: "/vllm-workspace/tests"
-  # mirror_hardwares: [amdexperimental, amdproduction]
-  amd_gpus: 8
+- label: Kernels Core Operation Test
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - csrc/
   - tests/kernels/core
@@ -384,7 +375,6 @@
   - pytest -v -s kernels/mamba
 
 - label: Tensorizer Test # 11min
-  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amdexperimental, amdproduction]
   soft_fail: true
   source_file_dependencies:
@@ -461,7 +451,6 @@
   - pytest -v -s encoder_decoder
 
 - label: OpenAI-Compatible Tool Use # 20 min
-  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amdexperimental]
   fast_check: false
   source_file_dependencies:

CMakeLists.txt

Lines changed: 0 additions & 5 deletions

@@ -189,11 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
 
-  #
-  # Set rocm version dev int.
-  #
-  list(APPEND VLLM_GPU_FLAGS "-DROCM_VERSION=${ROCM_VERSION_DEV_INT}")
-
   #
   # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
   # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.

docker/Dockerfile.cpu

Lines changed: 7 additions & 3 deletions

@@ -75,6 +75,7 @@ RUN --mount=type=bind,source=.git,target=.git \
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
     --mount=type=bind,source=.git,target=.git \
     VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
 
@@ -85,7 +86,7 @@ WORKDIR /workspace/vllm
 
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    apt-get install -y --no-install-recommends vim numactl
+    apt-get install -y --no-install-recommends vim numactl xz-utils
 
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -108,8 +109,11 @@ FROM base AS vllm-test
 WORKDIR /workspace/
 
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \
-    uv pip install -r requirements/test.txt
+    --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
+    cp requirements/test.in requirements/test-cpu.in && \
+    sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
+    uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \
+    uv pip install -r requirements/cpu-test.txt
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
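
The reworked `vllm-test` stage no longer installs a pre-resolved `requirements/test.txt`; it derives a CPU-specific lockfile at build time, dropping `mamba_ssm` (presumably because it does not build in a CPU-only environment) before resolving. The same flow can be reproduced outside Docker, assuming `uv` is installed:

    cp requirements/test.in requirements/test-cpu.in
    sed -i '/mamba_ssm/d' requirements/test-cpu.in                     # remove the excluded package
    uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt
    uv pip install -r requirements/cpu-test.txt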

docker/Dockerfile.neuron

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ RUN --mount=type=bind,source=.git,target=.git \
     if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 
 RUN python3 -m pip install -U \
-    'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+    'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
     -r requirements/neuron.txt
 
 ENV VLLM_TARGET_DEVICE neuron

docs/cli/README.md

Lines changed: 0 additions & 13 deletions

@@ -12,19 +12,6 @@ Available Commands:
 vllm {chat,complete,serve,bench,collect-env,run-batch}
 ```
 
-## Table of Contents
-
-- [serve](#serve)
-- [chat](#chat)
-- [complete](#complete)
-- [bench](#bench)
-  - [latency](#latency)
-  - [serve](#serve-1)
-  - [throughput](#throughput)
-- [collect-env](#collect-env)
-- [run-batch](#run-batch)
-- [More Help](#more-help)
-
 ## serve
 
 Start the vLLM OpenAI Compatible API server.

docs/deployment/docker.md

Lines changed: 12 additions & 1 deletion

@@ -107,10 +107,21 @@ DOCKER_BUILDKIT=1 docker build . \
     -t vllm/vllm-gh200-openai:latest \
     --build-arg max_jobs=66 \
     --build-arg nvcc_threads=2 \
-    --build-arg torch_cuda_arch_list="9.0+PTX" \
+    --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
     --build-arg vllm_fa_cmake_gpu_arches="90-real"
 ```
 
+!!! note
+    If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.
+
+    Run the following command on your host machine to register QEMU user static handlers:
+
+    ```console
+    docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
+    ```
+
+    After setting up QEMU, you can use the `--platform "linux/arm64"` flag in your `docker build` command.
+
 ## Use the custom-built vLLM Docker image
 
 To run vLLM with the custom-built Docker image:
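
Putting the new note together with the build command shown above it, a cross-built `linux/arm64` invocation from an x86_64 host would look roughly like the following (the tag and build args are illustrative, taken from the GH200 example in the same doc):

    # one-time QEMU registration on the host, then the cross-platform build
    docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
    DOCKER_BUILDKIT=1 docker build . \
      --platform "linux/arm64" \
      -t vllm/vllm-gh200-openai:latest \
      --build-arg max_jobs=66 \
      --build-arg nvcc_threads=2 \
      --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
      --build-arg vllm_fa_cmake_gpu_arches="90-real"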

docs/deployment/nginx.md

Lines changed: 0 additions & 10 deletions

@@ -5,16 +5,6 @@ title: Using Nginx
 
 This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
 
-Table of contents:
-
-1. [Build Nginx Container][nginxloadbalancer-nginx-build]
-2. [Create Simple Nginx Config file][nginxloadbalancer-nginx-conf]
-3. [Build vLLM Container][nginxloadbalancer-nginx-vllm-container]
-4. [Create Docker Network][nginxloadbalancer-nginx-docker-network]
-5. [Launch vLLM Containers][nginxloadbalancer-nginx-launch-container]
-6. [Launch Nginx][nginxloadbalancer-nginx-launch-nginx]
-7. [Verify That vLLM Servers Are Ready][nginxloadbalancer-nginx-verify-nginx]
-
 [](){ #nginxloadbalancer-nginx-build }
 
 ## Build Nginx Container
