
Commit c3edf24

Merge branch 'main' into dev-xxhash

2 parents: 7882c26 + e4c3182
385 files changed: +8783 / -4683 lines


.buildkite/scripts/annotate-release.sh

Lines changed: 5 additions & 4 deletions
@@ -23,8 +23,8 @@ To download the wheel (by version):
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
 
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`
 
 To download and upload the image:
@@ -45,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 
-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest rm vllm/vllm-openai:latest
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
-EOF
+EOF
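
The manifest change above drops `--amend` in favor of removing any stale local manifest list and recreating it from scratch before pushing. Below is a minimal sketch of that publish pattern; the repository name and tag are placeholders, not values taken from annotate-release.sh:

```bash
#!/bin/bash
# Sketch only: REPO and TAG are hypothetical placeholders.
set -euo pipefail

REPO=example/vllm-openai   # hypothetical repository
TAG=v0.0.0                 # hypothetical release tag

# Remove any locally cached manifest list so a fresh one is created,
# instead of amending whatever was left over from a previous run.
docker manifest rm "${REPO}:${TAG}" || true

# Recreate the multi-arch manifest list from the per-arch images ...
docker manifest create "${REPO}:${TAG}" "${REPO}:${TAG}-x86_64" "${REPO}:${TAG}-aarch64"

# ... and publish it.
docker manifest push "${REPO}:${TAG}"
```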
Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# This script builds the CPU docker image and runs offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# allow to bind to different cores
+CORE_RANGE=${CORE_RANGE:-0-16}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
+NUMA_NODE=${NUMA_NODE:-0}
+
+export CMAKE_BUILD_PARALLEL_LEVEL=32
+
+# Setup cleanup
+remove_docker_container() {
+  set -e;
+  docker rm -f cpu-test-"$NUMA_NODE" || true;
+}
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+
+# Run the image, setting --shm-size=4g for tensor parallel.
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+
+function cpu_tests() {
+  set -e
+  export NUMA_NODE=$2
+
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pip list"
+
+  # offline inference
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+  # Run kernel tests
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -v -s tests/kernels/test_onednn.py
+    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+
+  # basic online serving
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
+    set -e
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
+    server_pid=$!
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    vllm bench serve \
+      --backend vllm \
+      --dataset-name random \
+      --model meta-llama/Llama-3.2-3B-Instruct \
+      --num-prompts 20 \
+      --endpoint /v1/completions
+    kill -s SIGTERM $server_pid &'
+}
+
+# All of the CPU tests are expected to finish in less than 40 mins.
+export -f cpu_tests
+timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
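
The new script is driven entirely by environment variables (`CORE_RANGE`, `OMP_CORE_RANGE`, `NUMA_NODE`), so a CI agent can pin each run to a given NUMA node. A hedged usage sketch, assuming a hypothetical location for this file (its path is not shown in the excerpt) and a checkout of the vLLM repository as the working directory:

```bash
# Sketch only: SCRIPT is a placeholder for wherever this file lands under .buildkite/.
SCRIPT=.buildkite/scripts/hardware_ci/run-cpu-test.sh   # hypothetical path

# Pin the build and the test container to cores 16-31 on NUMA node 1.
# Unset variables fall back to the script's defaults (cores 0-16, node 0).
CORE_RANGE=16-31 OMP_CORE_RANGE=16-31 NUMA_NODE=1 bash "$SCRIPT"
```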

.buildkite/test-amd.yaml

Lines changed: 50 additions & 41 deletions
@@ -61,7 +61,7 @@ steps:
   - pytest -v -s -m 'not cpu_test' multimodal
   - pytest -v -s utils_
 
-- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
   timeout_in_minutes: 10
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
@@ -73,13 +73,15 @@ steps:
   - tests/multimodal
   - tests/standalone_tests/lazy_imports.py
   - tests/transformers_utils
+  - tests/config
   no_gpu: true
   commands:
   - python3 standalone_tests/lazy_imports.py
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
   - pytest -v -s -m 'cpu_test' multimodal
   - pytest -v -s transformers_utils
+  - pytest -v -s config
 
 - label: Python-only Installation Test # 10min
   timeout_in_minutes: 20
@@ -187,7 +189,7 @@ steps:
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
   - tests/distributed/test_events
-  - tests/compile/test_basic_correctness
+  - tests/compile/fullgraph/test_basic_correctness.py
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
@@ -215,7 +217,7 @@ steps:
   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
-  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s distributed/test_events.py
   - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -390,6 +392,15 @@ steps:
   commands:
   - pytest -v -s v1/attention
 
+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+
 - label: V1 Test others (CPU) # 5 mins
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
@@ -493,17 +504,12 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_pass_manager.py
-  - pytest -v -s compile/test_fusion.py
-  - pytest -v -s compile/test_fusion_attn.py
-  - pytest -v -s compile/test_functionalization.py
-  - pytest -v -s compile/test_silu_mul_quant_fusion.py
-  # - pytest -v -s compile/test_sequence_parallelism.py
-  # - pytest -v -s compile/test_async_tp.py
-  - pytest -v -s compile/test_fusion_all_reduce.py
-  - pytest -v -s compile/test_decorator.py
-  - pytest -v -s compile/test_noop_elimination.py
-  - pytest -v -s compile/test_aot_compile.py
+  # Run unit tests defined directly under compile/,
+  # not including subdirectories, which are usually heavier
+  # tests covered elsewhere.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;"
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -515,9 +521,11 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_basic_correctness.py
-  - pytest -v -s compile/test_multimodal_compile.py
-  - pytest -v -s compile/piecewise/
+  # Run smoke tests under fullgraph directory, except test_full_graph.py
+  # as it is a heavy test that is covered in other steps.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"
 
 - label: PyTorch Fullgraph Test # 27min
   timeout_in_minutes: 40
@@ -529,10 +537,10 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
   # Limit to no custom ops to reduce running time
   # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
 
 - label: Cudagraph test
   timeout_in_minutes: 20
@@ -697,7 +705,7 @@ steps:
   - vllm/model_executor/models/whisper.py
   commands: # LMEval
   # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
-  - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
+  - pytest -s entrypoints/openai/correctness/
 
 - label: OpenAI-Compatible Tool Use # 23 min
   timeout_in_minutes: 35
@@ -998,12 +1006,12 @@ steps:
   optional: true
   commands:
   - pip install --upgrade git+https://github.com/huggingface/transformers
-  - pytest -v -s tests/models/test_initialization.py
+  - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
   - pytest -v -s tests/models/test_transformers.py
-  - pytest -v -s tests/models/multimodal/processing/
-  - pytest -v -s tests/models/multimodal/test_mapping.py
+  # - pytest -v -s tests/models/multimodal/processing/
+  - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
   - python3 examples/offline_inference/basic/chat.py
-  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+  # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 
@@ -1048,7 +1056,7 @@ steps:
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
 
-- label: Blackwell Fusion Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
   gpu: b200
@@ -1066,10 +1074,12 @@ steps:
   - pytest -v -s tests/compile/test_fusion_attn.py
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
   # this runner has 2 GPUs available even though num_gpus=2 is not set
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
   # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
   # Wrap with quotes to escape yaml
-  - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+  - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile
 
 - label: Blackwell Fusion E2E Tests # 30 min
   timeout_in_minutes: 40
@@ -1086,20 +1096,18 @@ steps:
   - vllm/model_executor/layers/layernorm.py
   - vllm/model_executor/layers/activation.py
   - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusions_e2e.py
-  - tests/compile/test_full_graph.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
   commands:
   - nvidia-smi
   # Run all e2e fusion tests
   - pytest -v -s tests/compile/test_fusions_e2e.py
-  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-  - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
 
 - label: ROCm GPT-OSS Eval
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
   agent_pool: mi325_1
-  mirror_hardwares: [amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction]
   optional: true # run on nightlies
   source_file_dependencies:
   - tests/evals/gpt_oss
@@ -1198,7 +1206,7 @@ steps:
   - vllm/worker/worker_base.py
   - vllm/v1/engine/
   - vllm/v1/worker/
-  - tests/compile/test_basic_correctness.py
+  - tests/compile/fullgraph/test_basic_correctness.py
   - tests/compile/test_wrapper.py
   - tests/distributed/
   - tests/entrypoints/llm/test_collective_rpc.py
@@ -1211,7 +1219,7 @@ steps:
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
-  - pytest -v -s ./compile/test_basic_correctness.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1326,21 +1334,20 @@ steps:
   - vllm/
   - tests/weight_loading
   commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
 
 - label: Weight Loading Multiple GPU Test - Large Models # optional
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_2
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
-  gpu: a100
   optional: true
   source_file_dependencies:
   - vllm/
   - tests/weight_loading
   commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
 
 - label: NixlConnector PD accuracy tests (Distributed) # 30min
   mirror_hardwares: [amdexperimental]
@@ -1417,10 +1424,12 @@ steps:
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-  - pytest -v -s tests/compile/test_async_tp.py
-  - pytest -v -s tests/compile/test_sequence_parallelism.py
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  - pytest -v -s tests/compile/distributed/test_async_tp.py
+  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+  #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+  - pytest -v -s tests/compile/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py

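Several of the compile steps above replace an explicit list of pytest invocations with a single `find ... -exec pytest {} \;` command, so that each test file gets its own pytest process; the comments in the diff cite https://github.com/vllm-project/vllm/issues/28965 as the motivation. A minimal sketch of how that pattern expands, using a made-up test directory rather than the real CI paths:

```bash
# Sketch only: tests/unit/ is a made-up directory used to illustrate the pattern.
# find runs the -exec command once per matching file, so pytest is started
# separately for each test module:
#   pytest -s -v tests/unit/test_a.py
#   pytest -s -v tests/unit/test_b.py
# rather than one "pytest tests/unit/" process that collects every file at once.
find tests/unit/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \;
```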