
Commit ce53f46

Merge remote-tracking branch 'upstream/main'

2 parents: 3efdd2b + c6db213

484 files changed: +12,578 −5,964 lines

.buildkite/run-cpu-test.sh

Lines changed: 20 additions & 17 deletions

@@ -9,63 +9,60 @@ CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
+  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
+  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 
 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
 
   # offline inference
-  docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
-    python3 examples/offline_inference.py"
+    python3 examples/offline_inference/basic.py"
 
   # Run basic model test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
-    pip install pytest pytest-asyncio \
-      decord einops librosa peft Pillow sentence-transformers soundfile \
-      transformers_stream_generator matplotlib datamodel_code_generator
-    pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+    pip install -r vllm/requirements-test.txt
     pytest -v -s tests/models/decoder_only/language -m cpu_model
     pytest -v -s tests/models/embedding/language -m cpu_model
     pytest -v -s tests/models/encoder_decoder/language -m cpu_model
     pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
     pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
   # Run compressed-tensor test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
       tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
   # Run AWQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
       tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v -k cpu_model \
       tests/basic_correctness/test_chunked_prefill.py"
 
-  # online inference
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  # online serving
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     export VLLM_CPU_KVCACHE_SPACE=10
     export VLLM_CPU_OMP_THREADS_BIND=$1
@@ -78,6 +75,12 @@ function cpu_tests() {
     --num-prompts 20 \
     --endpoint /v1/completions \
     --tokenizer facebook/opt-125m"
+
+  # Run multi-lora tests
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v \
+      tests/lora/test_qwen2vl.py"
 }
 
 # All of CPU tests are expected to be finished less than 25 mins.
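
The recurring edit in this file is the naming scheme: every image and container name now carries $BUILDKITE_BUILD_NUMBER, so concurrent CI builds on the same host get distinct docker resources instead of clobbering each other's containers. Below is a minimal sketch of that pattern, assuming a Buildkite agent where BUILDKITE_BUILD_NUMBER is set; the image and container names are illustrative, not the ones in the script.

#!/bin/bash
set -e

# Unique per-build suffix; the PID fallback is an assumption for local runs
# and is not part of the CI script above.
BUILD_TAG="${BUILDKITE_BUILD_NUMBER:-$$}"
IMAGE="demo-test-${BUILD_TAG}"
CONTAINER="demo-test-${BUILD_TAG}-run"

# Remove only this build's container on exit, pass or fail.
cleanup() { docker rm -f "${CONTAINER}" || true; }
trap cleanup EXIT

docker build -t "${IMAGE}" .                                  # image tagged per build
docker run -d --name "${CONTAINER}" "${IMAGE}" sleep infinity # container named per build
docker exec "${CONTAINER}" echo "build ${BUILD_TAG} runs in isolation"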

.buildkite/run-gh200-test.sh

Lines changed: 1 addition & 1 deletion

@@ -24,5 +24,5 @@ remove_docker_container
 
 # Run the image and test offline inference
 docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-python3 examples/offline_inference.py
+python3 examples/offline_inference/basic.py
 '

.buildkite/run-hpu-test.sh

Lines changed: 1 addition & 1 deletion

@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py

.buildkite/run-neuron-test.sh

Lines changed: 27 additions & 26 deletions

@@ -3,6 +3,18 @@
 # This script build the Neuron docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -e
+set -v
+
+image_name="neuron/vllm-ci"
+container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
+mkdir -p "${NEURON_COMPILE_CACHE_URL}"
+NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 
 # Try building the docker image
 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
     last_build=$(cat /tmp/neuron-docker-build-timestamp)
     current_time=$(date +%s)
     if [ $((current_time - last_build)) -gt 86400 ]; then
+        docker image prune -f
         docker system prune -f
+        rm -rf "${HF_MOUNT:?}/*"
+        rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
         echo "$current_time" > /tmp/neuron-docker-build-timestamp
     fi
 else
     date "+%s" > /tmp/neuron-docker-build-timestamp
 fi
 
-docker build -t neuron -f Dockerfile.neuron .
+docker build -t "${image_name}" -f Dockerfile.neuron .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f neuron || true; }
+remove_docker_container() {
+    docker image rm -f "${image_name}" || true;
+}
 trap remove_docker_container EXIT
-remove_docker_container
 
 # Run the image
-docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
-    --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
-
-# Wait for the server to start
-wait_for_server_to_start() {
-    timeout=300
-    counter=0
-
-    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
-        sleep 1
-        counter=$((counter + 1))
-        if [ $counter -ge $timeout ]; then
-            echo "Timeout after $timeout seconds"
-            break
-        fi
-    done
-}
-wait_for_server_to_start
-
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-    localhost:8000/generate \
-    -d '{"prompt": "San Francisco is a"}'
+docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+    -v "${HF_CACHE}:${HF_MOUNT}" \
+    -e "HF_HOME=${HF_MOUNT}" \
+    -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
+    -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
+    --name "${container_name}" \
+    ${image_name} \
+    /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"

.buildkite/run-openvino-test.sh

Lines changed: 1 addition & 1 deletion

@@ -13,4 +13,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py

.buildkite/run-tpu-test.sh

Lines changed: 10 additions & 1 deletion

@@ -14,4 +14,13 @@ remove_docker_container
 # For HF_TOKEN.
 source /etc/environment
 # Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it \
+    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+    && python3 -m pip install pytest \
+    && python3 -m pip install lm_eval[api]==0.4.4 \
+    && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
+    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
+    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py"

.buildkite/run-xpu-test.sh

Lines changed: 2 additions & 2 deletions

@@ -14,6 +14,6 @@ remove_docker_container
 
 # Run the image and test offline inference/tensor parallel
 docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-python3 examples/offline_inference.py
-python3 examples/offline_inference_cli.py -tp 2
+python3 examples/offline_inference/basic.py
+python3 examples/offline_inference/cli.py -tp 2
 '

.buildkite/test-pipeline.yaml

Lines changed: 22 additions & 16 deletions

@@ -38,7 +38,7 @@ steps:
   - pip install -r requirements-docs.txt
   - SPHINXOPTS=\"-W\" make html
   # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/dev/sampling_params.html
+  - grep \"sig sig-object py\" build/html/api/inference_params.html
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   fast_check: true
@@ -52,6 +52,7 @@ steps:
   - tests/worker
   - tests/standalone_tests/lazy_torch_compile.py
   commands:
+  - pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git # Used by multimoda processing test
   - python3 standalone_tests/lazy_torch_compile.py
   - pytest -v -s mq_llm_engine # MQLLMEngine
   - pytest -v -s async_engine # AsyncLLMEngine
@@ -187,19 +188,19 @@ steps:
   - examples/
   commands:
   - pip install tensorizer # for tensorizer test
-  - python3 offline_inference.py
-  - python3 cpu_offload.py
-  - python3 offline_inference_chat.py
-  - python3 offline_inference_with_prefix.py
-  - python3 llm_engine_example.py
-  - python3 offline_inference_vision_language.py
-  - python3 offline_inference_vision_language_multi_image.py
-  - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-  - python3 offline_inference_encoder_decoder.py
-  - python3 offline_inference_classification.py
-  - python3 offline_inference_embedding.py
-  - python3 offline_inference_scoring.py
-  - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
+  - python3 offline_inference/basic.py
+  - python3 offline_inference/cpu_offload.py
+  - python3 offline_inference/chat.py
+  - python3 offline_inference/prefix_caching.py
+  - python3 offline_inference/llm_engine_example.py
+  - python3 offline_inference/vision_language.py
+  - python3 offline_inference/vision_language_multi_image.py
+  - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+  - python3 offline_inference/encoder_decoder.py
+  - python3 offline_inference/classification.py
+  - python3 offline_inference/embedding.py
+  - python3 offline_inference/scoring.py
+  - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
   mirror_hardwares: [amd]
@@ -214,6 +215,7 @@ steps:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
   - tests/samplers
+  - tests/conftest.py
   commands:
   - pytest -v -s samplers
   - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
@@ -229,20 +231,22 @@ steps:
   - pytest -v -s test_logits_processor.py
   - pytest -v -s model_executor/test_guided_processors.py
 
-- label: Speculative decoding tests # 30min
+- label: Speculative decoding tests # 40min
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
+  - vllm/model_executor/models/eagle.py
   commands:
   - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
+  - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 
 - label: LoRA Test %N # 15min each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
   parallelism: 4
 
 - label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -367,6 +371,7 @@ steps:
   - tests/models/encoder_decoder/vision_language
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/multimodal
   - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
   - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
   - pytest -v -s models/embedding/vision_language -m core_model
@@ -535,6 +540,7 @@ steps:
   # requires multi-GPU testing for validation.
   - pytest -v -s -x lora/test_chatglm3_tp.py
   - pytest -v -s -x lora/test_llama_tp.py
+  - pytest -v -s -x lora/test_minicpmv_tp.py
 
 
 - label: Weight Loading Multiple GPU Test # 33min
Lines changed: 2 additions & 2 deletions

@@ -13,7 +13,7 @@ on:
   - "docs/**"
 
 jobs:
-  sphinx-lint:
+  doc-lint:
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -29,4 +29,4 @@ jobs:
         python -m pip install --upgrade pip
         pip install -r requirements-lint.txt
     - name: Linting docs
-      run: tools/sphinx-lint.sh
+      run: tools/doc-lint.sh

.gitignore

Lines changed: 1 addition & 4 deletions

@@ -79,10 +79,7 @@ instance/
 
 # Sphinx documentation
 docs/_build/
-docs/source/getting_started/examples/*.rst
-!**/*.template.rst
-docs/source/getting_started/examples/*.md
-!**/*.template.md
+docs/source/getting_started/examples/
 
 # PyBuilder
 .pybuilder/
