Skip to content

Commit 576ddfd

Browse files
Merge branch 'main' into stagecli
2 parents 27137b1 + 4f9731c commit 576ddfd

File tree

15 files changed

+674
-425
lines changed

15 files changed

+674
-425
lines changed

.buildkite/pipeline.yml

Lines changed: 105 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -139,41 +139,41 @@ steps:
139139
- "/fsx/hf_cache:/fsx/hf_cache"
140140

141141

142-
- label: "Benchmark&Engine Test"
143-
timeout_in_minutes: 15
144-
depends_on: image-build
145-
commands:
146-
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
147-
- pytest -s -v tests/benchmarks/test_serve_cli.py
148-
- pytest -s -v tests/engine/test_async_omni_engine_abort.py
149-
agents:
150-
queue: "mithril-h100-pool"
151-
plugins:
152-
- kubernetes:
153-
podSpec:
154-
containers:
155-
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
156-
resources:
157-
limits:
158-
nvidia.com/gpu: 2
159-
volumeMounts:
160-
- name: devshm
161-
mountPath: /dev/shm
162-
- name: hf-cache
163-
mountPath: /root/.cache/huggingface
164-
env:
165-
- name: HF_HOME
166-
value: /root/.cache/huggingface
167-
nodeSelector:
168-
node.kubernetes.io/instance-type: gpu-h100-sxm
169-
volumes:
170-
- name: devshm
171-
emptyDir:
172-
medium: Memory
173-
- name: hf-cache
174-
hostPath:
175-
path: /mnt/hf-cache
176-
type: DirectoryOrCreate
142+
# - label: "Benchmark&Engine Test"
143+
# timeout_in_minutes: 15
144+
# depends_on: image-build
145+
# commands:
146+
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
147+
# - pytest -s -v tests/benchmarks/test_serve_cli.py
148+
# - pytest -s -v tests/engine/test_async_omni_engine_abort.py
149+
# agents:
150+
# queue: "mithril-h100-pool"
151+
# plugins:
152+
# - kubernetes:
153+
# podSpec:
154+
# containers:
155+
# - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
156+
# resources:
157+
# limits:
158+
# nvidia.com/gpu: 2
159+
# volumeMounts:
160+
# - name: devshm
161+
# mountPath: /dev/shm
162+
# - name: hf-cache
163+
# mountPath: /root/.cache/huggingface
164+
# env:
165+
# - name: HF_HOME
166+
# value: /root/.cache/huggingface
167+
# nodeSelector:
168+
# node.kubernetes.io/instance-type: gpu-h100-sxm
169+
# volumes:
170+
# - name: devshm
171+
# emptyDir:
172+
# medium: Memory
173+
# - name: hf-cache
174+
# hostPath:
175+
# path: /mnt/hf-cache
176+
# type: DirectoryOrCreate
177177

178178
- label: "Omni Model Test"
179179
timeout_in_minutes: 15
@@ -194,42 +194,42 @@ steps:
194194
volumes:
195195
- "/fsx/hf_cache:/fsx/hf_cache"
196196

197-
- label: "Omni Model Test with H100"
198-
timeout_in_minutes: 20
199-
depends_on: image-build
200-
commands:
201-
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
202-
- export VLLM_TEST_CLEAN_GPU_MEMORY="1"
203-
- pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
204-
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model"
205-
agents:
206-
queue: "mithril-h100-pool"
207-
plugins:
208-
- kubernetes:
209-
podSpec:
210-
containers:
211-
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
212-
resources:
213-
limits:
214-
nvidia.com/gpu: 2
215-
volumeMounts:
216-
- name: devshm
217-
mountPath: /dev/shm
218-
- name: hf-cache
219-
mountPath: /root/.cache/huggingface
220-
env:
221-
- name: HF_HOME
222-
value: /root/.cache/huggingface
223-
nodeSelector:
224-
node.kubernetes.io/instance-type: gpu-h100-sxm
225-
volumes:
226-
- name: devshm
227-
emptyDir:
228-
medium: Memory
229-
- name: hf-cache
230-
hostPath:
231-
path: /mnt/hf-cache
232-
type: DirectoryOrCreate
197+
# - label: "Omni Model Test with H100"
198+
# timeout_in_minutes: 20
199+
# depends_on: image-build
200+
# commands:
201+
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
202+
# - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
203+
# - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
204+
# - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model"
205+
# agents:
206+
# queue: "mithril-h100-pool"
207+
# plugins:
208+
# - kubernetes:
209+
# podSpec:
210+
# containers:
211+
# - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
212+
# resources:
213+
# limits:
214+
# nvidia.com/gpu: 2
215+
# volumeMounts:
216+
# - name: devshm
217+
# mountPath: /dev/shm
218+
# - name: hf-cache
219+
# mountPath: /root/.cache/huggingface
220+
# env:
221+
# - name: HF_HOME
222+
# value: /root/.cache/huggingface
223+
# nodeSelector:
224+
# node.kubernetes.io/instance-type: gpu-h100-sxm
225+
# volumes:
226+
# - name: devshm
227+
# emptyDir:
228+
# medium: Memory
229+
# - name: hf-cache
230+
# hostPath:
231+
# path: /mnt/hf-cache
232+
# type: DirectoryOrCreate
233233

234234
- label: "Qwen3-TTS E2E Test"
235235
timeout_in_minutes: 20
@@ -251,40 +251,40 @@ steps:
251251
volumes:
252252
- "/fsx/hf_cache:/fsx/hf_cache"
253253

254-
- label: "Diffusion Image Edit Test with H100 (1 GPU)"
255-
timeout_in_minutes: 20
256-
depends_on: image-build
257-
commands:
258-
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
259-
- pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
260-
agents:
261-
queue: "mithril-h100-pool"
262-
plugins:
263-
- kubernetes:
264-
podSpec:
265-
containers:
266-
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
267-
resources:
268-
limits:
269-
nvidia.com/gpu: 1
270-
volumeMounts:
271-
- name: devshm
272-
mountPath: /dev/shm
273-
- name: hf-cache
274-
mountPath: /root/.cache/huggingface
275-
env:
276-
- name: HF_HOME
277-
value: /root/.cache/huggingface
278-
nodeSelector:
279-
node.kubernetes.io/instance-type: gpu-h100-sxm
280-
volumes:
281-
- name: devshm
282-
emptyDir:
283-
medium: Memory
284-
- name: hf-cache
285-
hostPath:
286-
path: /mnt/hf-cache
287-
type: DirectoryOrCreate
254+
# - label: "Diffusion Image Edit Test with H100 (1 GPU)"
255+
# timeout_in_minutes: 20
256+
# depends_on: image-build
257+
# commands:
258+
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
259+
# - pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
260+
# agents:
261+
# queue: "mithril-h100-pool"
262+
# plugins:
263+
# - kubernetes:
264+
# podSpec:
265+
# containers:
266+
# - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
267+
# resources:
268+
# limits:
269+
# nvidia.com/gpu: 1
270+
# volumeMounts:
271+
# - name: devshm
272+
# mountPath: /dev/shm
273+
# - name: hf-cache
274+
# mountPath: /root/.cache/huggingface
275+
# env:
276+
# - name: HF_HOME
277+
# value: /root/.cache/huggingface
278+
# nodeSelector:
279+
# node.kubernetes.io/instance-type: gpu-h100-sxm
280+
# volumes:
281+
# - name: devshm
282+
# emptyDir:
283+
# medium: Memory
284+
# - name: hf-cache
285+
# hostPath:
286+
# path: /mnt/hf-cache
287+
# type: DirectoryOrCreate
288288

289289
# - label: "Bagel Text2Img Model Test with H100"
290290
# timeout_in_minutes: 30

.buildkite/test-nightly.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ steps:
55
if: build.env("NIGHTLY") == "1"
66
commands:
77
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
8-
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py
9-
- pytest -s -v tests/examples/online_serving/test_qwen3_omni.py
8+
- pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model"
9+
- pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
1010
agents:
1111
queue: "mithril-h100-pool"
1212
plugins:
@@ -43,7 +43,7 @@ steps:
4343
commands:
4444
- export VLLM_LOGGING_LEVEL=DEBUG
4545
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
46-
- pytest -s -v tests/examples/online_serving/test_qwen2_5_omni.py
46+
- pytest -s -v tests/examples/online_serving/test_qwen2_5_omni.py -m "advanced_model" --run-level "advanced_model"
4747
agents:
4848
queue: "gpu_4_queue" # g6.12xlarge instance on AWS with four L4 GPUs
4949
plugins:

docker/Dockerfile.xpu

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
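# Build argument naming the image the vllm-omni stage is built FROM; defaults to the vllm-base stage in this file, override it to use a pre-built vLLM base image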
2+
ARG VLLM_BASE=vllm-base
3+
4+
FROM intel/deep-learning-essentials:2025.3.2-0-devel-ubuntu24.04 AS vllm-base
5+
6+
WORKDIR /workspace/
7+
8+
ARG PYTHON_VERSION=3.12
9+
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/xpu"
10+
11+
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
12+
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
13+
add-apt-repository -y ppa:kobuk-team/intel-graphics
14+
15+
RUN apt clean && apt-get update -y && \
16+
apt-get install -y --no-install-recommends --fix-missing \
17+
curl \
18+
ffmpeg \
19+
git \
20+
libsndfile1 \
21+
libsm6 \
22+
libxext6 \
23+
libgl1 \
24+
lsb-release \
25+
libaio-dev \
26+
numactl \
27+
wget \
28+
vim \
29+
python3.12 \
30+
python3.12-dev \
31+
python3-pip
32+
33+
RUN apt update && apt upgrade -y && \
34+
apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc && \
35+
apt install -y intel-oneapi-compiler-dpcpp-cpp-2025.3
36+
37+
ENV PATH="/root/.local/bin:$PATH"
38+
ENV VIRTUAL_ENV="/opt/venv"
39+
ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
40+
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
41+
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
42+
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
43+
44+
# This oneCCL release includes BMG support, which the default oneCCL version in oneAPI 2025.2 does not.
45+
ARG ONECCL_INSTALLER="intel-oneccl-2021.15.7.8_offline.sh"
46+
RUN wget "https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.7/${ONECCL_INSTALLER}" && \
47+
bash "${ONECCL_INSTALLER}" -a --silent --eula accept && \
48+
rm "${ONECCL_INSTALLER}" && \
49+
echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc && \
50+
echo "source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force" >> /root/.bashrc
51+
RUN rm -f /opt/intel/oneapi/ccl/latest && \
52+
ln -s /opt/intel/oneapi/ccl/2021.15 /opt/intel/oneapi/ccl/latest
53+
54+
SHELL ["bash", "-c"]
55+
CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
56+
57+
WORKDIR /workspace/
58+
ENV UV_HTTP_TIMEOUT=500
59+
60+
# Configure package index for XPU
61+
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
62+
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
63+
ENV UV_INDEX_STRATEGY="unsafe-best-match"
64+
ENV UV_LINK_MODE="copy"
65+
66+
ARG VLLM_VERSION=v0.16.0
67+
RUN git clone -b ${VLLM_VERSION} https://github.com/vllm-project/vllm
68+
WORKDIR /workspace/vllm
69+
70+
RUN --mount=type=cache,target=/root/.cache/uv \
71+
uv pip install --upgrade pip && \
72+
uv pip install -r requirements/xpu.txt
73+
74+
# used for suffix method speculative decoding
75+
# build deps for proto + nanobind-based extensions to set up the build environment
76+
RUN --mount=type=cache,target=/root/.cache/uv \
77+
uv pip install grpcio-tools protobuf nanobind
78+
# arctic-inference is built from source, which requires torch-xpu to be properly installed first
79+
RUN --mount=type=cache,target=/root/.cache/uv \
80+
source /opt/intel/oneapi/setvars.sh --force && \
81+
source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force && \
82+
export CMAKE_PREFIX_PATH="$(python -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH}" && \
83+
uv pip install --no-build-isolation arctic-inference==0.1.1
84+
85+
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
86+
87+
ENV VLLM_TARGET_DEVICE=xpu
88+
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
89+
90+
RUN --mount=type=cache,target=/root/.cache/uv \
91+
uv pip install --no-build-isolation .
92+
93+
CMD ["/bin/bash"]
94+
95+
FROM vllm-base AS vllm-openai
96+
97+
# install additional dependencies for openai api server
98+
RUN --mount=type=cache,target=/root/.cache/uv \
99+
uv pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
100+
101+
# install development dependencies (for testing)
102+
RUN uv pip install -e tests/vllm_test_utils
103+
104+
# install nixl from source code
105+
ENV NIXL_VERSION=0.7.0
106+
RUN python /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
107+
108+
# ensure vllm is properly installed
109+
RUN python -c "import vllm, inspect; print(vllm.__file__)"
110+
RUN uv pip show vllm
111+
112+
CMD ["/bin/bash"]
113+
114+
ENTRYPOINT []
115+
116+
FROM ${VLLM_BASE} AS vllm-omni
117+
118+
WORKDIR /workspace/vllm-omni
119+
COPY . .
120+
121+
ENV VLLM_OMNI_TARGET_DEVICE=xpu
122+
RUN uv pip install --no-cache-dir ".[dev]" --no-build-isolation
123+
124+
# Fix triton: uninstall any existing triton / triton-xpu packages and install the pinned triton-xpu==3.6.0
125+
RUN --mount=type=cache,target=/root/.cache/uv \
126+
uv pip uninstall triton triton-xpu && \
127+
uv pip install triton-xpu==3.6.0
128+
129+
# Remove the oneCCL packages bundled with torch (oneccl, oneccl-devel) to avoid conflicts
130+
RUN --mount=type=cache,target=/root/.cache/uv \
131+
uv pip uninstall oneccl oneccl-devel
132+
133+
FROM vllm-omni AS vllm-omni-openai
134+
135+
RUN ln -sf /usr/bin/python3 /usr/bin/python
136+
137+
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
138+
139+
ENTRYPOINT ["vllm", "serve", "--omni"]

docs/getting_started/installation/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ vLLM-Omni supports the following hardware platforms:
55
- [GPU](gpu.md)
66
- [NVIDIA CUDA](gpu.md)
77
- [AMD ROCm](gpu.md)
8+
- [Intel XPU](gpu.md)
89
- [NPU](npu.md)

0 commit comments

Comments
 (0)