1
- ## Global Args #################################################################
2
- ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
3
- ARG PYTHON_VERSION=3.12
4
1
5
- ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
6
- ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
2
+ ARG BASE_UBI_IMAGE_TAG
3
+ ARG PYTHON_VERSION
7
4
8
5
## Base Layer ##################################################################
9
- FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
6
+ FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
10
7
ARG PYTHON_VERSION
11
8
ENV PYTHON_VERSION=${PYTHON_VERSION}
12
9
RUN microdnf -y update && microdnf install -y --nodocs \
@@ -19,25 +16,28 @@ ENV LANG=C.UTF-8 \
19
16
LC_ALL=C.UTF-8
20
17
21
18
# Some utils for dev purposes - tar required for kubectl cp
19
+
22
20
RUN microdnf install -y --nodocs \
23
- which procps findutils tar vim git\
21
+ which procps findutils tar vim git \
24
22
&& microdnf clean all
25
23
26
24
27
25
## Python Installer ############################################################
28
- FROM base as python-install
26
+ FROM base AS python-install
29
27
ARG PYTHON_VERSION
30
28
31
29
ENV VIRTUAL_ENV=/opt/vllm
32
30
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
33
31
ENV PYTHON_VERSION=${PYTHON_VERSION}
34
32
RUN microdnf install -y --nodocs \
35
33
python${PYTHON_VERSION}-devel && \
36
- python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all
34
+ python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
35
+ pip install --no-cache -U pip wheel uv && \
36
+ microdnf clean all
37
37
38
38
39
39
## CUDA Base ###################################################################
40
- FROM python-install as cuda-base
40
+ FROM python-install AS cuda-base
41
41
42
42
RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
43
43
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
@@ -51,88 +51,30 @@ RUN microdnf install -y --nodocs \
51
51
ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/
52
52
53
53
54
-
55
54
## Python cuda base #################################################################
56
55
FROM cuda-base AS python-cuda-base
57
56
58
57
ENV VIRTUAL_ENV=/opt/vllm
59
58
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
60
59
61
60
# install cuda and common dependencies
62
- RUN --mount=type=cache,target=/root/.cache/pip \
63
- --mount=type=cache,target=/root/.cache/uv \
61
+ RUN --mount=type=cache,target=/root/.cache/uv \
64
62
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
65
63
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
66
64
uv pip install \
67
65
-r requirements-cuda.txt
68
66
69
67
70
- ## Development #################################################################
71
- FROM python-cuda-base AS dev
72
-
73
- # install build and runtime dependencies
74
- RUN --mount=type=cache,target=/root/.cache/pip \
75
- --mount=type=cache,target=/root/.cache/uv \
76
- --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
77
- --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
78
- --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
79
- --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
80
- --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
81
- uv pip install \
82
- -r requirements-cuda.txt \
83
- -r requirements-dev.txt
84
-
85
- ## Builder #####################################################################
86
- FROM dev AS build
87
-
88
- # install build dependencies
89
- RUN --mount=type=cache,target=/root/.cache/pip \
90
- --mount=type=cache,target=/root/.cache/uv \
91
- --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
92
- uv pip install -r requirements-build.txt
93
-
94
- # install compiler cache to speed up compilation leveraging local or remote caching
95
- # git is required for the cutlass kernels
96
- RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y --nodocs git ccache && microdnf clean all
97
-
98
- COPY . .
99
-
100
- ARG TORCH_CUDA_ARCH_LIST
101
- ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
102
- ARG vllm_fa_cmake_gpu_arches
103
- ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
104
-
105
- # max jobs used by Ninja to build extensions
106
- ARG max_jobs=2
107
- ENV MAX_JOBS=${max_jobs}
108
- # number of threads used by nvcc
109
- ARG nvcc_threads=8
110
- ENV NVCC_THREADS=$nvcc_threads
111
- # make sure punica kernels are built (for LoRA)
112
- ENV VLLM_INSTALL_PUNICA_KERNELS=1
113
-
114
- # Make sure the cuda environment is in the PATH
115
- ENV PATH=/usr/local/cuda/bin:$PATH
116
-
117
- ENV CCACHE_DIR=/root/.cache/ccache
118
- RUN --mount=type=cache,target=/root/.cache/ccache \
119
- --mount=type=cache,target=/root/.cache/pip \
120
- --mount=type=cache,target=/root/.cache/uv \
121
- --mount=type=bind,src=.git,target=/workspace/.git \
122
- env CFLAGS="-march=haswell" \
123
- CXXFLAGS="$CFLAGS $CXXFLAGS" \
124
- CMAKE_BUILD_TYPE=Release \
125
- python3 setup.py bdist_wheel --dist-dir=dist
126
68
127
69
#################### libsodium Build IMAGE ####################
128
- FROM base as libsodium-builder
70
+ FROM base AS libsodium-builder
129
71
130
72
RUN microdnf install -y --nodocs gcc gzip \
131
73
&& microdnf clean all
132
74
133
75
WORKDIR /usr/src/libsodium
134
76
135
- ARG LIBSODIUM_VERSION=1.0.20
77
+ ARG LIBSODIUM_VERSION
136
78
RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
137
79
&& tar -xzvf libsodium*.tar.gz \
138
80
&& rm -f libsodium*.tar.gz \
@@ -156,25 +98,32 @@ ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nv
156
98
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"
157
99
158
100
# Triton needs a CC compiler
101
+
159
102
RUN microdnf install -y --nodocs gcc \
160
103
rsync \
161
104
&& microdnf clean all
162
105
163
- # install vllm wheel first, so that torch etc will be installed
164
- RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
165
- --mount=type=cache,target=/root/.cache/pip \
166
- --mount=type=cache,target=/root/.cache/uv \
167
- uv pip install "$(echo dist/*.whl)[tensorizer]" --verbose
168
106
169
107
# Install libsodium for Tensorizer encryption
170
108
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
171
- cd /usr/src/libsodium \
172
- && make install
109
+ make -C /usr/src/libsodium install
173
110
174
- RUN --mount=type=cache,target=/root/.cache/pip \
175
- --mount=type=cache,target=/root/.cache/uv \
176
- uv pip install \
177
- "https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.0.post2/flashinfer_python-0.2.0.post2+cu124torch2.5-cp312-cp312-linux_x86_64.whl"
111
+ COPY LICENSE /licenses/vllm.md
112
+ COPY examples/*.jinja /app/data/template/
113
+
114
+ # install vllm by running the payload script and then install flashinfer
115
+
116
+ ARG VLLM_WHEEL_VERSION
117
+ ARG VLLM_WHEEL_INDEX
118
+ ARG FLASHINFER_VERSION
119
+ RUN --mount=type=cache,target=/root/.cache/uv \
120
+ --mount=type=bind,src=payload,target=/workspace/payload \
121
+ --mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
122
+ env BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
123
+ VLLM_WHEEL_VERSION=${VLLM_WHEEL_VERSION} \
124
+ VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
125
+ ./payload/run.sh && \
126
+ uv pip install "${FLASHINFER_VERSION}"
178
127
179
128
ENV HF_HUB_OFFLINE=1 \
180
129
HOME=/home/vllm \
@@ -199,25 +148,32 @@ ENV HF_HUB_OFFLINE=1 \
199
148
RUN umask 002 && \
200
149
useradd --uid 2000 --gid 0 vllm && \
201
150
mkdir -p /home/vllm && \
202
- chmod g+rwx /home/vllm /usr/src /workspace
203
-
204
- COPY LICENSE /licenses/vllm.md
205
- COPY examples/*.jinja /app/data/template/
151
+ chmod g+rwx /home/vllm
206
152
207
153
USER 2000
208
154
WORKDIR /home/vllm
209
155
210
156
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
211
157
212
158
213
- FROM vllm-openai as vllm-grpc-adapter
159
+ ## TGIS Adapter layer #####################################################################
160
+ FROM vllm-openai AS vllm-grpc-adapter
214
161
215
162
USER root
216
163
217
- RUN --mount=type=cache,target=/root/.cache/pip \
218
- --mount=type=cache,target=/root/.cache/uv \
219
- --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
220
- HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.6.3
164
+ ARG VLLM_TGIS_ADAPTER_VERSION
+ ARG VLLM_WHEEL_VERSION
+ ARG VLLM_WHEEL_INDEX
165
+ RUN --mount=type=cache,target=/root/.cache/uv \
166
+ --mount=type=bind,src=payload,target=/workspace/payload \
167
+ --mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
168
+ cd /workspace && \
169
+ ls && \
170
+ env HOME=/root \
171
+ BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
172
+ VLLM_WHEEL_VERSION=${VLLM_WHEEL_VERSION} \
173
+ VLLM_TGIS_ADAPTER_VERSION=${VLLM_TGIS_ADAPTER_VERSION} \
174
+ VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
175
+ ./payload/run.sh
176
+
221
177
222
178
ENV GRPC_PORT=8033 \
223
179
PORT=8000 \
0 commit comments