1
1
# # Global Args #################################################################
2
- ARG BASE_UBI_IMAGE_TAG=9.2-755.1696515532
3
- ARG PROTOC_VERSION=24.4
2
+ ARG BASE_UBI_IMAGE_TAG=9.3-1361.1699548029
3
+ ARG PROTOC_VERSION=25.0
4
4
ARG PYTORCH_INDEX="https://download.pytorch.org/whl"
5
5
# ARG PYTORCH_INDEX="https://download.pytorch.org/whl/nightly"
6
6
ARG PYTORCH_VERSION=2.1.0
@@ -12,7 +12,7 @@ WORKDIR /app
12
12
RUN dnf remove -y --disableplugin=subscription-manager \
13
13
subscription-manager \
14
14
# we install newer version of requests via pip
15
- python3-requests \
15
+ python3.11-requests \
16
16
&& dnf install -y make \
17
17
# to help with debugging
18
18
procps \
@@ -128,9 +128,10 @@ RUN cargo install --path .
128
128
# # Tests base ##################################################################
129
129
FROM base as test-base
130
130
131
- RUN dnf install -y make unzip python39 python3-pip gcc openssl-devel gcc-c++ && \
131
+ RUN dnf install -y make unzip python3.11 python3.11-pip gcc openssl-devel gcc-c++ && \
132
132
dnf clean all && \
133
- ln -s /usr/bin/python3 /usr/local/bin/python && ln -s /usr/bin/pip3 /usr/local/bin/pip
133
+ ln -fs /usr/bin/python3.11 /usr/bin/python3 && \
134
+ ln -s /usr/bin/python3.11 /usr/local/bin/python && ln -s /usr/bin/pip3.11 /usr/local/bin/pip
134
135
135
136
RUN pip install --upgrade pip && pip install pytest && pip install pytest-asyncio
136
137
@@ -141,6 +142,7 @@ ENV CUDA_VISIBLE_DEVICES=""
141
142
FROM test-base as cpu-tests
142
143
ARG PYTORCH_INDEX
143
144
ARG PYTORCH_VERSION
145
+ ARG SITE_PACKAGES=/usr/local/lib/python3.11/site-packages
144
146
145
147
WORKDIR /usr/src
146
148
@@ -157,8 +159,7 @@ RUN cd server && \
157
159
pip install ".[accelerate]" --no-cache-dir
158
160
159
161
# Patch codegen model changes into transformers 4.34
160
- RUN cp server/transformers_patch/modeling_codegen.py \
161
- /usr/local/lib/python3.*/site-packages/transformers/models/codegen/modeling_codegen.py
162
+ RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py
162
163
163
164
# Install router
164
165
COPY --from=router-builder /usr/local/cargo/bin/text-generation-router /usr/local/bin/text-generation-router
@@ -177,7 +178,7 @@ ARG PYTORCH_VERSION
177
178
RUN dnf install -y unzip git ninja-build && dnf clean all
178
179
179
180
RUN cd ~ && \
180
- curl -L -O https://repo.anaconda.com/miniconda/Miniconda3-py39_23.5.2-0-Linux-x86_64.sh && \
181
+ curl -L -O https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Linux-x86_64.sh && \
181
182
chmod +x Miniconda3-*-Linux-x86_64.sh && \
182
183
bash ./Miniconda3-*-Linux-x86_64.sh -bf -p /opt/miniconda && \
183
184
/opt/miniconda/bin/conda update -y --all && \
@@ -191,7 +192,7 @@ RUN if [ -d "/opt/miniconda/pkgs/conda-content-trust-*/info/test/tests" ]; then
191
192
ENV PATH=/opt/miniconda/bin:$PATH
192
193
193
194
# Install specific version of torch
194
- RUN pip install ninja==1.11.1 --no-cache-dir
195
+ RUN pip install ninja==1.11.1.1 --no-cache-dir
195
196
RUN pip install torch==$PYTORCH_VERSION+cu118 --index-url "${PYTORCH_INDEX}/cu118" --no-cache-dir
196
197
197
198
@@ -219,6 +220,23 @@ FROM python-builder as build
219
220
COPY server/custom_kernels/ /usr/src/.
220
221
RUN cd /usr/src && python setup.py build_ext && python setup.py install
221
222
223
+
224
+ # # Build transformers exllama kernels ##########################################
225
+ FROM python-builder as exllama-kernels-builder
226
+
227
+ WORKDIR /usr/src
228
+
229
+ COPY server/exllama_kernels/ .
230
+ RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
231
+
232
+ # # Build transformers exllamav2 kernels ########################################
233
+ FROM python-builder as exllamav2-kernels-builder
234
+
235
+ WORKDIR /usr/src
236
+
237
+ COPY server/exllamav2_kernels/ .
238
+ RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
239
+
222
240
# # Flash attention cached build image ##########################################
223
241
FROM base as flash-att-cache
224
242
COPY --from=flash-att-builder /usr/src/flash-attention/build /usr/src/flash-attention/build
@@ -233,6 +251,9 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build /usr/src/flas
233
251
234
252
# # Final Inference Server image ################################################
235
253
FROM cuda-runtime as server-release
254
+ ARG SITE_PACKAGES=/opt/miniconda/lib/python3.11/site-packages
255
+
256
+ RUN dnf update -y
236
257
237
258
# Install C++ compiler (required at runtime when PT2_COMPILE is enabled)
238
259
RUN dnf install -y gcc-c++ && dnf clean all \
@@ -247,21 +268,26 @@ ENV PATH=/opt/miniconda/bin:$PATH
247
268
# These could instead come from explicitly cached images
248
269
249
270
# Copy build artifacts from flash attention builder
250
- COPY --from=flash-att-cache /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/miniconda/lib/python3.9/site-packages
251
- COPY --from=flash-att-cache /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/miniconda/lib/python3.9/site-packages
252
- COPY --from=flash-att-cache /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/miniconda/lib/python3.9/site-packages
271
+ COPY --from=flash-att-cache /usr/src/flash-attention/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
272
+ COPY --from=flash-att-cache /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
273
+ COPY --from=flash-att-cache /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
253
274
254
275
# Copy build artifacts from flash attention v2 builder
255
- COPY --from=flash-att-v2-cache /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/miniconda/lib/python3.9/site-packages
276
+ COPY --from=flash-att-v2-cache /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
277
+
278
+ # Copy build artifacts from exllama kernels builder
279
+ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
280
+
281
+ # Copy build artifacts from exllamav2 kernels builder
282
+ COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
256
283
257
284
# Install server
258
285
COPY proto proto
259
286
COPY server server
260
- RUN cd server && make gen-server && pip install ".[accelerate, onnx-gpu]" --no-cache-dir
287
+ RUN cd server && make gen-server && pip install ".[accelerate, onnx-gpu, quantize]" --no-cache-dir
261
288
262
- # Patch codegen model changes into transformers 4.34
263
- RUN cp server/transformers_patch/modeling_codegen.py \
264
- /opt/miniconda/lib/python3.*/site-packages/transformers/models/codegen/modeling_codegen.py
289
+ # Patch codegen model changes into transformers 4.34.0
290
+ RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py
265
291
266
292
# Install router
267
293
COPY --from=router-builder /usr/local/cargo/bin/text-generation-router /usr/local/bin/text-generation-router
@@ -276,7 +302,7 @@ ENV PORT=3000 \
276
302
RUN chmod -R g+rwx ${HOME}
277
303
278
304
# Temporary for dev
279
- RUN chmod -R g+w /opt/miniconda/lib/python3.*/site-packages /text_generation_server /usr/src /usr/local/bin
305
+ RUN chmod -R g+w ${SITE_PACKAGES} /text_generation_server /usr/src /usr/local/bin
280
306
281
307
# Run as non-root user by default
282
308
USER tgis
0 commit comments