Skip to content

Commit 6e84d20

Browse files
authored
Merge branch 'opendatahub-io:main' into main
2 parents 379120b + a57bdf1 commit 6e84d20

File tree

120 files changed

+12480
-1803
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

120 files changed

+12480
-1803
lines changed

Cargo.lock

Lines changed: 135 additions & 250 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Dockerfile

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
## Global Args #################################################################
2-
ARG BASE_UBI_IMAGE_TAG=9.2-755.1696515532
3-
ARG PROTOC_VERSION=24.4
2+
ARG BASE_UBI_IMAGE_TAG=9.3-1361.1699548029
3+
ARG PROTOC_VERSION=25.0
44
ARG PYTORCH_INDEX="https://download.pytorch.org/whl"
55
#ARG PYTORCH_INDEX="https://download.pytorch.org/whl/nightly"
66
ARG PYTORCH_VERSION=2.1.0
@@ -12,7 +12,7 @@ WORKDIR /app
1212
RUN dnf remove -y --disableplugin=subscription-manager \
1313
subscription-manager \
1414
# we install newer version of requests via pip
15-
python3-requests \
15+
python3.11-requests \
1616
&& dnf install -y make \
1717
# to help with debugging
1818
procps \
@@ -128,9 +128,10 @@ RUN cargo install --path .
128128
## Tests base ##################################################################
129129
FROM base as test-base
130130

131-
RUN dnf install -y make unzip python39 python3-pip gcc openssl-devel gcc-c++ && \
131+
RUN dnf install -y make unzip python3.11 python3.11-pip gcc openssl-devel gcc-c++ && \
132132
dnf clean all && \
133-
ln -s /usr/bin/python3 /usr/local/bin/python && ln -s /usr/bin/pip3 /usr/local/bin/pip
133+
ln -fs /usr/bin/python3.11 /usr/bin/python3 && \
134+
ln -s /usr/bin/python3.11 /usr/local/bin/python && ln -s /usr/bin/pip3.11 /usr/local/bin/pip
134135

135136
RUN pip install --upgrade pip && pip install pytest && pip install pytest-asyncio
136137

@@ -141,6 +142,7 @@ ENV CUDA_VISIBLE_DEVICES=""
141142
FROM test-base as cpu-tests
142143
ARG PYTORCH_INDEX
143144
ARG PYTORCH_VERSION
145+
ARG SITE_PACKAGES=/usr/local/lib/python3.11/site-packages
144146

145147
WORKDIR /usr/src
146148

@@ -157,8 +159,7 @@ RUN cd server && \
157159
pip install ".[accelerate]" --no-cache-dir
158160

159161
# Patch codegen model changes into transformers 4.34
160-
RUN cp server/transformers_patch/modeling_codegen.py \
161-
/usr/local/lib/python3.*/site-packages/transformers/models/codegen/modeling_codegen.py
162+
RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py
162163

163164
# Install router
164165
COPY --from=router-builder /usr/local/cargo/bin/text-generation-router /usr/local/bin/text-generation-router
@@ -177,7 +178,7 @@ ARG PYTORCH_VERSION
177178
RUN dnf install -y unzip git ninja-build && dnf clean all
178179

179180
RUN cd ~ && \
180-
curl -L -O https://repo.anaconda.com/miniconda/Miniconda3-py39_23.5.2-0-Linux-x86_64.sh && \
181+
curl -L -O https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Linux-x86_64.sh && \
181182
chmod +x Miniconda3-*-Linux-x86_64.sh && \
182183
bash ./Miniconda3-*-Linux-x86_64.sh -bf -p /opt/miniconda && \
183184
/opt/miniconda/bin/conda update -y --all && \
@@ -191,7 +192,7 @@ RUN if [ -d " /opt/miniconda/pkgs/conda-content-trust-*/info/test/tests" ]; then
191192
ENV PATH=/opt/miniconda/bin:$PATH
192193

193194
# Install specific version of torch
194-
RUN pip install ninja==1.11.1 --no-cache-dir
195+
RUN pip install ninja==1.11.1.1 --no-cache-dir
195196
RUN pip install torch==$PYTORCH_VERSION+cu118 --index-url "${PYTORCH_INDEX}/cu118" --no-cache-dir
196197

197198

@@ -219,6 +220,23 @@ FROM python-builder as build
219220
COPY server/custom_kernels/ /usr/src/.
220221
RUN cd /usr/src && python setup.py build_ext && python setup.py install
221222

223+
224+
## Build transformers exllama kernels ##########################################
225+
FROM python-builder as exllama-kernels-builder
226+
227+
WORKDIR /usr/src
228+
229+
COPY server/exllama_kernels/ .
230+
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
231+
232+
## Build transformers exllamav2 kernels ########################################
233+
FROM python-builder as exllamav2-kernels-builder
234+
235+
WORKDIR /usr/src
236+
237+
COPY server/exllamav2_kernels/ .
238+
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
239+
222240
## Flash attention cached build image ##########################################
223241
FROM base as flash-att-cache
224242
COPY --from=flash-att-builder /usr/src/flash-attention/build /usr/src/flash-attention/build
@@ -233,6 +251,9 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build /usr/src/flas
233251

234252
## Final Inference Server image ################################################
235253
FROM cuda-runtime as server-release
254+
ARG SITE_PACKAGES=/opt/miniconda/lib/python3.11/site-packages
255+
256+
RUN dnf update -y
236257

237258
# Install C++ compiler (required at runtime when PT2_COMPILE is enabled)
238259
RUN dnf install -y gcc-c++ && dnf clean all \
@@ -247,21 +268,26 @@ ENV PATH=/opt/miniconda/bin:$PATH
247268
# These could instead come from explicitly cached images
248269

249270
# Copy build artifacts from flash attention builder
250-
COPY --from=flash-att-cache /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/miniconda/lib/python3.9/site-packages
251-
COPY --from=flash-att-cache /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/miniconda/lib/python3.9/site-packages
252-
COPY --from=flash-att-cache /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/miniconda/lib/python3.9/site-packages
271+
COPY --from=flash-att-cache /usr/src/flash-attention/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
272+
COPY --from=flash-att-cache /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
273+
COPY --from=flash-att-cache /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
253274

254275
# Copy build artifacts from flash attention v2 builder
255-
COPY --from=flash-att-v2-cache /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/miniconda/lib/python3.9/site-packages
276+
COPY --from=flash-att-v2-cache /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
277+
278+
# Copy build artifacts from exllama kernels builder
279+
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
280+
281+
# Copy build artifacts from exllamav2 kernels builder
282+
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-* ${SITE_PACKAGES}
256283

257284
# Install server
258285
COPY proto proto
259286
COPY server server
260-
RUN cd server && make gen-server && pip install ".[accelerate, onnx-gpu]" --no-cache-dir
287+
RUN cd server && make gen-server && pip install ".[accelerate, onnx-gpu, quantize]" --no-cache-dir
261288

262-
# Patch codegen model changes into transformers 4.34
263-
RUN cp server/transformers_patch/modeling_codegen.py \
264-
/opt/miniconda/lib/python3.*/site-packages/transformers/models/codegen/modeling_codegen.py
289+
# Patch codegen model changes into transformers 4.34.0
290+
RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py
265291

266292
# Install router
267293
COPY --from=router-builder /usr/local/cargo/bin/text-generation-router /usr/local/bin/text-generation-router
@@ -276,7 +302,7 @@ ENV PORT=3000 \
276302
RUN chmod -R g+rwx ${HOME}
277303

278304
# Temporary for dev
279-
RUN chmod -R g+w /opt/miniconda/lib/python3.*/site-packages/text_generation_server /usr/src /usr/local/bin
305+
RUN chmod -R g+w ${SITE_PACKAGES}/text_generation_server /usr/src /usr/local/bin
280306

281307
# Run as non-root user by default
282308
USER tgis

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,5 +56,7 @@ python-tests: build-test-image
5656
-e HUGGINGFACE_HUB_CACHE=/transformers_cache \
5757
-e TRANSFORMERS_CACHE=/transformers_cache cpu-tests:0 pytest -sv --ignore=server/tests/test_utils.py server/tests
5858

59+
clean:
60+
rm -rf target
5961

6062
.PHONY: build build-test-image integration-tests python-tests

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,3 +154,7 @@ They are all prefixed with `tgi_`. Descriptions will be added to the table below
154154
| `tgi_prefill_weight_limit_exceeded` | `counter` | | |
155155
| `tgi_prompt_load_failure` | `counter` | | |
156156
| `tgi_prompt_load_duration` | `histogram` | | |
157+
| `tgi_tokenize_request_count` | `counter` | | |
158+
| `tgi_tokenize_request_input_count` | `counter` | | |
159+
| `tgi_tokenize_request_tokens` | `histogram` | | |
160+
| `tgi_tokenize_request_duration` | `histogram` | | |

integration_tests/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
gen-client:
22
# Compile protos
3-
pip install grpcio-tools==1.58.0 mypy-protobuf==3.4.0 'types-protobuf>=3.20.4' --no-cache-dir
3+
pip install grpcio-tools==1.59.2 mypy-protobuf==3.4.0 'types-protobuf>=3.20.4' --no-cache-dir
44
mkdir text_generation_tests/pb || true
55
python -m grpc_tools.protoc -I../proto --python_out=text_generation_tests/pb \
66
--grpc_python_out=text_generation_tests/pb --mypy_out=text_generation_tests/pb ../proto/generation.proto

0 commit comments

Comments
 (0)