Commit 0232c42

feat: add optimum-tpu TGI v0.2.3 (#139)
* feat: add optimum-tpu TGI v0.2.3. The main feature is the addition of Llama 3.1, 3.2 and 3.3 (text-only) models.
* fix: remove * when copying entrypoint
* review(TGI TPU): add a comment on why we install two python versions
1 parent 1c31c51 commit 0232c42

3 files changed: +246 −2 lines changed
containers/tgi/README.md (2 additions, 2 deletions)

@@ -132,8 +132,8 @@ The TGI containers come with two different variants depending on the accelerator
   docker build -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311 -f containers/tgi/gpu/2.3.1/Dockerfile .
   ```

-- **TPU**: You can build TGI container for Google Cloud TPUs on any machine with docker build, you do not need to build it on a TPU VM
+- **TPU**: You can build TGI container for Google Cloud TPUs on any machine with docker build, you do not need to build it on a TPU VM.

   ```bash
-  docker build --ulimit nofile=100000:100000 -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.2.py310 -f containers/tgi/tpu/0.2.2/Dockerfile .
+  docker build --ulimit nofile=100000:100000 -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.3.py310 -f containers/tgi/tpu/0.2.3/Dockerfile .
   ```
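For context, a rough sketch of how the resulting TPU image could be smoke-tested on a TPU VM (not part of this commit). The `--privileged` flag, the published port 8080, and the Llama 3.1 model ID are assumptions based on the Dockerfile and entrypoint added below; `MODEL_ID` and `MAX_BATCH_SIZE` are the variables the new entrypoint reads.

```bash
# Hypothetical smoke test on a TPU VM (flags and model ID are assumptions, see above).
docker run --rm --privileged --shm-size 16G -p 8080:8080 \
  -e MODEL_ID=meta-llama/Llama-3.1-8B-Instruct \
  -e MAX_BATCH_SIZE=4 \
  -e HF_TOKEN=<your-hf-token> \
  us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.3.py310
```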
containers/tgi/tpu/0.2.3/Dockerfile (new file, 199 additions)

# Enable GCP integration by default
ARG ENABLE_GOOGLE_FEATURE=1

# Fetch and extract the TGI sources
FROM alpine AS tgi
# TGI version 2.4.1 by default
ARG TGI_VERSION=v2.4.1
RUN test -n ${TGI_VERSION:?}
RUN mkdir -p /tgi
ADD https://github.com/huggingface/text-generation-inference/archive/${TGI_VERSION}.tar.gz /tgi/sources.tar.gz
RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1

# Build cargo components (adapted from TGI original Dockerfile)
# Note that the build image is aligned on the same Linux version as the base image (Debian bookworm/ Ubuntu 22.04)
FROM lukemathwalker/cargo-chef:latest-rust-1.80.1-bookworm AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/Cargo.lock Cargo.lock
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/backends backends
COPY --from=tgi /tgi/launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder
ARG ENABLE_GOOGLE_FEATURE
RUN echo "Google Feature Status: ${ENABLE_GOOGLE_FEATURE}"

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    python3.11-dev
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --profile release-opt --recipe-path recipe.json

COPY --from=tgi /tgi/Cargo.toml Cargo.toml
COPY --from=tgi /tgi/Cargo.lock Cargo.lock
COPY --from=tgi /tgi/rust-toolchain.toml rust-toolchain.toml
COPY --from=tgi /tgi/proto proto
COPY --from=tgi /tgi/benchmark benchmark
COPY --from=tgi /tgi/router router
COPY --from=tgi /tgi/backends backends
COPY --from=tgi /tgi/launcher launcher
RUN if [ -n "$ENABLE_GOOGLE_FEATURE" ]; then \
    cargo build --profile release-opt --features google; \
    else \
    cargo build --profile release-opt; \
    fi

# Python base image
FROM ubuntu:22.04 AS base

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    python3-pip \
    python3-setuptools \
    python-is-python3 \
    git \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
RUN pip3 --no-cache-dir install --upgrade pip

ARG ENABLE_GOOGLE_FEATURE
ARG VERSION='0.2.3'
RUN test -n ${VERSION:?}

FROM base AS optimum-tpu-installer

COPY . /tmp/src

RUN if [ -n "$ENABLE_GOOGLE_FEATURE" ]; then \
    # If we are building for GCP, we need to clone the optimum-tpu repo as this is built from the huggingface/Google-Cloud-Containers repository and not the huggingface/optimum-tpu repository
    git clone https://github.com/huggingface/optimum-tpu.git /opt/optimum-tpu && \
    cd /opt/optimum-tpu && git checkout v${VERSION}; \
    fi && \
    # Check if the optimum-tpu repo is cloned properly
    cp -a /tmp/src /opt/optimum-tpu && \
    if [ ! -d "/opt/optimum-tpu/optimum" ]; then \
    echo "Error: Building from incorrect repository. This build must be run from optimum-tpu repo. If building from google-cloud-containers repo, set ENABLE_GOOGLE_FEATURE=1 to automatically clone optimum-tpu" && \
    exit 1; \
    fi


# Python server build image
FROM base AS pyserver

RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    make \
    python3-venv \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

RUN install -d /pyserver
WORKDIR /pyserver
COPY --from=optimum-tpu-installer /opt/optimum-tpu/text-generation-inference/server server
COPY --from=tgi /tgi/proto proto
RUN pip3 install -r server/build-requirements.txt
RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto VERSION=${VERSION} make -C server gen-server

# TPU base image (used for deployment)
FROM base AS tpu_base

ARG VERSION=${VERSION}

# Install system prerequisites
# NOTE: we need both python3.10 and python3.11 to be installed, as the TGI router uses python 3.11 and optimum-tpu uses
# python 3.10. This has been fixed on newest version of optimum-tpu and will be removed in the next version (see
# https://github.com/huggingface/optimum-tpu/pull/135 for details).
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
    libpython3.10 \
    libpython3.11 \
    python3.11 \
    git \
    gnupg2 \
    wget \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Update pip
RUN pip install --upgrade pip

# Install HuggingFace packages
ARG TRANSFORMERS_VERSION='4.46.3'
ARG ACCELERATE_VERSION='1.1.1'
ARG SAFETENSORS_VERSION='0.4.5'

ARG ENABLE_GOOGLE_FEATURE

ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV VERSION=${VERSION}

ENV PORT=${ENABLE_GOOGLE_FEATURE:+8080}
ENV PORT=${PORT:-80}

ENV HF_HOME=${ENABLE_GOOGLE_FEATURE:+/tmp}
ENV HF_HOME=${HF_HOME:-/data}

# Install requirements for TGI, that uses python3.11
RUN python3.11 -m pip install transformers==${TRANSFORMERS_VERSION}

# Install requirements for optimum-tpu, then for TGI then optimum-tpu
RUN python3 -m pip install hf_transfer safetensors==${SAFETENSORS_VERSION} typer
COPY --from=optimum-tpu-installer /opt/optimum-tpu /opt/optimum-tpu
RUN python3 /opt/optimum-tpu/optimum/tpu/cli.py install-jetstream-pytorch --yes
RUN python3 -m pip install -e /opt/optimum-tpu \
    -f https://storage.googleapis.com/libtpu-releases/index.html

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release-opt/text-generation-router-v2 /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
# Install python server
COPY --from=pyserver /pyserver/build/dist dist
RUN pip install dist/text_generation_server*.tar.gz


# TPU compatible image for Inference Endpoints
FROM tpu_base AS inference-endpoint

COPY text-generation-inference/docker/entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"]

FROM tpu_base AS google-cloud-containers

# Install Google specific components if ENABLE_GOOGLE_FEATURE is set
RUN if [ -n "$ENABLE_GOOGLE_FEATURE" ]; then \
    apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    git && \
    rm -rf /var/lib/apt/lists/* && \
    echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
    | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \
    | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
    apt-get update -y && \
    apt-get install google-cloud-sdk -y; \
    fi

# Custom entrypoint for Google
COPY --chmod=775 containers/tgi/tpu/${VERSION}/entrypoint.sh entrypoint.sh
ENTRYPOINT ["./entrypoint.sh"]
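
As a sketch of how the stages above fit together, the GCP variant could be built from the Google-Cloud-Containers repository root as shown below; `--target google-cloud-containers` and the `--build-arg` values simply spell out the defaults declared in the Dockerfile, and the tag mirrors the README example.

```bash
# Hypothetical explicit build of the google-cloud-containers stage (defaults spelled out).
docker build \
  --ulimit nofile=100000:100000 \
  --target google-cloud-containers \
  --build-arg ENABLE_GOOGLE_FEATURE=1 \
  --build-arg TGI_VERSION=v2.4.1 \
  --build-arg VERSION=0.2.3 \
  -t us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-tpu.0.2.3.py310 \
  -f containers/tgi/tpu/0.2.3/Dockerfile .
```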
containers/tgi/tpu/0.2.3/entrypoint.sh (new file, 45 additions)

#!/bin/bash

# This is required by GKE, see
# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#privileged-mode
ulimit -l 68719476736

# Check if AIP_STORAGE_URI starts with "gs://"
if [[ $AIP_STORAGE_URI == gs://* ]]; then
    echo "AIP_STORAGE_URI set and starts with 'gs://', proceeding to download from GCS."
    echo "AIP_STORAGE_URI: $AIP_STORAGE_URI"

    # Define the target directory
    TARGET_DIR="/tmp/model"
    mkdir -p "$TARGET_DIR"

    # Use gcloud storage to copy the content from GCS to the target directory
    echo "Running: gcloud storage cp $AIP_STORAGE_URI/* $TARGET_DIR --recursive"
    gcloud storage cp "$AIP_STORAGE_URI/*" "$TARGET_DIR" --recursive

    # Check if the copy command was successful
    if [ $? -eq 0 ]; then
        echo "Model downloaded successfully to ${TARGET_DIR}."
        # Update MODEL_ID to point to the local directory
        echo "Updating MODEL_ID to point to the local directory."
        export MODEL_ID="$TARGET_DIR"
    else
        echo "Failed to download model from GCS."
        exit 1
    fi
fi

if [[ -z "${MAX_BATCH_SIZE}" ]]; then
    # Default to a batch size of 4 if no value is provided
    export MAX_BATCH_SIZE="4"
fi

if [[ -n "${QUANTIZATION}" ]]; then
    # If quantization is set, we use jetstream_int8 (this is the only option supported by optimum-tpu at the moment)
    QUANTIZATION="jetstream_int8"
    export QUANTIZATION="${QUANTIZATION}"
fi

ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'

exec text-generation-launcher "$@"
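
For reference, a minimal sketch (not part of this commit) of exercising the GCS download branch outside the container, assuming a hypothetical bucket and local gcloud credentials; it runs the same `gcloud storage cp` command as the entrypoint before a real deployment would.

```bash
# Hypothetical pre-deployment check: confirm the weights under a made-up GCS
# prefix are readable, mirroring what the entrypoint does at container start.
TARGET_DIR="/tmp/model"
AIP_STORAGE_URI="gs://my-bucket/llama-3.1-8b-instruct"   # hypothetical bucket
mkdir -p "$TARGET_DIR"
gcloud storage cp "$AIP_STORAGE_URI/*" "$TARGET_DIR" --recursive
export MODEL_ID="$TARGET_DIR"
```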
