Skip to content

Commit b7be6c0

Browse files
feat: add cuda all image to facilitate deployment (#186)
1 parent 7b85d8c commit b7be6c0

File tree

5 files changed

+206
-4
lines changed

5 files changed

+206
-4
lines changed

.github/workflows/build_all.yaml

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# Builds the image described by Dockerfile-cuda-all and pushes it to the
# registries listed under the `images` input of the metadata step.
# Runs on manual dispatch and on every pushed tag matching 'v*'.
name: Build and push Cuda docker image to registry

on:
  workflow_dispatch:
  push:
    tags:
      - 'v*'

jobs:
  build-and-push-image:
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-all-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
    runs-on: [self-hosted, intel-cpu, 32-cpu, tgi-ci]
    permissions:
      contents: write
      packages: write
      # This is used to complete the identity challenge
      # with sigstore/fulcio when running outside of PRs.
      id-token: write
      security-events: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Initialize Docker Buildx
        # NOTE(review): the action reference was mangled by e-mail obfuscation
        # in the captured page; name restored from the step title, version
        # reconstructed -- confirm against the repository.
        uses: docker/setup-buildx-action@v2.0.0
        with:
          install: true
      - name: Configure sccache
        uses: actions/github-script@v6
        with:
          # Re-export the runner's cache credentials as job-level environment
          # variables so the build step can forward them as build args.
          script: |
            core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
            core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
      - name: Inject slug/short variables
        # NOTE(review): reference restored from obfuscated text (see above);
        # presumably the source of env.GITHUB_SHA_SHORT used below -- confirm version.
        uses: rlespinasse/github-slug-action@v4.4.1
      - name: Tailscale
        uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
        with:
          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
      - name: Login to GitHub Container Registry
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Login to internal Container Registry
        # NOTE(review): reference restored from obfuscated text -- confirm version.
        uses: docker/login-action@v2.1.0
        with:
          username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
          password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
          registry: registry.internal.huggingface.tech
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        # NOTE(review): reference restored from obfuscated text -- confirm version.
        uses: docker/metadata-action@v4.3.0
        with:
          images: |
            registry.internal.huggingface.tech/api-inference/text-embeddings-inference
            ghcr.io/huggingface/text-embeddings-inference
          flavor: |
            latest=false
          tags: |
            type=semver,pattern=cuda-{{version}}
            type=semver,pattern=cuda-{{major}}.{{minor}}
            type=raw,value=cuda-latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
            type=raw,value=cuda-sha-${{ env.GITHUB_SHA_SHORT }}
      - name: Build and push Docker image
        id: build-and-push
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile-cuda-all
          push: ${{ github.event_name != 'pull_request' }}
          platforms: 'linux/amd64'
          build-args: |
            SCCACHE_GHA_ENABLED=on
            ACTIONS_CACHE_URL=${{ env.ACTIONS_CACHE_URL }}
            ACTIONS_RUNTIME_TOKEN=${{ env.ACTIONS_RUNTIME_TOKEN }}
            GIT_SHA=${{ env.GITHUB_SHA }}
            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          # `mode` only applies when exporting a cache, so it is set on cache-to only.
          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-all
          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-all,mode=max

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ hf-hub = { git = "https://github.com/huggingface/hf-hub", rev = "b167f69692be5f4
2626

2727

2828
[profile.release]
29-
debug = 1
29+
debug = 0
3030
incremental = true
3131
lto = "off"
3232
panic = "abort"

Dockerfile-cuda-all

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# Multi-stage build: compiles text-embeddings-router three times, once per
# CUDA_COMPUTE_CAP value (75, 80, 90), and copies all three binaries into a
# single runtime image whose entrypoint script is cuda-all-entrypoint.sh.
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS base-builder

ENV SCCACHE=0.5.4
ENV RUSTC_WRAPPER=/usr/local/bin/sccache
ENV PATH="/root/.cargo/bin:${PATH}"

# Make piped RUN steps (curl | tar, curl | bash) fail when the download fails
# instead of only reporting the exit status of the last command in the pipe.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    curl \
    libssl-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Download and configure sccache
RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
    chmod +x /usr/local/bin/sccache

# Install the Rust toolchain and cargo-chef
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
RUN cargo install cargo-chef --locked

# planner: produces recipe.json from the workspace sources
FROM base-builder AS planner

WORKDIR /usr/src

COPY backends backends
COPY core core
COPY router router
COPY Cargo.toml ./
COPY Cargo.lock ./

RUN cargo chef prepare --recipe-path recipe.json

# builder: common parent of the per-compute-capability build stages
FROM base-builder AS builder

ARG GIT_SHA
ARG DOCKER_LABEL

# sccache specific variables
# NOTE(review): ACTIONS_RUNTIME_TOKEN is a credential passed as a build arg;
# consider a BuildKit secret mount if build-stage history may be exposed.
ARG ACTIONS_CACHE_URL
ARG ACTIONS_RUNTIME_TOKEN
ARG SCCACHE_GHA_ENABLED

WORKDIR /usr/src

COPY --from=planner /usr/src/recipe.json recipe.json

# Compute capability 75 (built with the candle-cuda-turing feature)
FROM builder AS builder-75

RUN CUDA_COMPUTE_CAP=75 cargo chef cook --release --features candle-cuda-turing --no-default-features --recipe-path recipe.json && sccache -s

COPY backends backends
COPY core core
COPY router router
COPY Cargo.toml ./
COPY Cargo.lock ./

RUN CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F http --no-default-features && sccache -s

# Compute capability 80 (built with the candle-cuda feature)
FROM builder AS builder-80

RUN CUDA_COMPUTE_CAP=80 cargo chef cook --release --features candle-cuda --no-default-features --recipe-path recipe.json && sccache -s

COPY backends backends
COPY core core
COPY router router
COPY Cargo.toml ./
COPY Cargo.lock ./

RUN CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda -F http --no-default-features && sccache -s

# Compute capability 90 (built with the candle-cuda feature)
FROM builder AS builder-90

RUN CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --no-default-features --recipe-path recipe.json && sccache -s

COPY backends backends
COPY core core
COPY router router
COPY Cargo.toml ./
COPY Cargo.lock ./

RUN CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda -F http --no-default-features && sccache -s

# Runtime image: only the three router binaries and the entrypoint script
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 AS base

ARG DEFAULT_USE_FLASH_ATTENTION=True

ENV HUGGINGFACE_HUB_CACHE=/data \
    PORT=80 \
    USE_FLASH_ATTENTION=$DEFAULT_USE_FLASH_ATTENTION

COPY --from=builder-75 /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router-75
COPY --from=builder-80 /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router-80
COPY --from=builder-90 /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router-90

# Absolute paths so the entrypoint still resolves when the container is
# started with a different working directory (e.g. `docker run -w`).
COPY cuda-all-entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

ENTRYPOINT ["/entrypoint.sh"]
CMD ["--json-output"]

backends/candle/src/models/distilbert.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -364,9 +364,7 @@ impl DistilBertSpladeHead {
364364
let hidden_states = self.vocab_transform.forward(hidden_states)?;
365365
let hidden_states = self.vocab_layer_norm.forward(&hidden_states, None)?;
366366
let hidden_states = self.vocab_projector.forward(&hidden_states)?;
367-
Ok(hidden_states)
368-
369-
// (1.0 + hidden_states)?.log()
367+
(1.0 + hidden_states)?.log()
370368
}
371369
}
372370

cuda-all-entrypoint.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
# Container entrypoint: reads the GPU compute capability from nvidia-smi and
# replaces this shell with the matching text-embeddings-router-<cap> binary,
# forwarding every command-line argument unchanged.
# Exits with status 1 when nvidia-smi is missing, when the compute capability
# cannot be read, or when it is not one of the supported values.

if ! command -v nvidia-smi &> /dev/null; then
    echo "Error: 'nvidia-smi' command not found."
    exit 1
fi

# Take line 2 of the CSV output (line 1 is presumably the header row) and
# strip the dot so the value can be compared as an integer, e.g. "8.6" -> "86".
compute_cap=$(nvidia-smi --query-gpu=compute_cap --format=csv | sed -n '2p' | sed 's/\.//g')

# nvidia-smi may fail or print something non-numeric; without this guard the
# integer comparisons below would emit confusing `[` syntax errors.
if ! [[ "${compute_cap}" =~ ^[0-9]+$ ]]; then
    echo "Error: could not determine cuda compute cap (got '${compute_cap}')."
    exit 1
fi

if [ "${compute_cap}" -eq 75 ]; then
    exec text-embeddings-router-75 "$@"
elif [ "${compute_cap}" -ge 80 ] && [ "${compute_cap}" -lt 90 ]; then
    exec text-embeddings-router-80 "$@"
elif [ "${compute_cap}" -eq 90 ]; then
    exec text-embeddings-router-90 "$@"
else
    echo "cuda compute cap ${compute_cap} is not supported"; exit 1
fi

0 commit comments

Comments
 (0)