feat: add cuda all image to facilitate deployment (#186)

OlivierDehaene · web-flow · commit b7be6c09df4b · 2024-03-05T18:42:31.000+01:00
diff --git a/.github/workflows/build_all.yaml b/.github/workflows/build_all.yaml
@@ -0,0 +1,92 @@
+ # Builds the CUDA "all" image (Dockerfile-cuda-all) and pushes it to the internal
+ # registry and to ghcr.io, either on version tags ('v*') or on manual dispatch.
+ name: Build and push Cuda docker image to registry
+
+ on:
+   workflow_dispatch:
+   push:
+     tags:
+       - 'v*'
+
+ jobs:
+   build-and-push-image:
+     concurrency:
+       group: ${{ github.workflow }}-${{ github.job }}-all-${{ github.head_ref || github.run_id }}
+       cancel-in-progress: true
+     runs-on: [self-hosted, intel-cpu, 32-cpu, tgi-ci]
+     permissions:
+       contents: write
+       packages: write
+       # This is used to complete the identity challenge
+       # with sigstore/fulcio when running outside of PRs.
+       id-token: write
+       security-events: write
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v3
+       - name: Initialize Docker Buildx
+         uses: docker/setup-buildx-action@v2.0.0
+         with:
+           install: true
+       # Re-export the Actions cache URL and runtime token as job environment
+       # variables so the build step can forward them as Docker build args.
+       - name: Configure sccache
+         uses: actions/github-script@v6
+         with:
+           script: |
+             core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
+             core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
+       - name: Inject slug/short variables
+         uses: rlespinasse/github-slug-action@v4.4.1
+       - name: Tailscale
+         uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
+         with:
+           authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
+       - name: Login to GitHub Container Registry
+         if: github.event_name != 'pull_request'
+         uses: docker/login-action@v2
+         with:
+           registry: ghcr.io
+           username: ${{ github.actor }}
+           password: ${{ secrets.GITHUB_TOKEN }}
+       - name: Login to internal Container Registry
+         uses: docker/login-action@v2.1.0
+         with:
+           username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
+           password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
+           registry: registry.internal.huggingface.tech
+       - name: Extract metadata (tags, labels) for Docker
+         id: meta
+         uses: docker/metadata-action@v4.3.0
+         with:
+           images: |
+             registry.internal.huggingface.tech/api-inference/text-embeddings-inference
+             ghcr.io/huggingface/text-embeddings-inference
+           flavor: |
+             latest=false
+           # cuda-latest is only enabled when the ref is the repository default branch.
+           tags: |
+             type=semver,pattern=cuda-{{version}}
+             type=semver,pattern=cuda-{{major}}.{{minor}}
+             type=raw,value=cuda-latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
+             type=raw,value=cuda-sha-${{ env.GITHUB_SHA_SHORT }}
+       - name: Build and push Docker image
+         id: build-and-push
+         uses: docker/build-push-action@v4
+         with:
+           context: .
+           file: Dockerfile-cuda-all
+           push: ${{ github.event_name != 'pull_request' }}
+           platforms: 'linux/amd64'
+           # GIT_SHA is read from the `github` context: default runner variables such
+           # as GITHUB_SHA are not available through the `env` context.
+           build-args: |
+             SCCACHE_GHA_ENABLED=on
+             ACTIONS_CACHE_URL=${{ env.ACTIONS_CACHE_URL }}
+             ACTIONS_RUNTIME_TOKEN=${{ env.ACTIONS_RUNTIME_TOKEN }}
+             GIT_SHA=${{ github.sha }}
+             DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
+           tags: ${{ steps.meta.outputs.tags }}
+           labels: ${{ steps.meta.outputs.labels }}
+           cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-all,mode=max
+           cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-all,mode=max
diff --git a/Cargo.toml b/Cargo.toml
@@ -26,7 +26,7 @@ hf-hub = { git = "https://github.com/huggingface/hf-hub", rev = "b167f69692be5f4
 
 
 [profile.release]
-debug = 1
+debug = 0
 incremental = true
 lto = "off"
 panic = "abort"
diff --git a/Dockerfile-cuda-all b/Dockerfile-cuda-all
@@ -0,0 +1,111 @@
+# Builds text-embeddings-router once per CUDA compute capability (75, 80 and 90)
+# and ships the three binaries in a single runtime image. cuda-all-entrypoint.sh
+# is installed as the entrypoint.
+
+# Shared toolchain stage: CUDA devel image with sccache, Rust and cargo-chef.
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS base-builder
+
+ENV SCCACHE=0.5.4
+ENV RUSTC_WRAPPER=/usr/local/bin/sccache
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    curl \
+    libssl-dev \
+    pkg-config \
+    && rm -rf /var/lib/apt/lists/*
+
+# Download and configure sccache
+RUN curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache && \
+    chmod +x /usr/local/bin/sccache
+
+RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
+RUN cargo install cargo-chef --locked
+
+# Planner stage: compute the cargo-chef recipe from the workspace sources.
+FROM base-builder AS planner
+
+WORKDIR /usr/src
+
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+RUN cargo chef prepare  --recipe-path recipe.json
+
+# Common parent of the per-compute-capability build stages.
+FROM base-builder AS builder
+
+ARG GIT_SHA
+ARG DOCKER_LABEL
+
+# sccache specific variables
+ARG ACTIONS_CACHE_URL
+ARG ACTIONS_RUNTIME_TOKEN
+ARG SCCACHE_GHA_ENABLED
+
+WORKDIR /usr/src
+
+COPY --from=planner /usr/src/recipe.json recipe.json
+
+# Compute capability 75 build (candle-cuda-turing feature).
+FROM builder AS builder-75
+
+RUN CUDA_COMPUTE_CAP=75 cargo chef cook --release --features candle-cuda-turing --no-default-features --recipe-path recipe.json && sccache -s
+
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+RUN CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F http --no-default-features && sccache -s
+
+# Compute capability 80 build (candle-cuda feature).
+FROM builder AS builder-80
+
+RUN CUDA_COMPUTE_CAP=80 cargo chef cook --release --features candle-cuda --no-default-features --recipe-path recipe.json && sccache -s
+
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+RUN CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda -F http --no-default-features && sccache -s
+
+# Compute capability 90 build (candle-cuda feature).
+FROM builder AS builder-90
+
+RUN CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --no-default-features --recipe-path recipe.json && sccache -s
+
+COPY backends backends
+COPY core core
+COPY router router
+COPY Cargo.toml ./
+COPY Cargo.lock ./
+
+RUN CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda -F http --no-default-features && sccache -s
+
+# Runtime image: only the three router binaries and the entrypoint script.
+FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04 AS base
+
+ARG DEFAULT_USE_FLASH_ATTENTION=True
+
+ENV HUGGINGFACE_HUB_CACHE=/data \
+    PORT=80 \
+    USE_FLASH_ATTENTION=$DEFAULT_USE_FLASH_ATTENTION
+
+COPY --from=builder-75 /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router-75
+COPY --from=builder-80 /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router-80
+COPY --from=builder-90 /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router-90
+
+# Absolute paths keep the entrypoint resolvable whatever the working directory
+# of the container is (e.g. `docker run -w ...`).
+COPY cuda-all-entrypoint.sh /entrypoint.sh
+RUN chmod +x /entrypoint.sh
+
+ENTRYPOINT ["/entrypoint.sh"]
+CMD ["--json-output"]
diff --git a/backends/candle/src/models/distilbert.rs b/backends/candle/src/models/distilbert.rs
@@ -364,9 +364,7 @@ impl DistilBertSpladeHead {
         let hidden_states = self.vocab_transform.forward(hidden_states)?;
         let hidden_states = self.vocab_layer_norm.forward(&hidden_states, None)?;
         let hidden_states = self.vocab_projector.forward(&hidden_states)?;
-        Ok(hidden_states)
-
-        // (1.0 + hidden_states)?.log()
+        (1.0 + hidden_states)?.log()
     }
 }
 
diff --git a/cuda-all-entrypoint.sh b/cuda-all-entrypoint.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Entrypoint of the CUDA "all" image: picks the text-embeddings-router binary
+# matching the compute capability reported by nvidia-smi and replaces this
+# shell with it, forwarding every argument.
+
+if ! command -v nvidia-smi &> /dev/null; then
+    echo "Error: 'nvidia-smi' command not found."
+    exit 1
+fi
+
+# Keep the second line of the CSV output (the first one is the header) and
+# drop the dot so that e.g. "8.6" becomes "86".
+compute_cap=$(nvidia-smi --query-gpu=compute_cap --format=csv | sed -n '2p' | sed 's/\.//g')
+
+# Fail with a clear message instead of letting the numeric tests below error
+# out when the query returned nothing or something that is not a number.
+if ! [[ "${compute_cap}" =~ ^[0-9]+$ ]]; then
+    echo "Error: could not determine the cuda compute cap (got '${compute_cap}')."
+    exit 1
+fi
+
+if [ "${compute_cap}" -eq 75 ]
+then
+    exec text-embeddings-router-75 "$@"
+elif [ "${compute_cap}" -ge 80 ] && [ "${compute_cap}" -lt 90 ]
+then
+    exec text-embeddings-router-80 "$@"
+elif [ "${compute_cap}" -eq 90 ]
+then
+    exec text-embeddings-router-90 "$@"
+else
+    echo "cuda compute cap ${compute_cap} is not supported"; exit 1
+fi

Original file line number	Diff line number	Diff line change
`@@ -364,9 +364,7 @@ impl DistilBertSpladeHead {`
`364`	`364`	`let hidden_states = self.vocab_transform.forward(hidden_states)?;`
`365`	`365`	`let hidden_states = self.vocab_layer_norm.forward(&hidden_states, None)?;`
`366`	`366`	`let hidden_states = self.vocab_projector.forward(&hidden_states)?;`
`367`		`- Ok(hidden_states)`
`368`		`-`
`369`		`- // (1.0 + hidden_states)?.log()`
	`367`	`+ (1.0 + hidden_states)?.log()`
`370`	`368`	`}`
`371`	`369`	`}`
`372`	`370`