huggingface
diff --git a/.github/workflows/matrix.json b/.github/workflows/matrix.json
Lines changed: 5 additions & 13 deletions
diff --git a/Dockerfile-cuda b/Dockerfile-cuda
Lines changed: 9 additions & 3 deletions
diff --git a/Dockerfile-cuda-all b/Dockerfile-cuda-all
Lines changed: 25 additions & 3 deletions
diff --git a/Dockerfile-cuda-blackwell b/Dockerfile-cuda-blackwell
Lines changed: 0 additions & 124 deletions
diff --git a/Dockerfile-cuda-blackwell-all b/Dockerfile-cuda-blackwell-all
Lines changed: 0 additions & 103 deletions
diff --git a/README.md b/README.md
Lines changed: 1 addition & 11 deletions
@@ -45,22 +45,14 @@
     "grpc": true,
     "dockerfile": "Dockerfile-cuda"
   },
-  {
-    "name": "turing-ampere-ada-hopper",
-    "imageNamePrefix": "cuda-",
-    "runOn": "always",
-    "sccache": false,
-    "grpc": false,
-    "dockerfile": "Dockerfile-cuda-all"
-  },
   {
     "name": "blackwell-100",
     "imageNamePrefix": "100-",
     "runOn": "always",
     "sccache": true,
     "cudaComputeCap": 100,
     "grpc": true,
-    "dockerfile": "Dockerfile-cuda-blackwell"
+    "dockerfile": "Dockerfile-cuda"
   },
   {
     "name": "blackwell-120",
@@ -69,15 +61,15 @@
     "sccache": true,
     "cudaComputeCap": 120,
     "grpc": true,
-    "dockerfile": "Dockerfile-cuda-blackwell"
+    "dockerfile": "Dockerfile-cuda"
   },
   {
-    "name": "blackwell",
-    "imageNamePrefix": "blackwell-",
+    "name": "all",
+    "imageNamePrefix": "cuda-",
     "runOn": "always",
     "sccache": false,
     "grpc": false,
-    "dockerfile": "Dockerfile-cuda-blackwell-all"
+    "dockerfile": "Dockerfile-cuda-all"
   },
   {
     "name": "cpu",
 
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.6.3-devel-ubuntu24.04 AS base-builder
+FROM nvidia/cuda:12.9.1-devel-ubuntu24.04 AS base-builder
 
 ENV SCCACHE=0.10.0
 ENV RUSTC_WRAPPER=/usr/local/bin/sccache
@@ -58,6 +58,12 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     elif [ ${CUDA_COMPUTE_CAP} -eq 90 ]; \
     then  \
     nvprune --generate-code code=sm_90 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
+    elif [ ${CUDA_COMPUTE_CAP} -eq 100 ]; \
+    then  \
+    nvprune --generate-code code=sm_100 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
+    elif [ ${CUDA_COMPUTE_CAP} -eq 120 ]; \
+    then  \
+    nvprune --generate-code code=sm_120 /usr/local/cuda/lib64/libcublas_static.a -o /usr/local/cuda/lib64/libcublas_static.a; \
     else  \
     echo "cuda compute cap ${CUDA_COMPUTE_CAP} is not supported"; exit 1; \
     fi;
@@ -113,7 +119,7 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     cargo build --release --bin text-embeddings-router -F candle-cuda -F static-linking -F grpc --no-default-features && sccache -s; \
     fi;
 
-FROM nvidia/cuda:12.6.3-base-ubuntu24.04 AS base
+FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04 AS base
 
 ARG DEFAULT_USE_FLASH_ATTENTION=True
 
@@ -126,7 +132,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     ca-certificates \
     libssl-dev \
     curl \
-    cuda-compat-12-6 \
+    cuda-compat-12-9 \
     && rm -rf /var/lib/apt/lists/*
 
 FROM base AS grpc
 
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.6.3-devel-ubuntu24.04 AS base-builder
+FROM nvidia/cuda:12.9.1-devel-ubuntu24.04 AS base-builder
 
 ENV SCCACHE=0.10.0
 ENV RUSTC_WRAPPER=/usr/local/bin/sccache
@@ -64,6 +64,14 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
     CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s;
 
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=100 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s;
+
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=120 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s;
+
 COPY backends backends
 COPY core core
 COPY router router
@@ -88,7 +96,19 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
 
 RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-90
 
-FROM nvidia/cuda:12.6.3-base-ubuntu24.04 AS base
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=100 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s;
+
+RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-100
+
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=120 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s;
+
+RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-120
+
+FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04 AS base
 
 ARG DEFAULT_USE_FLASH_ATTENTION=True
 
@@ -101,12 +121,14 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     ca-certificates \
     libssl-dev \
     curl \
-    cuda-compat-12-6 \
+    cuda-compat-12-9 \
     && rm -rf /var/lib/apt/lists/*
 
 COPY --from=builder /usr/src/target/release/text-embeddings-router-75 /usr/local/bin/text-embeddings-router-75
 COPY --from=builder /usr/src/target/release/text-embeddings-router-80 /usr/local/bin/text-embeddings-router-80
 COPY --from=builder /usr/src/target/release/text-embeddings-router-90 /usr/local/bin/text-embeddings-router-90
+COPY --from=builder /usr/src/target/release/text-embeddings-router-100 /usr/local/bin/text-embeddings-router-100
+COPY --from=builder /usr/src/target/release/text-embeddings-router-120 /usr/local/bin/text-embeddings-router-120
 
 COPY --chmod=775 cuda-all-entrypoint.sh entrypoint.sh
 
 
@@ -587,23 +587,13 @@ runtime_compute_cap=89
 # Example for Hopper (H100, ...)
 runtime_compute_cap=90
 
-docker build . -f Dockerfile-cuda --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
-```
-
-If your CUDA device architecture is Blackwell, then you need to run the following
-instead, as CUDA 12.9 is required, hence the Dockerfile differs:
-
-```shell
-# Get submodule dependencies
-git submodule update --init
-
 # Example for Blackwell (B200, GB200, ...)
 runtime_compute_cap=100
 
 # Example for Blackwell (GeForce RTX 50X0, RTX PRO 6000, ...)
 runtime_compute_cap=120
 
-docker build . -f Dockerfile-cuda-blackwell --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
+docker build . -f Dockerfile-cuda --build-arg CUDA_COMPUTE_CAP=$runtime_compute_cap
 ```
 
 ### Apple M1/M2 arm64 architectures