
Commit 2f75be2

fix: fix turing for Jina and limit concurrency in docker build (#121)
1 parent: 0e97f10 · commit: 2f75be2

File tree

7 files changed (+24, -34 lines)


Dockerfile-cuda

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.0.0-devel-ubuntu22.04 AS base-builder
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS base-builder
 
 ENV SCCACHE=0.5.4
 ENV RUSTC_WRAPPER=/usr/local/bin/sccache
@@ -100,7 +100,7 @@ RUN if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \
     cargo build --release --bin text-embeddings-router -F candle-cuda -F static-linking -F grpc --no-default-features && sccache -s; \
     fi;
 
-FROM nvidia/cuda:12.0.0-base-ubuntu22.04 as base
+FROM nvidia/cuda:12.2.0-base-ubuntu22.04 as base
 
 ARG DEFAULT_USE_FLASH_ATTENTION=True

README.md

Lines changed: 2 additions & 3 deletions

@@ -117,7 +117,7 @@ curl 127.0.0.1:8080/embed \
 
 **Note:** To use GPUs, you need to install
 the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
-We also recommend using NVIDIA drivers with CUDA version 12.0 or higher.
+NVIDIA drivers on your machine need to be compatible with CUDA version 12.2 or higher.
 
 To see all options to serve your models:
 
@@ -395,8 +395,7 @@ sudo apt-get install libssl-dev gcc -y
 
 GPUs with Cuda compute capabilities < 7.5 are not supported (V100, Titan V, GTX 1000 series, ...).
 
-Make sure you have Cuda and the nvidia drivers installed. We recommend using NVIDIA drivers with CUDA version 12.0 or
-higher.
+Make sure you have Cuda and the nvidia drivers installed. NVIDIA drivers on your device need to be compatible with CUDA version 12.2 or higher.
 You also need to add the nvidia binaries to your path:
 
 ```shell

backends/candle/src/flash_attn.rs

Lines changed: 5 additions & 0 deletions

@@ -6,6 +6,7 @@ pub(crate) fn flash_attn_varlen(
     q: &Tensor,
     k: &Tensor,
     v: &Tensor,
+    alibi_slopes: Option<&Tensor>,
     seqlens_q: &Tensor,
     seqlens_k: &Tensor,
     max_seqlen_q: usize,
@@ -16,6 +17,10 @@ pub(crate) fn flash_attn_varlen(
     let runtime_compute_cap = get_runtime_compute_cap();
 
     if runtime_compute_cap == 75 {
+        if alibi_slopes.is_some() {
+            candle::bail!("Flash attention v1 does not support alibi");
+        }
+
         #[cfg(feature = "flash-attn-v1")]
         {
             use candle_flash_attn_v1::flash_attn_varlen;
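For orientation, here is a minimal call-site sketch of the updated helper. The full parameter order is inferred from the signature above and the call sites in flash_bert.rs and flash_jina.rs below; the function name `attention_sketch` and the module path `crate::flash_attn::flash_attn_varlen` are illustrative assumptions, not part of the commit.

```rust
use candle::{Result, Tensor};

// Sketch: calling the crate-internal flash_attn_varlen after this change.
// `alibi_slopes` is None for plain BERT attention and Some(slopes) for Jina's
// ALiBi heads; on compute capability 7.5 the helper now bails when slopes are
// provided, because flash attention v1 has no ALiBi support.
#[allow(clippy::too_many_arguments)]
fn attention_sketch(
    q: &Tensor,
    k: &Tensor,
    v: &Tensor,
    alibi_slopes: Option<&Tensor>,
    cu_seqlens: &Tensor,
    max_s: usize,
    softmax_scale: f32,
) -> Result<Tensor> {
    crate::flash_attn::flash_attn_varlen(
        q,
        k,
        v,
        alibi_slopes,
        cu_seqlens, // seqlens_q
        cu_seqlens, // seqlens_k
        max_s,      // max_seqlen_q
        max_s,      // max_seqlen_k
        softmax_scale,
        false, // causal
    )
}
```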

backends/candle/src/models/flash_bert.rs

Lines changed: 1 addition & 0 deletions

@@ -155,6 +155,7 @@ impl BertAttention {
             &qkv[0],
             &qkv[1],
             &qkv[2],
+            None,
             cu_seqlens,
             cu_seqlens,
             max_s,

backends/candle/src/models/flash_jina.rs

Lines changed: 12 additions & 27 deletions

@@ -4,7 +4,6 @@ use crate::layers::{HiddenAct, LayerNorm, Linear};
 use crate::models::bert::{Config, PositionEmbeddingType};
 use crate::models::Model;
 use candle::{DType, Device, IndexOp, Result, Tensor};
-use candle_flash_attn::flash_attn_varlen_alibi;
 use candle_nn::{Embedding, Module, VarBuilder};
 use text_embeddings_backend_core::{Batch, ModelType, Pool};
 
@@ -161,32 +160,18 @@ impl AlibiBertAttention {
         let qkv = qkv.reshape(new_qkv_shape.as_slice())?;
         let qkv = qkv.chunk(3, 1)?;
 
-        let attention = if let Some(alibi_slopes) = &self.alibi_slopes {
-            flash_attn_varlen_alibi(
-                &qkv[0],
-                &qkv[1],
-                &qkv[2],
-                alibi_slopes,
-                cu_seqlens,
-                cu_seqlens,
-                max_s,
-                max_s,
-                self.softmax_scale,
-                false,
-            )
-        } else {
-            flash_attn_varlen(
-                &qkv[0],
-                &qkv[1],
-                &qkv[2],
-                cu_seqlens,
-                cu_seqlens,
-                max_s,
-                max_s,
-                self.softmax_scale,
-                false,
-            )
-        }?;
+        let attention = flash_attn_varlen(
+            &qkv[0],
+            &qkv[1],
+            &qkv[2],
+            self.alibi_slopes.as_ref(),
+            cu_seqlens,
+            cu_seqlens,
+            max_s,
+            max_s,
+            self.softmax_scale,
+            false,
+        )?;
         let attention = attention.flatten_from(candle::D::Minus2)?;
 
         let hidden_states = self.dense.forward(&attention)?;
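Design-wise, the effect suggested by this diff is that the ALiBi branching no longer lives in the Jina model: instead of choosing between `flash_attn_varlen_alibi` and `flash_attn_varlen` at the call site, `AlibiBertAttention` always calls the shared `flash_attn_varlen` wrapper and passes `self.alibi_slopes.as_ref()`, while the wrapper (see flash_attn.rs above) owns the check that flash attention v1 on compute capability 7.5 cannot apply ALiBi slopes and bails with an explicit error rather than dispatching to an unsupported kernel.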

docs/source/en/local_gpu.md

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@ To make sure that your hardware is supported, check out the [Supported models an
 
 ## Step 1: CUDA and NVIDIA drivers
 
-Make sure you have CUDA and the NVIDIA drivers installed - we recommend using NVIDIA drivers with CUDA version 12.2 or higher.
+Make sure you have CUDA and the NVIDIA drivers installed - NVIDIA drivers on your device need to be compatible with CUDA version 12.2 or higher.
 
 Add the NVIDIA binaries to your path:

docs/source/en/quick_tour.md

Lines changed: 1 addition & 1 deletion

@@ -23,7 +23,7 @@ The easiest way to get started with TEI is to use one of the official Docker con
 
 After making sure that your hardware is supported, install the
 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) if you
-plan on utilizing GPUs. We also recommend using NVIDIA drivers with CUDA version 12.2 or higher.
+plan on utilizing GPUs. NVIDIA drivers on your device need to be compatible with CUDA version 12.2 or higher.
 
 Next, install Docker following their [installation instructions](https://docs.docker.com/get-docker/).
