
Commit 2f75be2

fix: fix turing for Jina and limit concurrency in docker build (#121)
1 parent: 0e97f10 · commit: 2f75be2

File tree

7 files changed (+24, -34 lines)


Dockerfile-cuda

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.0.0-devel-ubuntu22.04 AS base-builder
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 AS base-builder
 
 ENV SCCACHE=0.5.4
 ENV RUSTC_WRAPPER=/usr/local/bin/sccache
@@ -100,7 +100,7 @@ RUN if [ ${CUDA_COMPUTE_CAP} -ge 75 -a ${CUDA_COMPUTE_CAP} -lt 80 ]; \
     cargo build --release --bin text-embeddings-router -F candle-cuda -F static-linking -F grpc --no-default-features && sccache -s; \
     fi;
 
-FROM nvidia/cuda:12.0.0-base-ubuntu22.04 as base
+FROM nvidia/cuda:12.2.0-base-ubuntu22.04 as base
 
 ARG DEFAULT_USE_FLASH_ATTENTION=True

README.md

Lines changed: 2 additions & 3 deletions

@@ -117,7 +117,7 @@ curl 127.0.0.1:8080/embed \
 
 **Note:** To use GPUs, you need to install
 the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
-We also recommend using NVIDIA drivers with CUDA version 12.0 or higher.
+NVIDIA drivers on your machine need to be compatible with CUDA version 12.2 or higher.
 
 To see all options to serve your models:
 
@@ -395,8 +395,7 @@ sudo apt-get install libssl-dev gcc -y
 
 GPUs with Cuda compute capabilities < 7.5 are not supported (V100, Titan V, GTX 1000 series, ...).
 
-Make sure you have Cuda and the nvidia drivers installed. We recommend using NVIDIA drivers with CUDA version 12.0 or
-higher.
+Make sure you have Cuda and the nvidia drivers installed. NVIDIA drivers on your device need to be compatible with CUDA version 12.2 or higher.
 You also need to add the nvidia binaries to your path:
 
 ```shell

backends/candle/src/flash_attn.rs

Lines changed: 5 additions & 0 deletions

@@ -6,6 +6,7 @@ pub(crate) fn flash_attn_varlen(
     q: &Tensor,
     k: &Tensor,
     v: &Tensor,
+    alibi_slopes: Option<&Tensor>,
     seqlens_q: &Tensor,
     seqlens_k: &Tensor,
     max_seqlen_q: usize,
@@ -16,6 +17,10 @@ pub(crate) fn flash_attn_varlen(
     let runtime_compute_cap = get_runtime_compute_cap();
 
     if runtime_compute_cap == 75 {
+        if alibi_slopes.is_some() {
+            candle::bail!("Flash attention v1 does not support alibi");
+        }
+
         #[cfg(feature = "flash-attn-v1")]
         {
             use candle_flash_attn_v1::flash_attn_varlen;
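For orientation, here is a minimal call-site sketch of the updated helper. The full parameter order is inferred from the signature above and the call sites in flash_bert.rs and flash_jina.rs below; the function name `attention_sketch` and the module path `crate::flash_attn::flash_attn_varlen` are illustrative assumptions, not part of the commit.

```rust
use candle::{Result, Tensor};

// Sketch: calling the crate-internal flash_attn_varlen after this change.
// `alibi_slopes` is None for plain BERT attention and Some(slopes) for Jina's
// ALiBi heads; on compute capability 7.5 the helper now bails when slopes are
// provided, because flash attention v1 has no ALiBi support.
#[allow(clippy::too_many_arguments)]
fn attention_sketch(
    q: &Tensor,
    k: &Tensor,
    v: &Tensor,
    alibi_slopes: Option<&Tensor>,
    cu_seqlens: &Tensor,
    max_s: usize,
    softmax_scale: f32,
) -> Result<Tensor> {
    crate::flash_attn::flash_attn_varlen(
        q,
        k,
        v,
        alibi_slopes,
        cu_seqlens, // seqlens_q
        cu_seqlens, // seqlens_k
        max_s,      // max_seqlen_q
        max_s,      // max_seqlen_k
        softmax_scale,
        false, // causal
    )
}
```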

backends/candle/src/models/flash_bert.rs

Lines changed: 1 addition & 0 deletions

@@ -155,6 +155,7 @@ impl BertAttention {
             &qkv[0],
             &qkv[1],
             &qkv[2],
+            None,
             cu_seqlens,
             cu_seqlens,
             max_s,

backends/candle/src/models/flash_jina.rs

Lines changed: 12 additions & 27 deletions

@@ -4,7 +4,6 @@ use crate::layers::{HiddenAct, LayerNorm, Linear};
 use crate::models::bert::{Config, PositionEmbeddingType};
 use crate::models::Model;
 use candle::{DType, Device, IndexOp, Result, Tensor};
-use candle_flash_attn::flash_attn_varlen_alibi;
 use candle_nn::{Embedding, Module, VarBuilder};
 use text_embeddings_backend_core::{Batch, ModelType, Pool};
 
@@ -161,32 +160,18 @@ impl AlibiBertAttention {
         let qkv = qkv.reshape(new_qkv_shape.as_slice())?;
         let qkv = qkv.chunk(3, 1)?;
 
-        let attention = if let Some(alibi_slopes) = &self.alibi_slopes {
-            flash_attn_varlen_alibi(
-                &qkv[0],
-                &qkv[1],
-                &qkv[2],
-                alibi_slopes,
-                cu_seqlens,
-                cu_seqlens,
-                max_s,
-                max_s,
-                self.softmax_scale,
-                false,
-            )
-        } else {
-            flash_attn_varlen(
-                &qkv[0],
-                &qkv[1],
-                &qkv[2],
-                cu_seqlens,
-                cu_seqlens,
-                max_s,
-                max_s,
-                self.softmax_scale,
-                false,
-            )
-        }?;
+        let attention = flash_attn_varlen(
+            &qkv[0],
+            &qkv[1],
+            &qkv[2],
+            self.alibi_slopes.as_ref(),
+            cu_seqlens,
+            cu_seqlens,
+            max_s,
+            max_s,
+            self.softmax_scale,
+            false,
+        )?;
         let attention = attention.flatten_from(candle::D::Minus2)?;
 
         let hidden_states = self.dense.forward(&attention)?;
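Design-wise, the effect suggested by this diff is that the ALiBi branching no longer lives in the Jina model: instead of choosing between `flash_attn_varlen_alibi` and `flash_attn_varlen` at the call site, `AlibiBertAttention` always calls the shared `flash_attn_varlen` wrapper and passes `self.alibi_slopes.as_ref()`, while the wrapper (see flash_attn.rs above) owns the check that flash attention v1 on compute capability 7.5 cannot apply ALiBi slopes and bails with an explicit error rather than dispatching to an unsupported kernel.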

docs/source/en/local_gpu.md

Lines changed: 1 addition & 1 deletion

@@ -21,7 +21,7 @@ To make sure that your hardware is supported, check out the [Supported models an
 
 ## Step 1: CUDA and NVIDIA drivers
 
-Make sure you have CUDA and the NVIDIA drivers installed - we recommend using NVIDIA drivers with CUDA version 12.2 or higher.
+Make sure you have CUDA and the NVIDIA drivers installed - NVIDIA drivers on your device need to be compatible with CUDA version 12.2 or higher.
 
 Add the NVIDIA binaries to your path:

docs/source/en/quick_tour.md

Lines changed: 1 addition & 1 deletion

@@ -23,7 +23,7 @@ The easiest way to get started with TEI is to use one of the official Docker con
 
 After making sure that your hardware is supported, install the
 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) if you
-plan on utilizing GPUs. We also recommend using NVIDIA drivers with CUDA version 12.2 or higher.
+plan on utilizing GPUs. NVIDIA drivers on your device need to be compatible with CUDA version 12.2 or higher.
 
 Next, install Docker following their [installation instructions](https://docs.docker.com/get-docker/).
