Commit a595bdb

feat: support flash attention for Jina (#119)

1 parent 8c43390

17 files changed: +2,927 −295 lines

Cargo.lock

Lines changed: 331 additions & 255 deletions
Generated file; diff not rendered.

Cargo.toml

Lines changed: 5 additions & 4 deletions
@@ -18,12 +18,13 @@ homepage = "https://github.com/huggingface/text-embeddings-inference"
 
 [patch.crates-io]
 cudarc = { git = "https://github.com/OlivierDehaene/cudarc", rev = "c19522f1e411ab453d71bdfad3383b118cd4216f" }
-candle = { git = "https://github.com/OlivierDehaene/candle", rev = "9f2b4081b83a0e47ec1b12caa71d3cac7cc2161e", package = "candle-core" }
-candle-nn = { git = "https://github.com/OlivierDehaene/candle", rev = "9f2b4081b83a0e47ec1b12caa71d3cac7cc2161e", package = "candle-nn" }
-candle-transformers = { git = "https://github.com/OlivierDehaene/candle", rev = "9f2b4081b83a0e47ec1b12caa71d3cac7cc2161e", package = "candle-transformers" }
-candle-flash-attn = { git = "https://github.com/OlivierDehaene/candle", rev = "9f2b4081b83a0e47ec1b12caa71d3cac7cc2161e", package = "candle-flash-attn" }
+candle = { git = "https://github.com/OlivierDehaene/candle", rev = "7a181166d96480ec0302b496469427b3db0ab71b", package = "candle-core" }
+candle-nn = { git = "https://github.com/OlivierDehaene/candle", rev = "7a181166d96480ec0302b496469427b3db0ab71b", package = "candle-nn" }
+candle-transformers = { git = "https://github.com/OlivierDehaene/candle", rev = "7a181166d96480ec0302b496469427b3db0ab71b", package = "candle-transformers" }
+candle-flash-attn = { git = "https://github.com/OlivierDehaene/candle", rev = "7a181166d96480ec0302b496469427b3db0ab71b", package = "candle-flash-attn" }
 hf-hub = { git = "https://github.com/huggingface/hf-hub", rev = "b167f69692be5f49eb8003788f7f8a499a98b096" }
 
+
 [profile.release]
 debug = 1
 incremental = true

backends/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ clap = ["dep:clap", "text-embeddings-backend-core/clap"]
 python = ["dep:text-embeddings-backend-python"]
 candle = ["dep:text-embeddings-backend-candle"]
 cuda = ["text-embeddings-backend-candle?/cuda"]
+metal = ["text-embeddings-backend-candle?/metal"]
 mkl = ["text-embeddings-backend-candle?/mkl"]
 mkl-dynamic = ["text-embeddings-backend-candle?/mkl-dynamic"]
 accelerate = ["text-embeddings-backend-candle?/accelerate"]

backends/candle/Cargo.toml

Lines changed: 8 additions & 7 deletions
@@ -8,13 +8,13 @@ homepage.workspace = true
 [dependencies]
 accelerate-src = { version = "0.3.2", optional = true }
 intel-mkl-src = { version = "0.8.1", optional = true }
-candle = { version = "0.3.0", package = "candle-core", default-features = false }
-candle-nn = { version = "0.3.0" }
-candle-transformers = { version = "0.3.0" }
-candle-flash-attn = { version = "0.3.0", optional = true }
-candle-flash-attn-v1 = { git = "https://github.com/huggingface/candle-flash-attn-v1", rev = "62b75f1ea4e0961fad7b983ee8d723ed6fd68be5", optional = true }
-candle-cublaslt = { git = "https://github.com/huggingface/candle-cublaslt", rev = "58684e116aae248c353f87846ddf0b2a8a7ed855", optional = true }
-candle-layer-norm = { git = "https://github.com/huggingface/candle-layer-norm", rev = "5ed96012a693dff9685320765dd55a57fdaecdd6", optional = true }
+candle = { version = "^0.3", package = "candle-core", default-features = false }
+candle-nn = { version = "^0.3" }
+candle-transformers = { version = "^0.3" }
+candle-flash-attn = { version = "^0.3", optional = true }
+candle-flash-attn-v1 = { git = "https://github.com/huggingface/candle-flash-attn-v1", rev = "d5b873e4555b7f460ed639d96f26cb014f2daad7", optional = true }
+candle-cublaslt = { git = "https://github.com/huggingface/candle-cublaslt", rev = "c8a810ffe649c5f4634cbe1f0aaf02f6025fe5a5", optional = true }
+candle-layer-norm = { git = "https://github.com/huggingface/candle-layer-norm", rev = "0dd5bdceb9ba7cded921c62f9ddd66e7726327ba", optional = true }
 text-embeddings-backend-core = { path = "../core" }
 tracing = "^0.1"
 safetensors = "^0.4"

@@ -36,6 +36,7 @@ anyhow = { version = "1", features = ["backtrace"] }
 
 [features]
 accelerate = ["dep:accelerate-src", "candle/accelerate", "candle-nn/accelerate"]
+metal = ["candle/metal", "candle-nn/metal"]
 mkl = ["dep:intel-mkl-src", "intel-mkl-src/mkl-static-lp64-iomp", "candle/mkl", "candle-nn/mkl"]
 mkl-dynamic = ["dep:intel-mkl-src", "intel-mkl-src/mkl-dynamic-lp64-iomp", "candle/mkl-dynamic", "candle-nn/mkl-dynamic"]
 cuda = ["candle/cuda", "candle-nn/cuda", "dep:candle-cublaslt", "dep:candle-layer-norm"]

backends/candle/src/alibi.rs

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ fn get_slopes_power_of_2(n: usize) -> Vec<f64> {
     (0..n).map(|i| start * start.powi(i as i32)).collect()
 }
 
-fn alibi_head_slopes(num_attention_heads: usize) -> Vec<f64> {
+pub fn alibi_head_slopes(num_attention_heads: usize) -> Vec<f64> {
     if (num_attention_heads as f64).log2().fract() == 0.0 {
         // `num_attention_heads` is a power of 2
         get_slopes_power_of_2(num_attention_heads)
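Note: `alibi_head_slopes` becomes `pub`, presumably so the new flash Jina model can reuse the ALiBi slope computation. For reference, here is a self-contained sketch of the power-of-two case shown in the hunk; the value of `start` is defined outside this hunk, so the conventional ALiBi base 2^(-8/n) is assumed here:

```rust
/// Slopes for `n` attention heads when `n` is a power of two.
/// `start` is assumed to be the standard ALiBi base 2^(-8/n);
/// its definition sits above the hunk shown in this diff.
fn get_slopes_power_of_2(n: usize) -> Vec<f64> {
    let start = 2.0_f64.powf(-8.0 / n as f64);
    (0..n).map(|i| start * start.powi(i as i32)).collect()
}

fn main() {
    // With 8 heads the slopes are 1/2, 1/4, ..., 1/256.
    println!("{:?}", get_slopes_power_of_2(8));
}
```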

backends/candle/src/layers/layer_norm.rs

Lines changed: 2 additions & 2 deletions
@@ -27,7 +27,7 @@ impl LayerNorm {
         let _enter = self.span.enter();
 
         match hidden_states.device() {
-            Device::Cpu => {
+            Device::Cpu | Device::Metal(_) => {
                 let hidden_states = hidden_states.add(residual)?;
                 let hidden_states_dtype = hidden_states.dtype();
                 let internal_dtype = match hidden_states_dtype {

@@ -61,7 +61,7 @@ impl LayerNorm {
                     &hidden_states,
                     &residual,
                     &self.weight,
-                    &self.bias,
+                    Some(&self.bias),
                     self.epsilon,
                 )?;
                 result.reshape(original_shape)
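The first hunk routes Metal through the same non-fused path as the CPU: add the residual, then normalize in a wider dtype (the rest of that branch is outside the hunk). The second hunk adapts the CUDA call to the bumped `candle-layer-norm` revision, which now appears to take the bias as an `Option`. As a rough reference for what the non-fused path computes, a per-row sketch over plain slices (not the actual candle `Tensor` code):

```rust
/// Residual add followed by layer norm over one row: the math the
/// Cpu/Metal branch above implements on candle Tensors (sketch only).
fn add_layer_norm(hidden: &[f32], residual: &[f32], weight: &[f32], bias: &[f32], epsilon: f32) -> Vec<f32> {
    let n = hidden.len() as f32;
    // 1. residual connection
    let x: Vec<f32> = hidden.iter().zip(residual).map(|(h, r)| h + r).collect();
    // 2. mean and variance of the summed activations
    let mean = x.iter().sum::<f32>() / n;
    let var = x.iter().map(|v| (v - mean).powi(2)).sum::<f32>() / n;
    // 3. normalize, then scale by `weight` and shift by `bias`
    x.iter()
        .zip(weight.iter().zip(bias))
        .map(|(v, (w, b))| (v - mean) / (var + epsilon).sqrt() * w + b)
        .collect()
}

fn main() {
    let out = add_layer_norm(&[1.0, 2.0], &[0.5, -0.5], &[1.0, 1.0], &[0.0, 0.0], 1e-12);
    println!("{out:?}");
}
```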

backends/candle/src/lib.rs

Lines changed: 23 additions & 7 deletions
@@ -12,6 +12,8 @@ use crate::compute_cap::{
 };
 #[cfg(feature = "cuda")]
 use crate::models::FlashBertModel;
+#[cfg(feature = "cuda")]
+use crate::models::FlashJinaBertModel;
 use crate::models::{BertModel, JinaBertModel, Model, PositionEmbeddingType};
 use candle::{DType, Device};
 use candle_nn::VarBuilder;

@@ -36,10 +38,14 @@ impl CandleBackend {
             serde_json::from_str(&config).map_err(|err| BackendError::Start(err.to_string()))?;
 
         // Get candle device
-        let device = match Device::cuda_if_available(0) {
-            Ok(device) => device,
-            Err(err) => return Err(BackendError::Start(err.to_string())),
-        };
+        let device = if candle::utils::cuda_is_available() {
+            Device::new_cuda(0)
+        } else if candle::utils::metal_is_available() {
+            Device::new_metal(0)
+        } else {
+            Ok(Device::Cpu)
+        }
+        .map_err(|err| BackendError::Start(err.to_string()))?;
 
         // Check model type
         if config.model_type != Some("bert".to_string())

@@ -79,12 +85,12 @@ impl CandleBackend {
             .s()?;
 
         let model: Box<dyn Model + Send> = match device {
-            Device::Cpu => {
+            Device::Cpu | Device::Metal(_) => {
                 if config.position_embedding_type == PositionEmbeddingType::Alibi {
-                    tracing::info!("Starting JinaBert model on CPU");
+                    tracing::info!("Starting JinaBert model on {:?}", device);
                     Box::new(JinaBertModel::load(vb, &config, model_type).s()?)
                 } else {
-                    tracing::info!("Starting Bert model on CPU");
+                    tracing::info!("Starting Bert model on {:?}", device);
                     Box::new(BertModel::load(vb, &config, model_type).s()?)
                 }
             }

@@ -108,6 +114,16 @@ impl CandleBackend {
                 {
                     tracing::info!("Starting FlashBert model on Cuda");
                     Box::new(FlashBertModel::load(vb, &config, model_type).s()?)
+                } else if cfg!(feature = "flash-attn")
+                    && dtype == DType::F16
+                    && config.position_embedding_type == PositionEmbeddingType::Alibi
+                    && &std::env::var("USE_FLASH_ATTENTION")
+                        .unwrap_or("True".to_string())
+                        .to_lowercase()
+                        == "true"
+                {
+                    tracing::info!("Starting FlashJinaBertModel model on Cuda");
+                    Box::new(FlashJinaBertModel::load(vb, &config, model_type).s()?)
                 } else if config.position_embedding_type == PositionEmbeddingType::Alibi {
                     tracing::info!("Starting JinaBert model on Cuda");
                     Box::new(JinaBertModel::load(vb, &config, model_type).s()?)
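Two behavioural changes land in this file: device selection now cascades from CUDA to Metal to CPU instead of only probing CUDA, and a new branch loads `FlashJinaBertModel` when the `flash-attn` feature is compiled in, the dtype is F16, ALiBi position embeddings are used, and `USE_FLASH_ATTENTION` is not set to false (it defaults to true). Below is a minimal standalone sketch of the device cascade, assuming only the `candle-core` crate (imported as `candle`, matching the rename in backends/candle/Cargo.toml) and simplified error handling:

```rust
use candle::{Device, Result};

/// Pick the best available device: CUDA first, then Metal, then CPU.
/// Mirrors the cascade introduced above in backends/candle/src/lib.rs.
fn select_device() -> Result<Device> {
    if candle::utils::cuda_is_available() {
        Device::new_cuda(0)
    } else if candle::utils::metal_is_available() {
        Device::new_metal(0)
    } else {
        Ok(Device::Cpu)
    }
}

fn main() -> Result<()> {
    let device = select_device()?;
    println!("running on {device:?}");
    Ok(())
}
```

The explicit cascade is needed because `Device::cuda_if_available` only ever returns a CUDA or CPU device and never considers Metal.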

backends/candle/src/models.rs

Lines changed: 9 additions & 3 deletions
@@ -6,17 +6,23 @@ extern crate accelerate_src;
 
 mod bert;
 
+#[cfg(feature = "cuda")]
+mod flash_bert;
+
+#[cfg(feature = "cuda")]
+mod flash_jina;
+mod jina;
+
 pub use bert::{BertModel, Config, PositionEmbeddingType};
 use candle::{Result, Tensor};
 pub use jina::JinaBertModel;
 use text_embeddings_backend_core::Batch;
 
 #[cfg(feature = "cuda")]
-mod flash_bert;
-mod jina;
+pub use flash_bert::FlashBertModel;
 
 #[cfg(feature = "cuda")]
-pub use flash_bert::FlashBertModel;
+pub use flash_jina::FlashJinaBertModel;
 
 pub(crate) trait Model {
     fn is_padded(&self) -> bool;

backends/candle/src/models/bert.rs

Lines changed: 0 additions & 5 deletions
@@ -437,11 +437,6 @@ impl BertModel {
             ModelType::Embedding(pool) => (pool, None),
         };
 
-        // Check pool type
-        if pool != Pool::Mean && pool != Pool::Cls {
-            candle::bail!("Pool type {pool:?} is not supported");
-        }
-
         let (embeddings, encoder) = match (
            BertEmbeddings::load(vb.pp("embeddings"), config),
            BertEncoder::load(vb.pp("encoder"), config),

backends/candle/src/models/flash_bert.rs

Lines changed: 0 additions & 5 deletions
@@ -353,11 +353,6 @@ impl FlashBertModel {
             ModelType::Embedding(pool) => (pool, None),
         };
 
-        // Check pool type
-        if pool != Pool::Mean && pool != Pool::Cls {
-            candle::bail!("Pool type {pool:?} is not supported");
-        }
-
         let (embeddings, encoder) = match (
            BertEmbeddings::load(vb.pp("embeddings"), config),
            BertEncoder::load(vb.pp("encoder"), config),
