
Commit 539f322

Add support for voyage-4-nano embedding model
Add two new config fields to Qwen3 to support voyage-4-nano and similar models:

- `use_bidirectional_attention`: when true, disables causal masking for embedding models that use full bidirectional attention
- `num_labels`: when set, loads a projection layer from `linear.weight` at the safetensors root level (e.g., 1024 -> 2048 for voyage-4-nano)

Both fields are backwards compatible, defaulting to the existing (disabled) behavior.

Changes:

- backends/candle/src/models/qwen3.rs: add config fields and CPU implementation
- backends/candle/src/models/flash_qwen3.rs: add CUDA/flash-attn implementation
- backends/candle/tests/test_voyage_nano.rs: CPU tests with snapshots
- backends/candle/tests/test_flash_voyage_nano.rs: CUDA tests
- README.md, docs/source/en/supported_models.md: add voyage-4-nano

Tested with voyageai/voyage-4-nano:

- Output dimension: 2048 (correct)
- Cosine similarity vs. transformers: 0.999965
- Inference time: ~9 ms on an L4 GPU (vs. ~35 ms with transformers)
1 parent cb9de7a commit 539f322
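As an aside (not part of the commit), here is a minimal sketch of how the two new optional fields would deserialize from a model's `config.json` with serde. The struct name and values are illustrative only; the numbers mirror the 1024 -> 2048 example from the commit message, and configs that omit both fields keep the default causal, no-projection behavior.

```rust
// Minimal sketch, not repository code: shows how the two new optional config
// fields deserialize from config.json. Struct is trimmed to the relevant
// fields; requires the serde and serde_json crates.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct Qwen3ConfigSketch {
    hidden_size: usize,
    #[serde(default)]
    use_bidirectional_attention: Option<bool>,
    #[serde(default)]
    num_labels: Option<usize>,
}

fn main() -> serde_json::Result<()> {
    // voyage-4-nano-style config: full bidirectional attention plus a
    // 1024 -> 2048 projection loaded from `linear.weight`.
    let raw = r#"{
        "hidden_size": 1024,
        "use_bidirectional_attention": true,
        "num_labels": 2048
    }"#;
    let cfg: Qwen3ConfigSketch = serde_json::from_str(raw)?;
    assert_eq!(cfg.use_bidirectional_attention, Some(true));
    assert_eq!(cfg.num_labels, Some(2048));

    // Older configs omit both fields and keep the default causal behavior.
    let legacy: Qwen3ConfigSketch = serde_json::from_str(r#"{ "hidden_size": 1024 }"#)?;
    assert_eq!(legacy.use_bidirectional_attention, None);
    assert_eq!(legacy.num_labels, None);
    Ok(())
}
```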

File tree: 8 files changed (+8388, -3 lines)


README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -91,6 +91,7 @@ Below are some examples of the currently supported models:
 | N/A | 396M | ModernBERT | [answerdotai/ModernBERT-large](https://hf.co/answerdotai/ModernBERT-large) |
 | N/A | 137M | JinaBERT | [jinaai/jina-embeddings-v2-base-en](https://hf.co/jinaai/jina-embeddings-v2-base-en) |
 | N/A | 137M | JinaBERT | [jinaai/jina-embeddings-v2-base-code](https://hf.co/jinaai/jina-embeddings-v2-base-code) |
+| N/A | 340M | Qwen3 | [voyageai/voyage-4-nano](https://hf.co/voyageai/voyage-4-nano) |
 
 To explore the list of best performing text embeddings models, visit the
 [Massive Text Embedding Benchmark (MTEB) Leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
```

backends/candle/src/models/flash_qwen3.rs

Lines changed: 36 additions & 2 deletions

```diff
@@ -109,6 +109,7 @@ impl Qwen3Attention {
         cos: &Tensor,
         sin: &Tensor,
         max_s: usize,
+        causal: bool,
     ) -> Result<Tensor> {
         let _enter = self.span.enter();
 
@@ -158,7 +159,7 @@ impl Qwen3Attention {
             max_s,
             max_s,
             self.softmax_scale,
-            true,
+            causal,
             None,
             None,
         )?;
@@ -262,14 +263,15 @@ impl Qwen3Layer {
         cos: &Tensor,
         sin: &Tensor,
         max_s: usize,
+        causal: bool,
     ) -> Result<(Tensor, Tensor)> {
         let _enter = self.span.enter();
 
         let (normed_hidden_states, res) = self.input_layer_norm.forward(hidden_states, residual)?;
 
         let attn_output =
             self.attention
-                .forward(&normed_hidden_states, cu_seqlens, cos, sin, max_s)?;
+                .forward(&normed_hidden_states, cu_seqlens, cos, sin, max_s, causal)?;
 
         let (normed_attn_res_output, attn_res) = self
             .post_attention_layer_norm
@@ -285,9 +287,11 @@ pub struct FlashQwen3Model {
     embeddings: Embedding,
     layers: Vec<Qwen3Layer>,
     norm: RMSNorm,
+    projection: Option<Linear>,
     cos_cache: Tensor,
     sin_cache: Tensor,
     pool: Pool,
+    use_bidirectional_attention: bool,
     pub device: Device,
 
     span: tracing::Span,
@@ -313,6 +317,8 @@ impl FlashQwen3Model {
 
         // The Qwen3-Reranker models contain the `model` key
         // https://huggingface.co/collections/Qwen/qwen3-reranker-6841b22d0192d7ade9cdefea
+        // Keep reference to root vb for loading projection layer
+        let vb_root = vb.clone();
         let vb = if vb.contains_tensor("model.embed_tokens.weight") {
             vb.pp("model")
         } else {
@@ -331,6 +337,23 @@ impl FlashQwen3Model {
 
         let norm = RMSNorm::load(vb.pp("norm"), config.hidden_size, config.rms_norm_eps)?;
 
+        let projection = if let Some(num_labels) = config.num_labels {
+            if vb_root.contains_tensor("linear.weight") {
+                let projection_weight =
+                    vb_root.get((num_labels, config.hidden_size), "linear.weight")?;
+                Some(Linear::new(projection_weight, None, None))
+            } else {
+                tracing::warn!(
+                    "num_labels is set but linear.weight not found, skipping projection layer"
+                );
+                None
+            }
+        } else {
+            None
+        };
+
+        let use_bidirectional_attention = config.use_bidirectional_attention.unwrap_or(false);
+
         let inv_freqs = get_inv_freqs(
             layers[0].attention.attention_head_size,
             config.rope_theta,
@@ -348,9 +371,11 @@ impl FlashQwen3Model {
             embeddings,
             layers,
             norm,
+            projection,
             cos_cache,
             sin_cache,
             pool,
+            use_bidirectional_attention,
             device: vb.device().clone(),
             span: tracing::span!(tracing::Level::TRACE, "model"),
         })
@@ -376,6 +401,8 @@ impl FlashQwen3Model {
         let cos = index_select(&self.cos_cache, &position_ids, 0)?;
         let sin = index_select(&self.sin_cache, &position_ids, 0)?;
 
+        let causal = !self.use_bidirectional_attention;
+
         let mut residual = None;
         for layer in &self.layers {
             let (h, r) = layer.forward(
@@ -385,13 +412,20 @@ impl FlashQwen3Model {
                 &cos,
                 &sin,
                 batch.max_length as usize,
+                causal,
             )?;
             hidden_states = h;
             residual = Some(r);
         }
 
         let (outputs, _) = self.norm.forward(&hidden_states, residual.as_ref())?;
 
+        let outputs = if let Some(ref projection) = self.projection {
+            projection.forward(&outputs)?
+        } else {
+            outputs
+        };
+
         let has_pooling_requests = !batch.pooled_indices.is_empty();
         let has_raw_requests = !batch.raw_indices.is_empty();
 
```

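To illustrate what the `causal` flag threaded through this path toggles, here is a standalone sketch in plain Rust (no candle or flash-attn types, illustrative only): with causal masking a query may only attend to keys at or before its own position, while bidirectional attention leaves the full sequence visible. An additive bias of negative infinity stands in for a masked position.

```rust
// Minimal sketch, not repository code: builds the additive attention bias for a
// single sequence, either causal (upper triangle masked) or bidirectional (all zeros).
fn attention_bias(seq_len: usize, causal: bool) -> Vec<Vec<f32>> {
    (0..seq_len)
        .map(|query| {
            (0..seq_len)
                .map(|key| {
                    if causal && key > query {
                        f32::NEG_INFINITY // future position masked out
                    } else {
                        0.0 // position visible to this query
                    }
                })
                .collect()
        })
        .collect()
}

fn main() {
    // With `use_bidirectional_attention: true` the bias stays all-zero;
    // with the default causal behavior the upper triangle is masked.
    let bidirectional = attention_bias(4, false);
    assert!(bidirectional.iter().all(|row| row.iter().all(|&v| v == 0.0)));

    let causal = attention_bias(4, true);
    assert_eq!(causal[0][3], f32::NEG_INFINITY);
    assert_eq!(causal[3][0], 0.0);
}
```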
backends/candle/src/models/qwen3.rs

Lines changed: 36 additions & 1 deletion

```diff
@@ -24,6 +24,10 @@ pub struct Qwen3Config {
     pub sliding_window: Option<usize>,
     pub use_sliding_window: bool,
     pub eos_token_id: usize,
+    #[serde(default)]
+    pub use_bidirectional_attention: Option<bool>,
+    #[serde(default)]
+    pub num_labels: Option<usize>,
 }
 
 struct Qwen3Attention {
@@ -379,11 +383,13 @@ pub struct Qwen3Model {
     embeddings: Embedding,
     layers: Vec<Qwen3Layer>,
     norm: RMSNorm,
+    projection: Option<Linear>,
     rotary_cache: (Tensor, Tensor),
     rotary_dim: usize,
     pool: Pool,
     num_attention_heads: usize,
     pad_token_id: u32,
+    use_bidirectional_attention: bool,
 
     dtype: DType,
     device: Device,
@@ -402,6 +408,8 @@ impl Qwen3Model {
 
         // The Qwen3-Reranker models contain the `model` key
         // https://huggingface.co/collections/Qwen/qwen3-reranker-6841b22d0192d7ade9cdefea
+        // Keep reference to root vb for loading projection layer
+        let vb_root = vb.clone();
         let vb = if vb.contains_tensor("model.embed_tokens.weight") {
             vb.pp("model")
         } else {
@@ -420,6 +428,23 @@ impl Qwen3Model {
 
         let norm = RMSNorm::load(vb.pp("norm"), config.hidden_size, config.rms_norm_eps)?;
 
+        let projection = if let Some(num_labels) = config.num_labels {
+            if vb_root.contains_tensor("linear.weight") {
+                let projection_weight =
+                    vb_root.get((num_labels, config.hidden_size), "linear.weight")?;
+                Some(Linear::new(projection_weight, None, None))
+            } else {
+                tracing::warn!(
+                    "num_labels is set but linear.weight not found, skipping projection layer"
+                );
+                None
+            }
+        } else {
+            None
+        };
+
+        let use_bidirectional_attention = config.use_bidirectional_attention.unwrap_or(false);
+
         let rotary_dim = config
             .head_dim
             .unwrap_or(config.hidden_size / config.num_attention_heads);
@@ -433,11 +458,13 @@ impl Qwen3Model {
             embeddings,
             layers,
             norm,
+            projection,
             rotary_cache,
             rotary_dim,
             pool,
             pad_token_id: config.eos_token_id as u32,
             num_attention_heads: config.num_attention_heads,
+            use_bidirectional_attention,
             dtype: vb.dtype(),
             device: vb.device().clone(),
             span: tracing::span!(tracing::Level::TRACE, "model"),
@@ -555,7 +582,9 @@ impl Qwen3Model {
             (input_ids, position_ids, input_lengths, Some(attention_bias))
         };
 
-        let attention_bias = if let Some(attn_bias) = attention_bias {
+        let attention_bias = if self.use_bidirectional_attention {
+            attention_bias
+        } else if let Some(attn_bias) = attention_bias {
             Some(self.get_causal_attention_bias(attn_bias)?)
         } else {
             None
@@ -581,6 +610,12 @@ impl Qwen3Model {
 
         let (outputs, _) = self.norm.forward(&hidden_states, None)?;
 
+        let outputs = if let Some(ref projection) = self.projection {
+            projection.forward(&outputs)?
+        } else {
+            outputs
+        };
+
         let has_pooling_requests = !batch.pooled_indices.is_empty();
         let has_raw_requests = !batch.raw_indices.is_empty();
 
```

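For the projection path, a plain-Rust sketch with made-up shapes (not the repository's `Linear` type): `linear.weight` is stored as a `(num_labels, hidden_size)` matrix, so applying it maps each `hidden_size`-dimensional hidden state to a `num_labels`-dimensional embedding, which is how voyage-4-nano goes from 1024 to 2048 output dimensions.

```rust
// Minimal sketch, not repository code: each output dimension is the dot product
// of one weight row (length hidden_size) with the hidden state.
fn project(hidden: &[f32], weight: &[Vec<f32>]) -> Vec<f32> {
    weight
        .iter()
        .map(|row| row.iter().zip(hidden).map(|(w, h)| w * h).sum())
        .collect()
}

fn main() {
    // Hypothetical small shapes standing in for hidden_size = 1024, num_labels = 2048.
    let hidden_size = 4;
    let num_labels = 8;
    let hidden = vec![0.5_f32; hidden_size];
    let weight = vec![vec![0.1_f32; hidden_size]; num_labels];

    let projected = project(&hidden, &weight);
    // The output dimension follows num_labels, not hidden_size.
    assert_eq!(projected.len(), num_labels);
    println!("projected dimension: {}", projected.len());
}
```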