
Commit 2f1e3ee

Add bidirectional attention and projection layer support for voyage-4-nano
This change adds support for the voyageai/voyage-4-nano model, which is based on the Qwen3 architecture with two key modifications:

1. Bidirectional attention (is_causal=False) instead of causal attention
   - Added `use_bidirectional_attention` config field (default: false)
   - When true, skips causal masking in attention
2. Projection layer (1024 -> 2048 dimensions)
   - Added `num_labels` config field for the output projection dimension
   - When set, loads "linear.weight" and applies the projection after the final norm

The voyage-4-nano config.json includes:

    "use_bidirectional_attention": true
    "num_labels": 2048

Both the flash (CUDA) and non-flash implementations are updated.
1 parent cb9de7a commit 2f1e3ee
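
A quick way to read the two new config fields: a hypothetical helper (illustration only, not code from this commit) showing the output embedding dimension a client should expect.

```rust
// Hypothetical helper, illustration only: the output dimension implied by the
// new `num_labels` config field.
fn output_dim(hidden_size: usize, num_labels: Option<usize>) -> usize {
    // With `num_labels` set, the final-norm output (hidden_size wide) is
    // projected to `num_labels` dimensions; otherwise it is returned as-is.
    num_labels.unwrap_or(hidden_size)
}

fn main() {
    assert_eq!(output_dim(1024, Some(2048)), 2048); // voyage-4-nano
    assert_eq!(output_dim(1024, None), 1024); // stock Qwen3 embedding model
}
```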

File tree

2 files changed (+89, -3 lines)


backends/candle/src/models/flash_qwen3.rs

Lines changed: 43 additions & 2 deletions
@@ -109,6 +109,7 @@ impl Qwen3Attention {
         cos: &Tensor,
         sin: &Tensor,
         max_s: usize,
+        causal: bool, // voyage-4-nano: false for bidirectional attention
     ) -> Result<Tensor> {
         let _enter = self.span.enter();
 
@@ -158,7 +159,7 @@ impl Qwen3Attention {
             max_s,
             max_s,
             self.softmax_scale,
-            true,
+            causal, // voyage-4-nano: configurable causal flag
             None,
             None,
         )?;
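
What the new `causal` argument controls, as a toy sketch (plain Rust, not the flash-attention kernel): the set of key positions a query may attend to.

```rust
// Toy sketch only: key positions visible to the query at position `q` in a
// sequence of length `n`, with and without causal masking.
fn allowed_keys(q: usize, n: usize, causal: bool) -> Vec<usize> {
    (0..n).filter(|&k| !causal || k <= q).collect()
}

fn main() {
    // causal = true (the previously hard-coded behaviour): token 1 sees keys 0..=1.
    assert_eq!(allowed_keys(1, 4, true), vec![0, 1]);
    // causal = false (use_bidirectional_attention): token 1 sees all keys.
    assert_eq!(allowed_keys(1, 4, false), vec![0, 1, 2, 3]);
}
```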
@@ -262,14 +263,15 @@ impl Qwen3Layer {
         cos: &Tensor,
         sin: &Tensor,
         max_s: usize,
+        causal: bool, // voyage-4-nano: false for bidirectional attention
     ) -> Result<(Tensor, Tensor)> {
         let _enter = self.span.enter();
 
         let (normed_hidden_states, res) = self.input_layer_norm.forward(hidden_states, residual)?;
 
         let attn_output =
             self.attention
-                .forward(&normed_hidden_states, cu_seqlens, cos, sin, max_s)?;
+                .forward(&normed_hidden_states, cu_seqlens, cos, sin, max_s, causal)?;
 
         let (normed_attn_res_output, attn_res) = self
             .post_attention_layer_norm
@@ -285,9 +287,11 @@ pub struct FlashQwen3Model {
     embeddings: Embedding,
     layers: Vec<Qwen3Layer>,
     norm: RMSNorm,
+    projection: Option<Linear>, // voyage-4-nano: 1024 -> 2048 projection
     cos_cache: Tensor,
     sin_cache: Tensor,
     pool: Pool,
+    use_bidirectional_attention: bool, // voyage-4-nano: skip causal masking
     pub device: Device,
 
     span: tracing::Span,
@@ -331,6 +335,30 @@ impl FlashQwen3Model {
 
         let norm = RMSNorm::load(vb.pp("norm"), config.hidden_size, config.rms_norm_eps)?;
 
+        // voyage-4-nano: load projection layer if num_labels is set
+        // The projection transforms hidden_size (1024) to num_labels (2048)
+        let projection = if let Some(num_labels) = config.num_labels {
+            // Try to load from the model root (voyage-4-nano uses "linear.weight")
+            let vb_root = if vb.contains_tensor("linear.weight") {
+                vb.clone()
+            } else {
+                // Also check under "model" prefix for reranker-style models
+                vb.pp("..") // go up one level if we're already in "model"
+            };
+
+            if vb_root.contains_tensor("linear.weight") {
+                let projection_weight = vb_root.get((num_labels, config.hidden_size), "linear.weight")?;
+                Some(Linear::new(projection_weight, None, None))
+            } else {
+                tracing::warn!("num_labels is set but linear.weight not found, skipping projection layer");
+                None
+            }
+        } else {
+            None
+        };
+
+        let use_bidirectional_attention = config.use_bidirectional_attention.unwrap_or(false);
+
         let inv_freqs = get_inv_freqs(
             layers[0].attention.attention_head_size,
             config.rope_theta,
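
Shape-wise, "linear.weight" is loaded as [num_labels, hidden_size], so the projection is effectively a matmul against its transpose. A standalone sketch, assuming candle_core as a dependency (the model itself goes through its own Linear wrapper):

```rust
use candle_core::{DType, Device, Result, Tensor};

fn main() -> Result<()> {
    let device = Device::Cpu;
    // Stand-in for the final-norm output of 4 tokens: [n_tokens, hidden_size].
    let hidden = Tensor::zeros((4, 1024), DType::F32, &device)?;
    // "linear.weight" as loaded above: [num_labels, hidden_size] = [2048, 1024].
    let weight = Tensor::zeros((2048, 1024), DType::F32, &device)?;
    // hidden @ weight^T -> [n_tokens, num_labels].
    let projected = hidden.matmul(&weight.t()?)?;
    assert_eq!(projected.dims(), &[4, 2048]);
    Ok(())
}
```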
@@ -348,9 +376,11 @@ impl FlashQwen3Model {
             embeddings,
             layers,
             norm,
+            projection,
             cos_cache,
             sin_cache,
             pool,
+            use_bidirectional_attention,
             device: vb.device().clone(),
             span: tracing::span!(tracing::Level::TRACE, "model"),
         })
@@ -376,6 +406,9 @@ impl FlashQwen3Model {
         let cos = index_select(&self.cos_cache, &position_ids, 0)?;
         let sin = index_select(&self.sin_cache, &position_ids, 0)?;
 
+        // voyage-4-nano: use bidirectional attention (causal=false) if configured
+        let causal = !self.use_bidirectional_attention;
+
         let mut residual = None;
         for layer in &self.layers {
             let (h, r) = layer.forward(
@@ -385,13 +418,21 @@ impl FlashQwen3Model {
                 &cos,
                 &sin,
                 batch.max_length as usize,
+                causal,
             )?;
             hidden_states = h;
             residual = Some(r);
         }
 
         let (outputs, _) = self.norm.forward(&hidden_states, residual.as_ref())?;
 
+        // voyage-4-nano: apply projection layer if present (1024 -> 2048)
+        let outputs = if let Some(ref projection) = self.projection {
+            projection.forward(&outputs)?
+        } else {
+            outputs
+        };
+
         let has_pooling_requests = !batch.pooled_indices.is_empty();
         let has_raw_requests = !batch.raw_indices.is_empty();
 
backends/candle/src/models/qwen3.rs

Lines changed: 46 additions & 1 deletion
@@ -24,6 +24,12 @@ pub struct Qwen3Config {
     pub sliding_window: Option<usize>,
     pub use_sliding_window: bool,
     pub eos_token_id: usize,
+    // voyage-4-nano support: bidirectional attention (is_causal=False)
+    #[serde(default)]
+    pub use_bidirectional_attention: Option<bool>,
+    // voyage-4-nano support: projection layer output dimension (1024 -> 2048)
+    #[serde(default)]
+    pub num_labels: Option<usize>,
 }
 
 struct Qwen3Attention {
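
A minimal deserialization sketch of the two new fields (struct pared down for illustration; assumes serde and serde_json as dependencies). Existing Qwen3 configs that lack the keys still parse, with both fields defaulting to None, so their behaviour is unchanged.

```rust
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct MiniQwen3Config {
    hidden_size: usize,
    #[serde(default)]
    use_bidirectional_attention: Option<bool>,
    #[serde(default)]
    num_labels: Option<usize>,
}

fn main() -> serde_json::Result<()> {
    // voyage-4-nano style config: both new fields present.
    let voyage: MiniQwen3Config = serde_json::from_str(
        r#"{ "hidden_size": 1024, "use_bidirectional_attention": true, "num_labels": 2048 }"#,
    )?;
    assert_eq!(voyage.use_bidirectional_attention, Some(true));
    assert_eq!(voyage.num_labels, Some(2048));

    // Stock Qwen3 config: keys absent, fields default to None.
    let stock: MiniQwen3Config = serde_json::from_str(r#"{ "hidden_size": 1024 }"#)?;
    assert_eq!(stock.use_bidirectional_attention, None);
    assert_eq!(stock.num_labels, None);
    Ok(())
}
```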
@@ -379,11 +385,13 @@ pub struct Qwen3Model {
     embeddings: Embedding,
     layers: Vec<Qwen3Layer>,
     norm: RMSNorm,
+    projection: Option<Linear>, // voyage-4-nano: 1024 -> 2048 projection
     rotary_cache: (Tensor, Tensor),
     rotary_dim: usize,
     pool: Pool,
     num_attention_heads: usize,
     pad_token_id: u32,
+    use_bidirectional_attention: bool, // voyage-4-nano: skip causal masking
 
     dtype: DType,
     device: Device,
@@ -420,6 +428,30 @@ impl Qwen3Model {
 
         let norm = RMSNorm::load(vb.pp("norm"), config.hidden_size, config.rms_norm_eps)?;
 
+        // voyage-4-nano: load projection layer if num_labels is set
+        // The projection transforms hidden_size (1024) to num_labels (2048)
+        let projection = if let Some(num_labels) = config.num_labels {
+            // Try to load from the model root (voyage-4-nano uses "linear.weight")
+            let vb_root = if vb.contains_tensor("linear.weight") {
+                vb.clone()
+            } else {
+                // Also check under "model" prefix for reranker-style models
+                vb.pp("..") // go up one level if we're already in "model"
+            };
+
+            if vb_root.contains_tensor("linear.weight") {
+                let projection_weight = vb_root.get((num_labels, config.hidden_size), "linear.weight")?;
+                Some(Linear::new(projection_weight, None, None))
+            } else {
+                tracing::warn!("num_labels is set but linear.weight not found, skipping projection layer");
+                None
+            }
+        } else {
+            None
+        };
+
+        let use_bidirectional_attention = config.use_bidirectional_attention.unwrap_or(false);
+
         let rotary_dim = config
             .head_dim
             .unwrap_or(config.hidden_size / config.num_attention_heads);
@@ -433,11 +465,13 @@ impl Qwen3Model {
             embeddings,
             layers,
             norm,
+            projection,
             rotary_cache,
             rotary_dim,
             pool,
             pad_token_id: config.eos_token_id as u32,
             num_attention_heads: config.num_attention_heads,
+            use_bidirectional_attention,
             dtype: vb.dtype(),
             device: vb.device().clone(),
             span: tracing::span!(tracing::Level::TRACE, "model"),
@@ -555,7 +589,11 @@ impl Qwen3Model {
             (input_ids, position_ids, input_lengths, Some(attention_bias))
         };
 
-        let attention_bias = if let Some(attn_bias) = attention_bias {
+        // voyage-4-nano: skip causal masking when using bidirectional attention
+        let attention_bias = if self.use_bidirectional_attention {
+            // Bidirectional attention: only use padding mask (no causal mask)
+            attention_bias
+        } else if let Some(attn_bias) = attention_bias {
             Some(self.get_causal_attention_bias(attn_bias)?)
         } else {
             None
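
The effect on the bias in this non-flash path, as a simplified sketch in plain Rust (not the actual candle tensor code): both modes keep the padding term, and only the causal mode adds the future-token term.

```rust
// Simplified sketch: additive attention bias for one padded sequence.
// `seq_len` counts real tokens; positions >= seq_len are padding.
fn attention_bias(max_len: usize, seq_len: usize, causal: bool) -> Vec<Vec<f32>> {
    let mut bias = vec![vec![0.0f32; max_len]; max_len];
    for q in 0..max_len {
        for k in 0..max_len {
            let padded = k >= seq_len;
            let future = causal && k > q;
            if padded || future {
                bias[q][k] = f32::NEG_INFINITY;
            }
        }
    }
    bias
}

fn main() {
    // Bidirectional: only the padding position is masked for the first query.
    assert_eq!(attention_bias(4, 3, false)[0], vec![0.0, 0.0, 0.0, f32::NEG_INFINITY]);
    // Causal: future positions are masked as well.
    assert_eq!(
        attention_bias(4, 3, true)[0],
        vec![0.0, f32::NEG_INFINITY, f32::NEG_INFINITY, f32::NEG_INFINITY]
    );
}
```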
@@ -581,6 +619,13 @@ impl Qwen3Model {
 
         let (outputs, _) = self.norm.forward(&hidden_states, None)?;
 
+        // voyage-4-nano: apply projection layer if present (1024 -> 2048)
+        let outputs = if let Some(ref projection) = self.projection {
+            projection.forward(&outputs)?
+        } else {
+            outputs
+        };
+
         let has_pooling_requests = !batch.pooled_indices.is_empty();
         let has_raw_requests = !batch.raw_indices.is_empty();
 