feat: support qwen3 reranker

sigridjineth · sigridjineth · commit c30aebc5d958 · 2025-08-11T01:31:41.000+09:00
diff --git a/backends/candle/src/models/flash_qwen3.rs b/backends/candle/src/models/flash_qwen3.rs
@@ -281,13 +281,97 @@ impl Qwen3Layer {
     }
 }
 
+/// Classification-head interface, defined locally (following the TEI pattern): hidden states in, output tensor out.
+trait ClassificationHead {
+    fn forward(&self, hidden_states: &Tensor) -> Result<Tensor>;
+}
+
+/// Qwen3 reranker head: `dense` (hidden -> hidden), `activation`, then `out_proj` (hidden -> 1).
+#[derive(Debug)]
+struct Qwen3ClassificationHead {
+    dense: Linear,
+    out_proj: Linear,
+    activation: HiddenAct,
+    span: tracing::Span,
+}
+
+impl Qwen3ClassificationHead {
+    pub fn load(vb: VarBuilder, config: &Qwen3Config) -> Result<Self> {
+        let (dense, out_proj) = if vb.contains_tensor("score.dense.weight") {
+            tracing::info!("Loading Qwen3 classifier with score layers");
+            // Dense layer: weight is (hidden_size, hidden_size), bias is (hidden_size).
+            let dense_weight = vb
+                .pp("score.dense")
+                .get((config.hidden_size, config.hidden_size), "weight")?;
+            let dense_bias = vb.pp("score.dense").get(config.hidden_size, "bias")?;
+            let dense = Linear::new(dense_weight, Some(dense_bias), None);
+            // Output projection to a single value: weight is (1, hidden_size), bias is (1).
+            let out_proj_weight = vb
+                .pp("score.out_proj")
+                .get((1, config.hidden_size), "weight")?;
+            let out_proj_bias = vb.pp("score.out_proj").get(1, "bias")?;
+            let out_proj = Linear::new(out_proj_weight, Some(out_proj_bias), None);
+
+            (dense, out_proj)
+        } else if vb.contains_tensor("classifier.dense.weight") {
+            tracing::info!("Loading Qwen3 classifier with classifier layers");
+            // Same tensor shapes as the `score.*` branch, read from the `classifier.*` prefix.
+            let dense_weight = vb
+                .pp("classifier.dense")
+                .get((config.hidden_size, config.hidden_size), "weight")?;
+            let dense_bias = vb.pp("classifier.dense").get(config.hidden_size, "bias")?;
+            let dense = Linear::new(dense_weight, Some(dense_bias), None);
+
+            let out_proj_weight = vb
+                .pp("classifier.out_proj")
+                .get((1, config.hidden_size), "weight")?;
+            let out_proj_bias = vb.pp("classifier.out_proj").get(1, "bias")?;
+            let out_proj = Linear::new(out_proj_weight, Some(out_proj_bias), None);
+
+            (dense, out_proj)
+        } else {
+            candle::bail!(
+                "Classification layers not found in model weights. \
+                Expected 'score.dense.weight' or 'classifier.dense.weight' for reranker models. \
+                This model may not be a trained reranker."
+            );
+        };
+        // `activation` is taken from the model config; `span` is a TRACE-level span named "classifier".
+        Ok(Self {
+            dense,
+            out_proj,
+            activation: config.hidden_act.clone(),
+            span: tracing::span!(tracing::Level::TRACE, "classifier"),
+        })
+    }
+}
+
+impl ClassificationHead for Qwen3ClassificationHead {
+    fn forward(&self, hidden_states: &Tensor) -> Result<Tensor> {
+        let _enter = self.span.enter();
+
+        // NOTE(review): assumes `hidden_states` is already pooled by the caller; nothing here checks that.
+
+        // Dense projection followed by the activation configured in `Qwen3Config::hidden_act`.
+        let hidden = self.dense.forward(hidden_states)?;
+        let hidden = self.activation.forward(&hidden)?;
+
+        // Project to a single value along the last dimension (out_proj weight is (1, hidden_size)).
+        let score = self.out_proj.forward(&hidden)?;
+        // NOTE(review): the result has one fewer dimension than the input; confirm callers expect that rank.
+        // Remove the trailing size-1 dimension produced by `out_proj`.
+        score.squeeze(candle::D::Minus1)
+    }
+}
+
 pub struct FlashQwen3Model {
     embeddings: Embedding,
     layers: Vec<Qwen3Layer>,
     norm: RMSNorm,
     cos_cache: Tensor,
     sin_cache: Tensor,
     pool: Pool,
+    classifier: Option<Box<dyn ClassificationHead + Send>>,
     pub device: Device,
 
     span: tracing::Span,
@@ -304,11 +388,19 @@ impl FlashQwen3Model {
             candle::bail!("FlashQwen3 requires DType::F16")
         }
 
-        let pool = match model_type {
+        let (pool, classifier) = match model_type {
             ModelType::Classifier => {
-                candle::bail!("`classifier` model type is not supported for Qwen3")
+                let pool = Pool::LastToken;
+                let classifier: Box<dyn ClassificationHead + Send> =
+                    Box::new(Qwen3ClassificationHead::load(vb.clone(), config)?);
+                (pool, Some(classifier))
+            }
+            ModelType::Embedding(pool) => {
+                if pool == Pool::Splade {
+                    candle::bail!("`splade` is not supported for Qwen3")
+                }
+                (pool, None)
             }
-            ModelType::Embedding(pool) => pool,
         };
 
         // The Qwen3-Reranker models contain the `model` key
@@ -351,6 +443,7 @@ impl FlashQwen3Model {
             cos_cache,
             sin_cache,
             pool,
+            classifier,
             device: vb.device().clone(),
             span: tracing::span!(tracing::Level::TRACE, "model"),
         })
@@ -512,4 +605,23 @@ impl Model for FlashQwen3Model {
     fn embed(&self, batch: Batch) -> Result<(Option<Tensor>, Option<Tensor>)> {
         self.forward(batch)
     }
+
+    fn predict(&self, batch: Batch) -> Result<Tensor> {
+        match &self.classifier {
+            None => candle::bail!("`predict` is not implemented for this model"),
+            Some(classifier) => {
+                // Only the first element of `forward`'s tuple (the pooled output) is used; the second is ignored.
+                let (pooled_embeddings, _) = self.forward(batch)?;
+                match pooled_embeddings {
+                    Some(embeddings) => {
+                        let scores = classifier.forward(&embeddings)?;
+                        // Sigmoid maps each score into (0, 1). NOTE(review): confirm callers do not apply it again.
+                        let probabilities = candle_nn::ops::sigmoid(&scores)?;
+                        Ok(probabilities)
+                    }
+                    None => candle::bail!("No pooled embeddings returned for classification"),
+                }
+            }
+        }
+    }
 }
diff --git a/backends/candle/src/models/qwen3.rs b/backends/candle/src/models/qwen3.rs
@@ -375,13 +375,89 @@ impl Qwen3Layer {
     }
 }
 
+trait ClassificationHead {
+    fn forward(&self, hidden_states: &Tensor) -> Result<Tensor>;
+}
+
+#[derive(Debug)]
+struct Qwen3ClassificationHead {
+    dense: Linear,
+    out_proj: Linear,
+    activation: HiddenAct,
+    span: tracing::Span,
+}
+
+impl Qwen3ClassificationHead {
+    pub fn load(vb: VarBuilder, config: &Qwen3Config) -> Result<Self> {
+        let (dense, out_proj) = if vb.contains_tensor("score.dense.weight") {
+            tracing::info!("Loading Qwen3 classifier with score layers");
+            // Dense layer: weight is (hidden_size, hidden_size), bias is (hidden_size).
+            let dense_weight = vb
+                .pp("score.dense")
+                .get((config.hidden_size, config.hidden_size), "weight")?;
+            let dense_bias = vb.pp("score.dense").get(config.hidden_size, "bias")?;
+            let dense = Linear::new(dense_weight, Some(dense_bias), None);
+            // Output projection to a single value: weight is (1, hidden_size), bias is (1).
+            let out_proj_weight = vb
+                .pp("score.out_proj")
+                .get((1, config.hidden_size), "weight")?;
+            let out_proj_bias = vb.pp("score.out_proj").get(1, "bias")?;
+            let out_proj = Linear::new(out_proj_weight, Some(out_proj_bias), None);
+
+            (dense, out_proj)
+        } else if vb.contains_tensor("classifier.dense.weight") {
+            tracing::info!("Loading Qwen3 classifier with classifier layers");
+            // Same tensor shapes as the `score.*` branch, read from the `classifier.*` prefix.
+            let dense_weight = vb
+                .pp("classifier.dense")
+                .get((config.hidden_size, config.hidden_size), "weight")?;
+            let dense_bias = vb.pp("classifier.dense").get(config.hidden_size, "bias")?;
+            let dense = Linear::new(dense_weight, Some(dense_bias), None);
+
+            let out_proj_weight = vb
+                .pp("classifier.out_proj")
+                .get((1, config.hidden_size), "weight")?;
+            let out_proj_bias = vb.pp("classifier.out_proj").get(1, "bias")?;
+            let out_proj = Linear::new(out_proj_weight, Some(out_proj_bias), None);
+
+            (dense, out_proj)
+        } else {
+            candle::bail!(
+                "Classification layers not found in model weights. \
+                Expected 'score.dense.weight' or 'classifier.dense.weight' for reranker models. \
+                This model may not be a trained reranker."
+            );
+        };
+        // `activation` is taken from the model config; `span` is a TRACE-level span named "classifier".
+        Ok(Self {
+            dense,
+            out_proj,
+            activation: config.hidden_act.clone(),
+            span: tracing::span!(tracing::Level::TRACE, "classifier"),
+        })
+    }
+}
+
+impl ClassificationHead for Qwen3ClassificationHead {
+    fn forward(&self, hidden_states: &Tensor) -> Result<Tensor> {
+        let _enter = self.span.enter();
+        // dense -> activation -> out_proj; `out_proj` weights are loaded with shape (1, hidden_size).
+        let hidden = self.dense.forward(hidden_states)?;
+        let hidden = self.activation.forward(&hidden)?;
+        let score = self.out_proj.forward(&hidden)?;
+        // Remove the trailing size-1 dimension produced by `out_proj`.
+        score.squeeze(D::Minus1)
+    }
+}
+
 pub struct Qwen3Model {
     embeddings: Embedding,
     layers: Vec<Qwen3Layer>,
     norm: RMSNorm,
     rotary_cache: (Tensor, Tensor),
     rotary_dim: usize,
     pool: Pool,
+    classifier: Option<Box<dyn ClassificationHead + Send>>,
     num_attention_heads: usize,
     pad_token_id: u32,
 
@@ -393,11 +469,19 @@ pub struct Qwen3Model {
 
 impl Qwen3Model {
     pub fn load(vb: VarBuilder, config: &Qwen3Config, model_type: ModelType) -> Result<Self> {
-        let pool = match model_type {
+        let (pool, classifier) = match model_type {
             ModelType::Classifier => {
-                candle::bail!("`classifier` model type is not supported for Qwen3")
+                let pool = Pool::LastToken;
+                let classifier: Box<dyn ClassificationHead + Send> =
+                    Box::new(Qwen3ClassificationHead::load(vb.clone(), config)?);
+                (pool, Some(classifier))
+            }
+            ModelType::Embedding(pool) => {
+                if pool == Pool::Splade {
+                    candle::bail!("`splade` is not supported for Qwen3")
+                }
+                (pool, None)
             }
-            ModelType::Embedding(pool) => pool,
         };
 
         // The Qwen3-Reranker models contain the `model` key
@@ -436,6 +520,7 @@ impl Qwen3Model {
             rotary_cache,
             rotary_dim,
             pool,
+            classifier,
             pad_token_id: config.eos_token_id as u32,
             num_attention_heads: config.num_attention_heads,
             dtype: vb.dtype(),
@@ -700,4 +785,23 @@ impl Model for Qwen3Model {
     fn embed(&self, batch: Batch) -> Result<(Option<Tensor>, Option<Tensor>)> {
         self.forward(batch)
     }
+
+    fn predict(&self, batch: Batch) -> Result<Tensor> {
+        match &self.classifier {
+            None => candle::bail!("`predict` is not implemented for this model"),
+            Some(classifier) => {
+                // Only the first element of `forward`'s tuple (the pooled output) is used; the second is ignored.
+                let (pooled_embeddings, _) = self.forward(batch)?;
+                match pooled_embeddings {
+                    Some(embeddings) => {
+                        let scores = classifier.forward(&embeddings)?;
+                        // Sigmoid maps each score into (0, 1). NOTE(review): confirm callers do not apply it again.
+                        let probabilities = candle_nn::ops::sigmoid(&scores)?;
+                        Ok(probabilities)
+                    }
+                    None => candle::bail!("No pooled embeddings returned for classification"),
+                }
+            }
+        }
+    }
 }
diff --git a/backends/src/lib.rs b/backends/src/lib.rs
@@ -179,7 +179,12 @@ impl Backend {
     }
 
     #[instrument(skip_all)]
-    pub fn create_warmup_batch(&self, shape: (u32, u32), max_token: u32, seq_bucket_size: u32) -> Batch {
+    pub fn create_warmup_batch(
+        &self,
+        shape: (u32, u32),
+        max_token: u32,
+        seq_bucket_size: u32,
+    ) -> Batch {
         let (batch_size, length) = shape;
         let min_length = length.saturating_sub(seq_bucket_size).saturating_add(1);
         let tmp_length = if min_length < length {
diff --git a/router/src/lib.rs b/router/src/lib.rs
@@ -110,7 +110,7 @@ pub async fn run(
         serde_json::from_str(&config).context("Failed to parse `config.json`")?;
 
     // Set model type from config
-    let backend_model_type = get_backend_model_type(&config, &model_root, pooling)?;
+    let backend_model_type = get_backend_model_type(&config, &model_root, &model_id, pooling)?;
 
     // Info model type
     let model_type = match &backend_model_type {
@@ -355,6 +355,7 @@ pub async fn run(
 fn get_backend_model_type(
     config: &ModelConfig,
     model_root: &Path,
+    model_id: &str,
     pooling: Option<text_embeddings_backend::Pool>,
 ) -> Result<text_embeddings_backend::ModelType> {
     for arch in &config.architectures {
@@ -381,6 +382,29 @@ fn get_backend_model_type(
         }
     }
 
+    // Qwen3-Reranker detection (heuristic): `Qwen3ForCausalLM` whose model id's last path segment contains "reranker".
+    if config
+        .architectures
+        .iter()
+        .any(|arch| arch == "Qwen3ForCausalLM")
+    {
+        let model_name = model_id
+            .split('/')
+            .last()
+            .unwrap_or(model_id)
+            .to_lowercase();
+
+        if model_name.contains("reranker") {
+            tracing::info!("Detected Qwen3-Reranker model, treating as classifier");
+            if pooling.is_some() {
+                tracing::warn!(
+                    "`--pooling` arg is set but model is a reranker. Ignoring `--pooling` arg."
+                );
+            }
+            return Ok(text_embeddings_backend::ModelType::Classifier);
+        }
+    }
+
     if Some(text_embeddings_backend::Pool::Splade) == pooling {
         return Err(anyhow!(
             "Splade pooling is not supported: model is not a ForMaskedLM model"