Commit baa0822

fix: unit test and model download from huggingface
Signed-off-by: OneZero-Y <[email protected]>
1 parent ad44eff commit baa0822

File tree

5 files changed: +451 -216 lines changed


Makefile

Lines changed: 9 additions & 9 deletions

@@ -257,39 +257,39 @@ download-models:
 	fi
 
 	@if [ ! -d "lora_intent_classifier_bert-base-uncased_model" ]; then \
-		hf download OneZero-Y/lora_intent_classifier_bert-base-uncased_model --local-dir models/lora_intent_classifier_bert-base-uncased_model; \
+		hf download LLM-Semantic-Router/lora_intent_classifier_bert-base-uncased_model --local-dir models/lora_intent_classifier_bert-base-uncased_model; \
 	fi
 
 	@if [ ! -d "models/lora_intent_classifier_roberta-base_model" ]; then \
-		hf download OneZero-Y/lora_intent_classifier_roberta-base_model --local-dir models/lora_intent_classifier_roberta-base_model; \
+		hf download LLM-Semantic-Router/lora_intent_classifier_roberta-base_model --local-dir models/lora_intent_classifier_roberta-base_model; \
 	fi
 
 	@if [ ! -d "models/lora_intent_classifier_modernbert-base_model" ]; then \
-		hf download OneZero-Y/lora_intent_classifier_modernbert-base_model --local-dir models/lora_intent_classifier_modernbert-base_model; \
+		hf download LLM-Semantic-Router/lora_intent_classifier_modernbert-base_model --local-dir models/lora_intent_classifier_modernbert-base_model; \
 	fi
 
 	@if [ ! -d "models/lora_pii_detector_bert-base-uncased_model" ]; then \
-		hf download OneZero-Y/lora_pii_detector_bert-base-uncased_model --local-dir models/lora_pii_detector_bert-base-uncased_model; \
+		hf download LLM-Semantic-Router/lora_pii_detector_bert-base-uncased_model --local-dir models/lora_pii_detector_bert-base-uncased_model; \
 	fi
 
 	@if [ ! -d "models/lora_pii_detector_roberta-base_model" ]; then \
-		hf download OneZero-Y/lora_pii_detector_roberta-base_model --local-dir models/lora_pii_detector_roberta-base_model; \
+		hf download LLM-Semantic-Router/lora_pii_detector_roberta-base_model --local-dir models/lora_pii_detector_roberta-base_model; \
 	fi
 
 	@if [ ! -d "models/lora_pii_detector_modernbert-base_model" ]; then \
-		hf download OneZero-Y/lora_pii_detector_modernbert-base_model --local-dir models/lora_pii_detector_modernbert-base_model; \
+		hf download LLM-Semantic-Router/lora_pii_detector_modernbert-base_model --local-dir models/lora_pii_detector_modernbert-base_model; \
 	fi
 
 	@if [ ! -d "models/lora_jailbreak_classifier_bert-base-uncased_model" ]; then \
-		hf download OneZero-Y/lora_jailbreak_classifier_bert-base-uncased_model --local-dir models/lora_jailbreak_classifier_bert-base-uncased_model; \
+		hf download LLM-Semantic-Router/lora_jailbreak_classifier_bert-base-uncased_model --local-dir models/lora_jailbreak_classifier_bert-base-uncased_model; \
 	fi
 
 	@if [ ! -d "models/lora_jailbreak_classifier_roberta-base_model" ]; then \
-		hf download OneZero-Y/lora_jailbreak_classifier_roberta-base_model --local-dir models/lora_jailbreak_classifier_roberta-base_model; \
+		hf download LLM-Semantic-Router/lora_jailbreak_classifier_roberta-base_model --local-dir models/lora_jailbreak_classifier_roberta-base_model; \
 	fi
 
 	@if [ ! -d "models/lora_jailbreak_classifier_modernbert-base_model" ]; then \
-		hf download OneZero-Y/lora_jailbreak_classifier_modernbert-base_model --local-dir models/lora_jailbreak_classifier_modernbert-base_model; \
+		hf download LLM-Semantic-Router/lora_jailbreak_classifier_modernbert-base_model --local-dir models/lora_jailbreak_classifier_modernbert-base_model; \
 	fi
 
 # Milvus container management
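
Each block only fetches a model when its local directory is missing, so repeated runs of the download target are idempotent; this commit just repoints every repo from OneZero-Y to the LLM-Semantic-Router organization. For readers scripting the same behavior outside make, here is a minimal Rust sketch of that guard-then-download pattern; ensure_model is hypothetical (not part of this commit) and shells out to the same hf download CLI the Makefile calls:

use std::path::Path;
use std::process::Command;

// Hypothetical helper mirroring the Makefile logic: download a Hugging Face
// repo into local_dir only if that directory does not exist yet.
fn ensure_model(repo: &str, local_dir: &str) -> std::io::Result<()> {
    if !Path::new(local_dir).is_dir() {
        let status = Command::new("hf")
            .args(["download", repo, "--local-dir", local_dir])
            .status()?;
        if !status.success() {
            return Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                format!("hf download failed for {repo}"),
            ));
        }
    }
    Ok(())
}

fn main() -> std::io::Result<()> {
    ensure_model(
        "LLM-Semantic-Router/lora_intent_classifier_bert-base-uncased_model",
        "models/lora_intent_classifier_bert-base-uncased_model",
    )
}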

candle-binding/src/bert_official.rs

Lines changed: 205 additions & 66 deletions

@@ -18,6 +18,48 @@ pub struct CandleBertClassifier {
 }
 
 impl CandleBertClassifier {
+    /// Shared helper method for efficient batch tensor creation
+    fn create_batch_tensors(
+        &self,
+        texts: &[&str],
+    ) -> Result<(Tensor, Tensor, Tensor, Vec<tokenizers::Encoding>)> {
+        let encodings = self
+            .tokenizer
+            .encode_batch(texts.to_vec(), true)
+            .map_err(E::msg)?;
+
+        let batch_size = texts.len();
+        let max_len = encodings
+            .iter()
+            .map(|enc| enc.get_ids().len())
+            .max()
+            .unwrap_or(0);
+
+        let total_elements = batch_size * max_len;
+        let mut all_token_ids = Vec::with_capacity(total_elements);
+        let mut all_attention_masks = Vec::with_capacity(total_elements);
+
+        for encoding in &encodings {
+            let token_ids = encoding.get_ids();
+            let attention_mask = encoding.get_attention_mask();
+
+            all_token_ids.extend_from_slice(token_ids);
+            all_attention_masks.extend_from_slice(attention_mask);
+
+            let padding_needed = max_len - token_ids.len();
+            all_token_ids.extend(std::iter::repeat(0).take(padding_needed));
+            all_attention_masks.extend(std::iter::repeat(0).take(padding_needed));
+        }
+
+        let token_ids =
+            Tensor::new(all_token_ids.as_slice(), &self.device)?.reshape(&[batch_size, max_len])?;
+        let attention_mask = Tensor::new(all_attention_masks.as_slice(), &self.device)?
+            .reshape(&[batch_size, max_len])?;
+        let token_type_ids = Tensor::zeros(&[batch_size, max_len], DType::U32, &self.device)?;
+
+        Ok((token_ids, attention_mask, token_type_ids, encodings))
+    }
+
     pub fn new(model_path: &str, num_classes: usize, use_cpu: bool) -> Result<Self> {
         let device = if use_cpu {
             Device::Cpu
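
The new helper right-pads every sequence to the longest encoding in the batch, using token id 0 and attention mask 0 on the padded tail, then reshapes the two flat buffers into (batch_size, max_len) tensors. A dependency-free sketch of just that padding step (pad_batch is illustrative, not part of the commit; the real helper takes the attention mask from the tokenizer rather than assuming all ones):

// Flatten ragged per-text token ids into row-major (batch_size x max_len)
// buffers, padding shorter rows with id 0 and attention mask 0.
fn pad_batch(token_ids: &[Vec<u32>]) -> (Vec<u32>, Vec<u32>, usize) {
    let max_len = token_ids.iter().map(|t| t.len()).max().unwrap_or(0);
    let mut ids = Vec::with_capacity(token_ids.len() * max_len);
    let mut mask = Vec::with_capacity(token_ids.len() * max_len);
    for t in token_ids {
        ids.extend_from_slice(t);
        mask.extend(std::iter::repeat(1).take(t.len()));
        // Tail padding so every row has exactly max_len elements.
        ids.extend(std::iter::repeat(0).take(max_len - t.len()));
        mask.extend(std::iter::repeat(0).take(max_len - t.len()));
    }
    (ids, mask, max_len)
}

fn main() {
    // Two encodings of 3 and 5 tokens flatten into a 2 x 5 batch.
    let batch = vec![vec![101, 7592, 102], vec![101, 7592, 2088, 999, 102]];
    let (ids, mask, max_len) = pad_batch(&batch);
    assert_eq!(max_len, 5);
    assert_eq!(ids, vec![101, 7592, 102, 0, 0, 101, 7592, 2088, 999, 102]);
    assert_eq!(mask, vec![1, 1, 1, 0, 0, 1, 1, 1, 1, 1]);
}

With candle in scope, those two flat vectors map directly onto the Tensor::new(...).reshape(&[batch_size, max_len]) calls in the diff.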
@@ -137,6 +179,47 @@ impl CandleBertClassifier {
 
         Ok((predicted_class, confidence))
     }
+
+    /// True batch processing for multiple texts - significant performance improvement
+    pub fn classify_batch(&self, texts: &[&str]) -> Result<Vec<(usize, f32)>> {
+        if texts.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        // OPTIMIZATION: Use shared tensor creation method
+        let (token_ids, attention_mask, token_type_ids, _encodings) =
+            self.create_batch_tensors(texts)?;
+
+        // Batch BERT forward pass
+        let sequence_output =
+            self.bert
+                .forward(&token_ids, &token_type_ids, Some(&attention_mask))?;
+
+        // OPTIMIZATION: Use proper CLS token pooling instead of mean pooling
+        let cls_tokens = sequence_output.i((.., 0))?; // Extract CLS tokens for all samples
+        let pooled_output = self.pooler.forward(&cls_tokens)?;
+        let pooled_output = pooled_output.tanh()?;
+
+        let logits = self.classifier.forward(&pooled_output)?;
+        let probabilities = candle_nn::ops::softmax(&logits, 1)?;
+
+        // OPTIMIZATION: Batch result extraction
+        let probs_data = probabilities.to_vec2::<f32>()?;
+        let mut results = Vec::with_capacity(texts.len());
+
+        for row in probs_data {
+            let (predicted_class, confidence) = row
+                .iter()
+                .enumerate()
+                .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
+                .map(|(idx, &conf)| (idx, conf))
+                .unwrap_or((0, 0.0));
+
+            results.push((predicted_class, confidence));
+        }
+
+        Ok(results)
+    }
 }
 
 /// BERT token classifier for PII detection
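
classify_batch tokenizes and runs the whole slice in a single forward pass, pools the per-sample [CLS] vectors through the trained pooler plus tanh, and takes the per-row softmax argmax. A hypothetical caller's view of the new API; the import path, model path, and class count below are assumptions, not taken from this commit:

use candle_binding::CandleBertClassifier; // assumed re-export location

fn main() -> anyhow::Result<()> {
    let classifier = CandleBertClassifier::new(
        "models/lora_intent_classifier_bert-base-uncased_model",
        14,   // placeholder: number of intent classes for the checkpoint
        true, // use_cpu
    )?;

    // One tokenization and one BERT forward pass for the whole slice,
    // instead of a per-text loop over the single-text method.
    let texts = ["what is the capital of France?", "solve x^2 - 4 = 0"];
    for (text, (class, conf)) in texts.iter().zip(classifier.classify_batch(&texts)?) {
        println!("{text:?} -> class {class} (confidence {conf:.3})");
    }
    Ok(())
}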
@@ -148,6 +231,48 @@ pub struct CandleBertTokenClassifier {
 }
 
 impl CandleBertTokenClassifier {
+    /// Shared helper method for efficient batch tensor creation
+    fn create_batch_tensors(
+        &self,
+        texts: &[&str],
+    ) -> Result<(Tensor, Tensor, Tensor, Vec<tokenizers::Encoding>)> {
+        let encodings = self
+            .tokenizer
+            .encode_batch(texts.to_vec(), true)
+            .map_err(E::msg)?;
+
+        let batch_size = texts.len();
+        let max_len = encodings
+            .iter()
+            .map(|enc| enc.get_ids().len())
+            .max()
+            .unwrap_or(0);
+
+        let total_elements = batch_size * max_len;
+        let mut all_token_ids = Vec::with_capacity(total_elements);
+        let mut all_attention_masks = Vec::with_capacity(total_elements);
+
+        for encoding in &encodings {
+            let token_ids = encoding.get_ids();
+            let attention_mask = encoding.get_attention_mask();
+
+            all_token_ids.extend_from_slice(token_ids);
+            all_attention_masks.extend_from_slice(attention_mask);
+
+            let padding_needed = max_len - token_ids.len();
+            all_token_ids.extend(std::iter::repeat(0).take(padding_needed));
+            all_attention_masks.extend(std::iter::repeat(0).take(padding_needed));
+        }
+
+        let token_ids =
+            Tensor::new(all_token_ids.as_slice(), &self.device)?.reshape(&[batch_size, max_len])?;
+        let attention_mask = Tensor::new(all_attention_masks.as_slice(), &self.device)?
+            .reshape(&[batch_size, max_len])?;
+        let token_type_ids = Tensor::zeros(&[batch_size, max_len], DType::U32, &self.device)?;
+
+        Ok((token_ids, attention_mask, token_type_ids, encodings))
+    }
+
     pub fn new(model_path: &str, num_classes: usize, use_cpu: bool) -> Result<Self> {
         let device = if use_cpu {
             Device::Cpu
@@ -208,95 +333,109 @@ impl CandleBertTokenClassifier {
         })
     }
 
-    pub fn classify_tokens(&self, text: &str) -> Result<Vec<(String, usize, f32)>> {
-        // Tokenize
-        let encoding = self.tokenizer.encode(text, true).map_err(E::msg)?;
-        let token_ids = encoding.get_ids().to_vec();
-        let attention_mask = encoding.get_attention_mask().to_vec();
-        let tokens = encoding.get_tokens();
-
-        // Create tensors
-        let token_ids = Tensor::new(&token_ids[..], &self.device)?.unsqueeze(0)?;
-        let token_type_ids = token_ids.zeros_like()?;
-        let attention_mask = Tensor::new(&attention_mask[..], &self.device)?.unsqueeze(0)?;
-
-        // Forward pass
-        let sequence_output =
-            self.bert
-                .forward(&token_ids, &token_type_ids, Some(&attention_mask))?;
-
-        // Apply token classifier to each token
-        let logits = self.classifier.forward(&sequence_output)?;
+    /// Helper method to extract entities from probabilities
+    fn extract_entities_from_probs(
+        &self,
+        probs: &Tensor,
+        tokens: &[String],
+        offsets: &[(usize, usize)],
+    ) -> Result<Vec<(String, usize, f32)>> {
+        let probs_vec = probs.to_vec2::<f32>()?;
+        let mut results = Vec::new();
 
-        // Get predictions for each token
-        let probabilities = candle_nn::ops::softmax(&logits, 2)?;
-        let probabilities = probabilities.squeeze(0)?;
-        let probabilities_vec = probabilities.to_vec2::<f32>()?;
+        for (token_idx, (token, token_probs)) in tokens.iter().zip(probs_vec.iter()).enumerate() {
+            if token_idx >= offsets.len() {
+                break;
+            }
 
-        let mut results = Vec::new();
-        for (token, probs) in tokens.iter().zip(probabilities_vec.iter()) {
-            let (predicted_class, &confidence) = probs
+            let (predicted_class, &confidence) = token_probs
                 .iter()
                 .enumerate()
                 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
-                .unwrap();
+                .unwrap_or((0, &0.0));
+
+            // Skip padding tokens and special tokens
+            if token.starts_with("[PAD]")
+                || token.starts_with("[CLS]")
+                || token.starts_with("[SEP]")
+            {
+                continue;
+            }
 
             results.push((token.clone(), predicted_class, confidence));
         }
 
         Ok(results)
     }
 
-    pub fn classify_tokens_with_spans(
-        &self,
-        text: &str,
-    ) -> Result<Vec<(String, usize, f32, usize, usize)>> {
-        // Tokenize with offset mapping
-        let encoding = self.tokenizer.encode(text, true).map_err(E::msg)?;
-        let token_ids = encoding.get_ids().to_vec();
-        let attention_mask = encoding.get_attention_mask().to_vec();
-        let tokens = encoding.get_tokens();
-        let offsets = encoding.get_offsets();
+    /// True batch processing for token classification - significant performance improvement
+    pub fn classify_tokens_batch(&self, texts: &[&str]) -> Result<Vec<Vec<(String, usize, f32)>>> {
+        if texts.is_empty() {
+            return Ok(Vec::new());
+        }
 
-        // Create tensors
-        let token_ids = Tensor::new(&token_ids[..], &self.device)?.unsqueeze(0)?;
-        let token_type_ids = token_ids.zeros_like()?;
-        let attention_mask = Tensor::new(&attention_mask[..], &self.device)?.unsqueeze(0)?;
+        // OPTIMIZATION: Use shared tensor creation method
+        let (token_ids, attention_mask, token_type_ids, encodings) =
+            self.create_batch_tensors(texts)?;
 
-        // Forward pass
+        // Batch BERT forward pass
         let sequence_output =
             self.bert
                 .forward(&token_ids, &token_type_ids, Some(&attention_mask))?;
 
-        // Apply token classifier to each token
-        let logits = self.classifier.forward(&sequence_output)?;
-
-        // Get predictions for each token
+        // Batch token classification
+        let logits = self.classifier.forward(&sequence_output)?; // (batch_size, seq_len, num_labels)
         let probabilities = candle_nn::ops::softmax(&logits, 2)?;
-        let probabilities = probabilities.squeeze(0)?;
-        let probabilities_vec = probabilities.to_vec2::<f32>()?;
+
+        // OPTIMIZATION: More efficient result extraction
+        let mut batch_results = Vec::with_capacity(texts.len());
+        for i in 0..texts.len() {
+            let encoding = &encodings[i];
+            let tokens = encoding.get_tokens();
+            let offsets = encoding.get_offsets();
+
+            let text_probs = probabilities.get(i)?; // (seq_len, num_labels)
+            let text_results = self.extract_entities_from_probs(&text_probs, tokens, offsets)?;
+            batch_results.push(text_results);
+        }
+
+        Ok(batch_results)
+    }
+
+    /// Single text token classification with span information (for backward compatibility)
+    pub fn classify_tokens_with_spans(
+        &self,
+        text: &str,
+    ) -> Result<Vec<(String, usize, f32, usize, usize)>> {
+        // Use batch processing for single text
+        let batch_results = self.classify_tokens_batch(&[text])?;
+        if batch_results.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        // Get tokenization info for spans
+        let encoding = self.tokenizer.encode(text, true).map_err(E::msg)?;
+        let offsets = encoding.get_offsets();
 
         let mut results = Vec::new();
-        for ((token, offset), probs) in tokens
-            .iter()
-            .zip(offsets.iter())
-            .zip(probabilities_vec.iter())
-        {
-            let (predicted_class, &confidence) = probs
-                .iter()
-                .enumerate()
-                .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
-                .unwrap();
-
-            results.push((
-                token.clone(),
-                predicted_class,
-                confidence,
-                offset.0,
-                offset.1,
-            ));
+        for (i, (token, class_id, confidence)) in batch_results[0].iter().enumerate() {
+            if i < offsets.len() {
+                let (start_char, end_char) = offsets[i];
+                results.push((token.clone(), *class_id, *confidence, start_char, end_char));
+            }
         }
 
         Ok(results)
     }
+
+    /// Single text token classification (for backward compatibility)
+    pub fn classify_tokens(&self, text: &str) -> Result<Vec<(String, usize, f32)>> {
+        // Use batch processing for single text
+        let batch_results = self.classify_tokens_batch(&[text])?;
+        if batch_results.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        Ok(batch_results.into_iter().next().unwrap())
+    }
 }
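
The single-text entry points are now thin wrappers over classify_tokens_batch, so single and batch callers share one tokenization and forward path, and extract_entities_from_probs drops [PAD]/[CLS]/[SEP] tokens before reporting. A hypothetical end-to-end sketch of the refactored API; the import path, model path, and label count are assumptions, not taken from this commit:

use candle_binding::CandleBertTokenClassifier; // assumed re-export location

fn main() -> anyhow::Result<()> {
    let detector = CandleBertTokenClassifier::new(
        "models/lora_pii_detector_bert-base-uncased_model",
        9,    // placeholder: number of PII token labels
        true, // use_cpu
    )?;

    // Batch path: one forward pass covers every text in the slice.
    let batches = detector.classify_tokens_batch(&[
        "My name is Jane and my card is 4111 1111 1111 1111",
        "No PII here",
    ])?;
    for (token, label, conf) in &batches[0] {
        println!("{token} -> label {label} ({conf:.2})");
    }

    // Backward-compatible single-text path delegates to the batch method.
    let spans = detector.classify_tokens_with_spans("Call me at 555-0100")?;
    for (token, label, conf, start, end) in spans {
        println!("{token} [{start}..{end}] -> label {label} ({conf:.2})");
    }
    Ok(())
}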
