Commit cf8691b

fix(lora): add explicit tokenizer truncation to handle inputs >512 tokens
This commit fixes LoRA tokenization errors that occurred when processing inputs exceeding 512 tokens, which caused "index-select invalid index 512 with dim size 512" errors and resulted in empty predictions.

Changes:
- Added explicit truncation configuration to the BertLoRAClassifier tokenizer
- Added a safety check in UnifiedTokenizer::tokenize_for_lora()
- Ensures all inputs are properly truncated to BERT's 512-token limit

Test results:
- LoRA accuracy improved from ~40% (with empty predictions) to 80.36%
- 0 tokenization errors on 280 MMLU-Pro test cases
- 0 empty predictions

Fixes the accuracy regression reported in vllm-project#726

Signed-off-by: Yossi Ovadia <[email protected]>
1 parent e62acbf commit cf8691b
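For context on the quoted failure: BERT-base has a learned position-embedding table of exactly 512 rows, so an un-truncated sequence of 513+ tokens asks for a position index that does not exist. A minimal sketch of that lookup with candle-core (illustrative only, not code from this repository; it assumes the CPU backend and a 512×768 embedding table):

use candle_core::{DType, Device, Tensor};

fn main() -> candle_core::Result<()> {
    let device = Device::Cpu;
    // BERT-base learns exactly 512 position embeddings (indices 0..=511).
    let position_embeddings = Tensor::zeros((512, 768), DType::F32, &device)?;
    // An un-truncated 513-token input asks for position id 512, which has no row.
    let position_ids = Tensor::arange(0u32, 513u32, &device)?;
    // The out-of-range lookup is expected to surface as an error like the one
    // quoted in the commit message rather than a valid embedding slice.
    match position_embeddings.index_select(&position_ids, 0) {
        Ok(t) => println!("unexpected success, shape {:?}", t.shape()),
        Err(e) => println!("lookup fails for the 513th token: {e}"),
    }
    Ok(())
}

Truncating inputs to 512 tokens before the forward pass, as the diffs below do, avoids this lookup entirely.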

File tree (2 files changed, +33 −3 lines):

- candle-binding/src/core/tokenization.rs
- candle-binding/src/model_architectures/lora/bert_lora.rs
candle-binding/src/core/tokenization.rs

Lines changed: 13 additions & 1 deletion
@@ -387,7 +387,19 @@ impl DualPathTokenizer for UnifiedTokenizer {
         let encoding = tokenizer
             .encode(text, self.config.add_special_tokens)
             .map_err(E::msg)?;
-        Ok(self.encoding_to_result(&encoding))
+
+        // Explicitly enforce max_length truncation for LoRA models
+        // This is a safety check to ensure we never exceed the model's position embedding size
+        let mut result = self.encoding_to_result(&encoding);
+        let max_len = self.config.max_length;
+        if result.token_ids.len() > max_len {
+            result.token_ids.truncate(max_len);
+            result.token_ids_u32.truncate(max_len);
+            result.attention_mask.truncate(max_len);
+            result.tokens.truncate(max_len);
+        }
+
+        Ok(result)
     }

     fn tokenize_batch_smart(
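Because the safety check added to tokenize_for_lora() operates on plain vectors, its behavior can be exercised without loading a model. A minimal sketch under assumptions: the TokenizationResult field names and element types are inferred from the diff above, and enforce_max_length is a hypothetical standalone helper, not part of the crate:

// Hypothetical standalone version of the truncation guard shown in the diff.
struct TokenizationResult {
    token_ids: Vec<i64>,        // field names taken from the diff; types assumed
    token_ids_u32: Vec<u32>,
    attention_mask: Vec<i64>,
    tokens: Vec<String>,
}

fn enforce_max_length(result: &mut TokenizationResult, max_len: usize) {
    // Truncate every parallel vector so they stay the same length.
    if result.token_ids.len() > max_len {
        result.token_ids.truncate(max_len);
        result.token_ids_u32.truncate(max_len);
        result.attention_mask.truncate(max_len);
        result.tokens.truncate(max_len);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn truncates_overlong_input_to_512() {
        let mut result = TokenizationResult {
            token_ids: vec![0; 600],
            token_ids_u32: vec![0; 600],
            attention_mask: vec![1; 600],
            tokens: vec![String::from("tok"); 600],
        };
        enforce_max_length(&mut result, 512);
        assert_eq!(result.token_ids.len(), 512);
        assert_eq!(result.attention_mask.len(), 512);
    }
}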

candle-binding/src/model_architectures/lora/bert_lora.rs

Lines changed: 20 additions & 2 deletions
@@ -499,9 +499,18 @@ impl HighPerformanceBertClassifier {

         // Load tokenizer
         let tokenizer_path = Path::new(model_path).join("tokenizer.json");
-        let tokenizer = Tokenizer::from_file(&tokenizer_path)
+        let mut tokenizer = Tokenizer::from_file(&tokenizer_path)
             .map_err(|e| E::msg(format!("Failed to load tokenizer: {}", e)))?;

+        // Configure truncation to max 512 tokens (BERT's position embedding limit)
+        use tokenizers::TruncationParams;
+        tokenizer
+            .with_truncation(Some(TruncationParams {
+                max_length: 512,
+                ..Default::default()
+            }))
+            .map_err(E::msg)?;
+
         // Load model weights
         let weights_path = if Path::new(model_path).join("model.safetensors").exists() {
             Path::new(model_path).join("model.safetensors")

@@ -690,9 +699,18 @@ impl HighPerformanceBertTokenClassifier {

         // Load tokenizer
         let tokenizer_path = Path::new(model_path).join("tokenizer.json");
-        let tokenizer = Tokenizer::from_file(&tokenizer_path)
+        let mut tokenizer = Tokenizer::from_file(&tokenizer_path)
             .map_err(|e| E::msg(format!("Failed to load tokenizer: {}", e)))?;

+        // Configure truncation to max 512 tokens (BERT's position embedding limit)
+        use tokenizers::TruncationParams;
+        tokenizer
+            .with_truncation(Some(TruncationParams {
+                max_length: 512,
+                ..Default::default()
+            }))
+            .map_err(E::msg)?;
+
         // Load model weights
         let weights_path = if Path::new(model_path).join("model.safetensors").exists() {
             Path::new(model_path).join("model.safetensors")
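The tokenizer-side half of the fix can be checked in isolation with the tokenizers crate. A small sketch of the same configuration as the hunks above (illustrative only; the tokenizer.json path is a placeholder, and it assumes a tokenizers version where with_truncation returns a Result, as the diff implies):

use tokenizers::{Tokenizer, TruncationParams};

fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    // Placeholder path: any BERT-style tokenizer.json works here.
    let mut tokenizer = Tokenizer::from_file("path/to/tokenizer.json")?;

    // Same configuration as the diff above: cap sequences at BERT's 512-token limit.
    tokenizer.with_truncation(Some(TruncationParams {
        max_length: 512,
        ..Default::default()
    }))?;

    // A deliberately overlong input, well past 512 wordpieces.
    let long_input = "the quick brown fox ".repeat(500);
    let encoding = tokenizer.encode(long_input.as_str(), true)?;

    // With truncation configured, the encoding never exceeds 512 ids.
    assert!(encoding.get_ids().len() <= 512);
    println!("tokens after truncation: {}", encoding.get_ids().len());
    Ok(())
}

With TruncationParams left at its default strategy, everything past the 512th token is simply dropped, so downstream position-id lookups stay inside the embedding table.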
