Skip to content

Commit 4585d11

Browse files
committed
Reapply "fix(lora): add explicit tokenizer truncation to handle inputs >512 tokens"
This reverts commit c1d68b6.
1 parent c1d68b6 commit 4585d11

File tree

2 files changed: +33 −3 lines changed

candle-binding/src/core/tokenization.rs

Lines changed: 13 additions & 1 deletion
```diff
@@ -387,7 +387,19 @@ impl DualPathTokenizer for UnifiedTokenizer {
         let encoding = tokenizer
             .encode(text, self.config.add_special_tokens)
             .map_err(E::msg)?;
-        Ok(self.encoding_to_result(&encoding))
+
+        // Explicitly enforce max_length truncation for LoRA models
+        // This is a safety check to ensure we never exceed the model's position embedding size
+        let mut result = self.encoding_to_result(&encoding);
+        let max_len = self.config.max_length;
+        if result.token_ids.len() > max_len {
+            result.token_ids.truncate(max_len);
+            result.token_ids_u32.truncate(max_len);
+            result.attention_mask.truncate(max_len);
+            result.tokens.truncate(max_len);
+        }
+
+        Ok(result)
     }

     fn tokenize_batch_smart(
```

candle-binding/src/model_architectures/lora/bert_lora.rs

Lines changed: 20 additions & 2 deletions
```diff
@@ -499,9 +499,18 @@ impl HighPerformanceBertClassifier {

         // Load tokenizer
         let tokenizer_path = Path::new(model_path).join("tokenizer.json");
-        let tokenizer = Tokenizer::from_file(&tokenizer_path)
+        let mut tokenizer = Tokenizer::from_file(&tokenizer_path)
             .map_err(|e| E::msg(format!("Failed to load tokenizer: {}", e)))?;

+        // Configure truncation to max 512 tokens (BERT's position embedding limit)
+        use tokenizers::TruncationParams;
+        tokenizer
+            .with_truncation(Some(TruncationParams {
+                max_length: 512,
+                ..Default::default()
+            }))
+            .map_err(E::msg)?;
+
         // Load model weights
         let weights_path = if Path::new(model_path).join("model.safetensors").exists() {
             Path::new(model_path).join("model.safetensors")
```
```diff
@@ -690,9 +699,18 @@ impl HighPerformanceBertTokenClassifier {

         // Load tokenizer
         let tokenizer_path = Path::new(model_path).join("tokenizer.json");
-        let tokenizer = Tokenizer::from_file(&tokenizer_path)
+        let mut tokenizer = Tokenizer::from_file(&tokenizer_path)
             .map_err(|e| E::msg(format!("Failed to load tokenizer: {}", e)))?;

+        // Configure truncation to max 512 tokens (BERT's position embedding limit)
+        use tokenizers::TruncationParams;
+        tokenizer
+            .with_truncation(Some(TruncationParams {
+                max_length: 512,
+                ..Default::default()
+            }))
+            .map_err(E::msg)?;
+
         // Load model weights
         let weights_path = if Path::new(model_path).join("model.safetensors").exists() {
             Path::new(model_path).join("model.safetensors")
```

0 commit comments

Comments (0)