Commit c1d68b6

Revert "fix(lora): add explicit tokenizer truncation to handle inputs >512 tokens"
This reverts commit b09e4ac.
1 parent: b09e4ac

File tree

2 files changed (+3, -33 lines)

candle-binding/src/core/tokenization.rs

Lines changed: 1 addition & 13 deletions

@@ -387,19 +387,7 @@ impl DualPathTokenizer for UnifiedTokenizer {
         let encoding = tokenizer
             .encode(text, self.config.add_special_tokens)
             .map_err(E::msg)?;
-
-        // Explicitly enforce max_length truncation for LoRA models
-        // This is a safety check to ensure we never exceed the model's position embedding size
-        let mut result = self.encoding_to_result(&encoding);
-        let max_len = self.config.max_length;
-        if result.token_ids.len() > max_len {
-            result.token_ids.truncate(max_len);
-            result.token_ids_u32.truncate(max_len);
-            result.attention_mask.truncate(max_len);
-            result.tokens.truncate(max_len);
-        }
-
-        Ok(result)
+        Ok(self.encoding_to_result(&encoding))
     }
 
     fn tokenize_batch_smart(
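
The safety check removed above clamps the already-encoded sequence rather than relying on tokenizer configuration. Below is a minimal standalone sketch of that pattern, assuming the `tokenizers` crate and plain `Vec<u32>` outputs; the real code operates on the project's own tokenization-result struct, and `max_len` here stands in for `self.config.max_length`.

    use tokenizers::Tokenizer;

    type TokError = Box<dyn std::error::Error + Send + Sync>;

    // Encode `text`, then clamp the ids and attention mask to `max_len` tokens
    // so the sequence can never exceed the model's position-embedding size.
    fn encode_with_cap(
        tokenizer: &Tokenizer,
        text: &str,
        max_len: usize,
    ) -> Result<(Vec<u32>, Vec<u32>), TokError> {
        let encoding = tokenizer.encode(text, true)?;

        let mut ids: Vec<u32> = encoding.get_ids().to_vec();
        let mut mask: Vec<u32> = encoding.get_attention_mask().to_vec();
        if ids.len() > max_len {
            ids.truncate(max_len);
            mask.truncate(max_len);
        }
        Ok((ids, mask))
    }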

candle-binding/src/model_architectures/lora/bert_lora.rs

Lines changed: 2 additions & 20 deletions

@@ -499,18 +499,9 @@ impl HighPerformanceBertClassifier {
 
         // Load tokenizer
         let tokenizer_path = Path::new(model_path).join("tokenizer.json");
-        let mut tokenizer = Tokenizer::from_file(&tokenizer_path)
+        let tokenizer = Tokenizer::from_file(&tokenizer_path)
             .map_err(|e| E::msg(format!("Failed to load tokenizer: {}", e)))?;
 
-        // Configure truncation to max 512 tokens (BERT's position embedding limit)
-        use tokenizers::TruncationParams;
-        tokenizer
-            .with_truncation(Some(TruncationParams {
-                max_length: 512,
-                ..Default::default()
-            }))
-            .map_err(E::msg)?;
-
         // Load model weights
         let weights_path = if Path::new(model_path).join("model.safetensors").exists() {
             Path::new(model_path).join("model.safetensors")

@@ -699,18 +690,9 @@ impl HighPerformanceBertTokenClassifier {
 
         // Load tokenizer
         let tokenizer_path = Path::new(model_path).join("tokenizer.json");
-        let mut tokenizer = Tokenizer::from_file(&tokenizer_path)
+        let tokenizer = Tokenizer::from_file(&tokenizer_path)
             .map_err(|e| E::msg(format!("Failed to load tokenizer: {}", e)))?;
 
-        // Configure truncation to max 512 tokens (BERT's position embedding limit)
-        use tokenizers::TruncationParams;
-        tokenizer
-            .with_truncation(Some(TruncationParams {
-                max_length: 512,
-                ..Default::default()
-            }))
-            .map_err(E::msg)?;
-
         // Load model weights
         let weights_path = if Path::new(model_path).join("model.safetensors").exists() {
             Path::new(model_path).join("model.safetensors")
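
The block removed from both constructors configured truncation on the tokenizer itself rather than trimming afterwards. Below is a minimal sketch of that approach, assuming a `tokenizers` crate version where `with_truncation` returns a `Result` (as in the removed lines); the `load_capped_tokenizer` name, the path argument, and the 512 cap are illustrative, not part of this repository.

    use tokenizers::{Tokenizer, TruncationParams};

    type TokError = Box<dyn std::error::Error + Send + Sync>;

    // Load a tokenizer.json from disk and cap every encoding at 512 tokens,
    // matching BERT's position-embedding limit.
    fn load_capped_tokenizer(path: &str) -> Result<Tokenizer, TokError> {
        let mut tokenizer = Tokenizer::from_file(path)?;

        tokenizer.with_truncation(Some(TruncationParams {
            max_length: 512,
            ..Default::default()
        }))?;

        Ok(tokenizer)
    }

With truncation set this way, every subsequent `encode` call clamps its output before the ids reach the model, so no per-call length check is needed.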
