diff --git a/fastembed/late_interaction/colbert.py b/fastembed/late_interaction/colbert.py index 4d65fc29..97c38570 100644 --- a/fastembed/late_interaction/colbert.py +++ b/fastembed/late_interaction/colbert.py @@ -191,6 +191,9 @@ def load_onnx_model(self) -> None: self.tokenizer.encode(symbol, add_special_tokens=False).ids[0] for symbol in string.punctuation } + current_max_length = self.tokenizer.truncation["max_length"] + # ensure not to overflow after adding document-marker + self.tokenizer.enable_truncation(max_length=current_max_length - 1) def embed( self,