7 changes: 6 additions & 1 deletion fastembed/late_interaction/colbert.py
@@ -80,7 +80,12 @@ def _preprocess_onnx_input(
)
return onnx_input

def tokenize(self, documents: list[str], is_doc: bool = True, **kwargs: Any) -> list[Encoding]:
def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
return self._tokenize(documents, **kwargs)

def _tokenize(
self, documents: list[str], is_doc: bool = True, **kwargs: Any
) -> list[Encoding]:
return (
self._tokenize_documents(documents=documents)
if is_doc
5 changes: 5 additions & 0 deletions fastembed/late_interaction/late_interaction_embedding_base.py
@@ -1,5 +1,7 @@
from typing import Iterable, Optional, Union, Any

from tokenizers import Encoding

from fastembed.common.model_description import DenseModelDescription
from fastembed.common.types import NumpyArray
from fastembed.common.model_management import ModelManagement
@@ -19,6 +21,9 @@ def __init__(
self._local_files_only = kwargs.pop("local_files_only", False)
self._embedding_size: Optional[int] = None

def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
raise NotImplementedError("Subclasses must implement this method.")

def embed(
self,
documents: Union[str, Iterable[str]],
15 changes: 15 additions & 0 deletions fastembed/late_interaction/late_interaction_text_embedding.py
@@ -1,6 +1,8 @@
from typing import Any, Iterable, Optional, Sequence, Type, Union
from dataclasses import asdict

from tokenizers import Encoding

from fastembed.common.model_description import DenseModelDescription
from fastembed.common.types import NumpyArray
from fastembed.common import OnnxProvider
@@ -114,6 +116,19 @@ def get_embedding_size(cls, model_name: str) -> int:
)
return embedding_size

def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
"""
Tokenize input texts using the model's tokenizer.

Args:
documents: List of strings to tokenize
**kwargs: Additional arguments passed to the tokenizer

Returns:
List of tokenizer Encodings
"""
return self.model.tokenize(documents, **kwargs)

def embed(
self,
documents: Union[str, Iterable[str]],
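
A caller-side sketch of the new public API, mirroring the test added at the end of this PR (model name taken from those tests); note that is_doc travels through **kwargs down to Colbert._tokenize:

    from fastembed import LateInteractionTextEmbedding

    model = LateInteractionTextEmbedding(model_name="answerdotai/answerai-colbert-small-v1")
    # Document mode is the default; is_doc=False requests query-style tokenization.
    doc_encodings = model.tokenize(["hello world", "flag embedding"])
    query_encodings = model.tokenize(["hello world"], is_doc=False)
    print(len(doc_encodings[0].ids), len(query_encodings[0].ids))
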
6 changes: 5 additions & 1 deletion fastembed/late_interaction_multimodal/colpali.py
@@ -161,13 +161,17 @@ def _post_process_onnx_text_output(
return output.model_output

def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
return self._tokenize(documents, **kwargs)

def _tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
texts_query: list[str] = []
for query in documents:
query = self.BOS_TOKEN + self.QUERY_PREFIX + query + self.PAD_TOKEN * 10
query += "\n"

texts_query.append(query)
encoded = self.tokenizer.encode_batch(texts_query) # type: ignore[union-attr]
assert self.tokenizer is not None
encoded = self.tokenizer.encode_batch(texts_query)
return encoded
Comment on lines +166 to 175

⚠️ Potential issue | 🟠 Major

Forward kwargs to enable tokenization customization.

The **kwargs parameter is declared but not forwarded to encode_batch at line 174. This prevents callers from customizing tokenization behavior. Other implementations in this PR forward kwargs (e.g., onnx_multimodal_model.py).

Apply this diff:

 def _tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
     texts_query: list[str] = []
     for query in documents:
         query = self.BOS_TOKEN + self.QUERY_PREFIX + query + self.PAD_TOKEN * 10
         query += "\n"

         texts_query.append(query)
     assert self.tokenizer is not None
-    encoded = self.tokenizer.encode_batch(texts_query)
+    encoded = self.tokenizer.encode_batch(texts_query, **kwargs)
     return encoded
🧰 Tools
🪛 Ruff (0.14.3)

166-166: Unused method argument: kwargs

(ARG002)

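For illustration, a hedged caller-side sketch of what forwarding enables once the diff above is applied. It assumes an already-constructed ColPali-backed `model`; add_special_tokens is a standard keyword of tokenizers' encode_batch, used here purely as an example option:

    # Before the fix this keyword is silently dropped; after it, the option
    # reaches tokenizer.encode_batch.
    encodings = model.tokenize(
        ["what is late interaction?"],
        add_special_tokens=False,
    )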


def _preprocess_onnx_text_input(
fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding.py
@@ -1,6 +1,8 @@
from typing import Any, Iterable, Optional, Sequence, Type, Union
from dataclasses import asdict

from tokenizers import Encoding

from fastembed.common import OnnxProvider, ImageInput
from fastembed.common.types import NumpyArray
from fastembed.late_interaction_multimodal.colpali import ColPali
@@ -117,6 +119,19 @@ def get_embedding_size(cls, model_name: str) -> int:
)
return embedding_size

def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
"""
Tokenize input texts using the model's tokenizer.

Args:
documents: List of strings to tokenize
**kwargs: Additional arguments passed to the tokenizer

Returns:
List of tokenizer Encodings
"""
return self.model.tokenize(documents, **kwargs)

def embed_text(
self,
documents: Union[str, Iterable[str]],
fastembed/late_interaction_multimodal/late_interaction_multimodal_embedding_base.py
@@ -1,5 +1,6 @@
from typing import Iterable, Optional, Union, Any

from tokenizers import Encoding

from fastembed.common import ImageInput
from fastembed.common.model_description import DenseModelDescription
@@ -21,6 +22,9 @@ def __init__(
self._local_files_only = kwargs.pop("local_files_only", False)
self._embedding_size: Optional[int] = None

def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
raise NotImplementedError("Subclasses must implement this method.")

def embed_text(
self,
documents: Union[str, Iterable[str]],
fastembed/late_interaction_multimodal/onnx_multimodal_model.py
@@ -80,15 +80,17 @@ def _load_onnx_model(
def load_onnx_model(self) -> None:
raise NotImplementedError("Subclasses must implement this method")

def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
return self.tokenizer.encode_batch(documents) # type: ignore[union-attr]
def _tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
if self.tokenizer is None:
raise RuntimeError("Tokenizer not initialized")
return self.tokenizer.encode_batch(documents, **kwargs) # type: ignore[union-attr]

def onnx_embed_text(
self,
documents: list[str],
**kwargs: Any,
) -> OnnxOutputContext:
encoded = self.tokenize(documents, **kwargs)
encoded = self._tokenize(documents, **kwargs)
input_ids = np.array([e.ids for e in encoded])
attention_mask = np.array([e.attention_mask for e in encoded]) # type: ignore[union-attr]
input_names = {node.name for node in self.model.get_inputs()} # type: ignore[union-attr]
4 changes: 2 additions & 2 deletions fastembed/rerank/cross_encoder/onnx_text_model.py
Original file line number Diff line number Diff line change
@@ -45,8 +45,8 @@ def _load_onnx_model(
self.tokenizer, _ = load_tokenizer(model_dir=model_dir)
assert self.tokenizer is not None

def tokenize(self, pairs: list[tuple[str, str]], **_: Any) -> list[Encoding]:
return self.tokenizer.encode_batch(pairs) # type: ignore[union-attr]
def tokenize(self, pairs: list[tuple[str, str]], **kwargs: Any) -> list[Encoding]:
return self.tokenizer.encode_batch(pairs, **kwargs) # type: ignore[union-attr]

def _build_onnx_input(self, tokenized_input: list[Encoding]) -> dict[str, NumpyArray]:
input_names: set[str] = {node.name for node in self.model.get_inputs()} # type: ignore[union-attr]
3 changes: 3 additions & 0 deletions fastembed/sparse/bm25.py
@@ -136,6 +136,9 @@ def __init__(

self.tokenizer = SimpleTokenizer

def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
raise NotImplementedError("Tokenize method for sparse embeddings is not implemented yet.")

@classmethod
def _list_supported_models(cls) -> list[SparseModelDescription]:
"""Lists the supported models.
3 changes: 3 additions & 0 deletions fastembed/sparse/bm42.py
@@ -138,6 +138,9 @@ def __init__(
if not self.lazy_load:
self.load_onnx_model()

def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
raise NotImplementedError("Tokenize method for sparse embeddings is not implemented yet.")

def load_onnx_model(self) -> None:
self._load_onnx_model(
model_dir=self._model_dir,
3 changes: 3 additions & 0 deletions fastembed/sparse/minicoil.py
@@ -145,6 +145,9 @@ def __init__(
if not self.lazy_load:
self.load_onnx_model()

def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
raise NotImplementedError("Tokenize method for sparse embeddings is not implemented yet.")

def load_onnx_model(self) -> None:
self._load_onnx_model(
model_dir=self._model_dir,
3 changes: 3 additions & 0 deletions fastembed/sparse/sparse_embedding_base.py
@@ -44,6 +44,9 @@ def __init__(
self.threads = threads
self._local_files_only = kwargs.pop("local_files_only", False)

def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
raise NotImplementedError("Tokenize method for sparse embeddings is not implemented yet.")

def embed(
self,
documents: Union[str, Iterable[str]],
10 changes: 7 additions & 3 deletions fastembed/sparse/sparse_text_embedding.py
@@ -1,7 +1,10 @@
from typing import Any, Iterable, Optional, Sequence, Type, Union
import warnings
from dataclasses import asdict
from typing import Any, Iterable, Optional, Sequence, Type, Union


from fastembed.common import OnnxProvider
from fastembed.common.model_description import SparseModelDescription
from fastembed.sparse.bm25 import Bm25
from fastembed.sparse.bm42 import Bm42
from fastembed.sparse.minicoil import MiniCOIL
@@ -10,8 +13,6 @@
SparseTextEmbeddingBase,
)
from fastembed.sparse.splade_pp import SpladePP
import warnings
from fastembed.common.model_description import SparseModelDescription


class SparseTextEmbedding(SparseTextEmbeddingBase):
@@ -91,6 +92,9 @@ def __init__(
"Please check the supported models using `SparseTextEmbedding.list_supported_models()`"
)

def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
raise NotImplementedError("Tokenize method for sparse embeddings is not implemented yet.")
Comment on lines +95 to +96

🛠️ Refactor suggestion | 🟠 Major

Remove duplicate implementation or delegate to self.model.

The tokenize method duplicates the base class implementation exactly. Since SparseTextEmbedding follows a delegation pattern for other methods (embed delegates to self.model.embed(), query_embed delegates to self.model.query_embed()), the tokenize method should either:

  1. Preferred: Delegate to self.model.tokenize() for consistency with the existing pattern, allowing concrete implementations to provide their own tokenization logic in the future.
  2. Alternative: Remove this override entirely and inherit the base class implementation.

The current implementation breaks the delegation pattern and creates unnecessary duplication.

Option 1 (Preferred): Delegate to self.model

 def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
-    raise NotImplementedError("Tokenize method for sparse embeddings is not implemented yet.")
+    """
+    Tokenizes a list of documents.
+    
+    Args:
+        documents (list[str]): The list of documents to tokenize.
+        **kwargs: Additional keyword arguments for tokenization.
+    
+    Returns:
+        dict[str, Any]: Tokenized representation of the documents.
+    """
+    return self.model.tokenize(documents, **kwargs)

Option 2 (Alternative): Remove the override

-def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
-    raise NotImplementedError("Tokenize method for sparse embeddings is not implemented yet.")
-

Committable suggestion skipped: line range outside the PR's diff.

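A hypothetical caller-side view of why the current override is surprising (model name assumed from fastembed's supported sparse models):

    from fastembed import SparseTextEmbedding

    model = SparseTextEmbedding(model_name="Qdrant/bm42-all-minilm-l6-v2-attentions")
    try:
        model.tokenize(["hello world"])
    except NotImplementedError as err:
        # The facade raises unconditionally, even if self.model were to gain a
        # working tokenize() later; delegating to self.model avoids this.
        print(err)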


def embed(
self,
documents: Union[str, Iterable[str]],
4 changes: 4 additions & 0 deletions fastembed/sparse/splade_pp.py
@@ -1,6 +1,7 @@
from typing import Any, Iterable, Optional, Sequence, Type, Union

import numpy as np

from fastembed.common import OnnxProvider
from fastembed.common.onnx_model import OnnxOutputContext
from fastembed.common.utils import define_cache_dir
@@ -135,6 +136,9 @@ def load_onnx_model(self) -> None:
device_id=self.device_id,
)

def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
raise NotImplementedError("Tokenize method for sparse embeddings is not implemented yet.")

def embed(
self,
documents: Union[str, Iterable[str]],
5 changes: 5 additions & 0 deletions fastembed/text/onnx_embedding.py
@@ -1,5 +1,7 @@
from typing import Any, Iterable, Optional, Sequence, Type, Union

from tokenizers import Encoding

from fastembed.common.types import NumpyArray, OnnxProvider
from fastembed.common.onnx_model import OnnxOutputContext
from fastembed.common.utils import define_cache_dir, normalize
@@ -319,6 +321,9 @@ def _post_process_onnx_output(
raise ValueError(f"Unsupported embedding shape: {embeddings.shape}")
return normalize(processed_embeddings)

def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
return self._tokenize(documents, **kwargs)

def load_onnx_model(self) -> None:
self._load_onnx_model(
model_dir=self._model_dir,
6 changes: 3 additions & 3 deletions fastembed/text/onnx_text_model.py
@@ -68,15 +68,15 @@ def _load_onnx_model(
def load_onnx_model(self) -> None:
raise NotImplementedError("Subclasses must implement this method")

def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
return self.tokenizer.encode_batch(documents) # type: ignore[union-attr]
def _tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
return self.tokenizer.encode_batch(documents) # type:ignore[union-attr]
Comment on lines +71 to +72

⚠️ Potential issue | 🟠 Major

Forward kwargs to enable tokenization customization.

The **kwargs parameter is declared but not forwarded to encode_batch. This breaks customization options that callers might pass through the tokenization chain. Other implementations in this PR (e.g., onnx_multimodal_model.py, onnx_text_model.py in rerank) forward kwargs.

Apply this diff:

 def _tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
-    return self.tokenizer.encode_batch(documents)  # type:ignore[union-attr]
+    return self.tokenizer.encode_batch(documents, **kwargs)  # type:ignore[union-attr]
🧰 Tools
🪛 Ruff (0.14.3)

71-71: Unused method argument: kwargs

(ARG002)



def onnx_embed(
self,
documents: list[str],
**kwargs: Any,
) -> OnnxOutputContext:
encoded = self.tokenize(documents, **kwargs)
encoded = self._tokenize(documents, **kwargs)
input_ids = np.array([e.ids for e in encoded])
attention_mask = np.array([e.attention_mask for e in encoded])
input_names = {node.name for node in self.model.get_inputs()} # type: ignore[union-attr]
14 changes: 14 additions & 0 deletions fastembed/text/text_embedding.py
@@ -1,5 +1,6 @@
import warnings
from typing import Any, Iterable, Optional, Sequence, Type, Union
from tokenizers import Encoding
from dataclasses import asdict

from fastembed.common.types import NumpyArray, OnnxProvider
@@ -162,6 +163,19 @@ def get_embedding_size(cls, model_name: str) -> int:
)
return embedding_size

def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
"""
Tokenize input texts using the model's tokenizer.

Args:
documents: List of strings to tokenize
**kwargs: Additional arguments passed to the tokenizer

Returns:
List of tokenizer Encodings
"""
return self.model.tokenize(documents, **kwargs)

def embed(
self,
documents: Union[str, Iterable[str]],
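
A short usage sketch of the new public method (the model name is an assumption; any supported dense model works). Each returned tokenizers.Encoding exposes the ids and attention mask that onnx_embed later feeds to the model:

    from fastembed import TextEmbedding

    model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
    enc = model.tokenize(["hello world"])[0]
    print(enc.tokens)          # human-readable tokens, including special tokens
    print(enc.ids)             # input_ids for the ONNX session
    print(enc.attention_mask)  # attention mask used when building the ONNX input
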
4 changes: 4 additions & 0 deletions fastembed/text/text_embedding_base.py
@@ -1,4 +1,5 @@
from typing import Iterable, Optional, Union, Any
from tokenizers import Encoding

from fastembed.common.model_description import DenseModelDescription
from fastembed.common.types import NumpyArray
@@ -19,6 +20,9 @@ def __init__(
self._local_files_only = kwargs.pop("local_files_only", False)
self._embedding_size: Optional[int] = None

def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
raise NotImplementedError("Subclasses must implement this method.")

def embed(
self,
documents: Union[str, Iterable[str]],
56 changes: 43 additions & 13 deletions tests/test_late_interaction_embeddings.py
@@ -249,23 +249,27 @@ def test_single_embedding_query(model_cache, model_name: str):


@pytest.mark.parametrize("token_dim,model_name", [(96, "answerdotai/answerai-colbert-small-v1")])
def test_parallel_processing(model_cache, token_dim: int, model_name: str):
with model_cache(model_name) as model:
docs = ["hello world", "flag embedding"] * 100
embeddings = list(model.embed(docs, batch_size=10, parallel=2))

embeddings_2 = list(model.embed(docs, batch_size=10, parallel=None))
def test_parallel_processing(token_dim: int, model_name: str):
# this test loads a copy of the model per process, which might cause OOM with parallel=0
# on machines with an insufficient memory-to-CPU ratio
is_ci = os.getenv("CI")
model = LateInteractionTextEmbedding(model_name=model_name)
docs = ["hello world", "flag embedding"] * 100
embeddings = list(model.embed(docs, batch_size=10, parallel=2))

embeddings_2 = list(model.embed(docs, batch_size=10, parallel=None))

# embeddings_3 = list(model.embed(docs, batch_size=10, parallel=0)) # inherits OnnxTextModel which
# # is tested in TextEmbedding, disabling it here to reduce number of requests to hf
# # multiprocessing is enough to test with `parallel=2`, and `parallel=None` is okay to test since it reuses
# # model from cache
# embeddings_3 = list(model.embed(docs, batch_size=10, parallel=0)) # inherits OnnxTextModel which
# # is tested in TextEmbedding, disabling it here to reduce number of requests to hf
# # multiprocessing is enough to test with `parallel=2`, and `parallel=None` is okay to test since it reuses
# # model from cache

assert len(embeddings) == len(docs) and embeddings[0].shape[-1] == token_dim
assert len(embeddings) == len(docs) and embeddings[0].shape[-1] == token_dim

for i in range(len(embeddings)):
assert np.allclose(embeddings[i], embeddings_2[i], atol=1e-3)
# assert np.allclose(embeddings[i], embeddings_3[i], atol=1e-3)
for i in range(len(embeddings)):
assert np.allclose(embeddings[i], embeddings_2[i], atol=1e-3)
# assert np.allclose(embeddings[i], embeddings_3[i], atol=1e-3)


@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
@@ -308,3 +312,29 @@ def test_embedding_size():
assert model.embedding_size == 96
if is_ci:
delete_model_cache(model.model._model_dir)


@pytest.mark.parametrize("model_name", ["answerdotai/answerai-colbert-small-v1"])
def test_tokenize(model_name: str) -> None:
is_ci = os.getenv("CI")
model = LateInteractionTextEmbedding(model_name=model_name)

texts = ["hello world", "flag embedding"]
enc_doc = model.tokenize(texts, is_doc=True)
assert len(enc_doc) == 2
for encoding in enc_doc:
assert encoding.ids is not None
assert len(encoding.ids) > 0

enc_query = model.tokenize(["hello world"], is_doc=False)
assert len(enc_query) == 1
assert enc_query[0].ids is not None
assert len(enc_query[0].ids) == 31  # ColBERT requires queries to be at least 32 tokens;
# padding is done during tokenization, and the final marker token is added in _preprocess_onnx_input

doc_ids = list(enc_doc[0].ids)
query_ids = list(enc_query[0].ids)
assert doc_ids != query_ids

if is_ci:
delete_model_cache(model.model._model_dir)