# feat: add public tokenize method for TextEmbedding class #564
Changes from all commits:
```diff
@@ -161,13 +161,17 @@ def _post_process_onnx_text_output(
         return output.model_output

     def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
+        return self._tokenize(documents, **kwargs)
+
+    def _tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
         texts_query: list[str] = []
         for query in documents:
             query = self.BOS_TOKEN + self.QUERY_PREFIX + query + self.PAD_TOKEN * 10
             query += "\n"

             texts_query.append(query)
-        encoded = self.tokenizer.encode_batch(texts_query)  # type: ignore[union-attr]
+        assert self.tokenizer is not None
+        encoded = self.tokenizer.encode_batch(texts_query)
         return encoded

     def _preprocess_onnx_text_input(
```
**Comment on lines +166 to 175**

Forward `kwargs` to enable tokenization customization. The `**kwargs` parameter is accepted by `_tokenize` but never forwarded to `encode_batch`, so callers cannot influence tokenization and Ruff flags the argument as unused.

Apply this diff:

```diff
 def _tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
     texts_query: list[str] = []
     for query in documents:
         query = self.BOS_TOKEN + self.QUERY_PREFIX + query + self.PAD_TOKEN * 10
         query += "\n"
         texts_query.append(query)
     assert self.tokenizer is not None
-    encoded = self.tokenizer.encode_batch(texts_query)
+    encoded = self.tokenizer.encode_batch(texts_query, **kwargs)
     return encoded
```

🧰 Ruff (0.14.3): `166-166: Unused method argument (ARG002)`
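For context, here is a minimal usage sketch of the public `tokenize` API this PR introduces, assuming the `TextEmbedding` facade delegates to the underlying ONNX model as the diffs suggest. The model name and the inspection code are illustrative assumptions, not taken from the diff:

```python
# Hypothetical usage sketch -- model name and printed fields are
# assumptions for illustration, not part of this PR.
from fastembed import TextEmbedding

model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")

# The new public tokenize() would expose the tokenizers.Encoding objects
# that onnx_embed() builds its input_ids/attention_mask from, e.g. to
# count tokens before embedding.
encodings = model.tokenize(["FastEmbed is fast.", "Tokenize me too."])
for enc in encodings:
    print(len(enc.ids), enc.tokens[:5])
```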
```diff
@@ -1,7 +1,10 @@
-from typing import Any, Iterable, Optional, Sequence, Type, Union
+import warnings
+from dataclasses import asdict
+from typing import Any, Iterable, Optional, Sequence, Type, Union

 from fastembed.common import OnnxProvider
+from fastembed.common.model_description import SparseModelDescription
 from fastembed.sparse.bm25 import Bm25
 from fastembed.sparse.bm42 import Bm42
 from fastembed.sparse.minicoil import MiniCOIL
@@ -10,8 +13,6 @@
     SparseTextEmbeddingBase,
 )
 from fastembed.sparse.splade_pp import SpladePP
-import warnings
-from fastembed.common.model_description import SparseModelDescription


 class SparseTextEmbedding(SparseTextEmbeddingBase):
@@ -91,6 +92,9 @@ def __init__(
                 "Please check the supported models using `SparseTextEmbedding.list_supported_models()`"
             )

+    def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
+        raise NotImplementedError("Tokenize method for sparse embeddings is not implemented yet.")
+
     def embed(
         self,
         documents: Union[str, Iterable[str]],
```
**Comment on lines +95 to +96**

🛠️ Refactor suggestion | 🟠 Major

Remove the duplicate implementation or delegate to `self.model`. The current implementation breaks the delegation pattern and creates unnecessary duplication.

**Option 1 (Preferred): Delegate to `self.model`**

```diff
 def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
-    raise NotImplementedError("Tokenize method for sparse embeddings is not implemented yet.")
+    """
+    Tokenizes a list of documents.
+
+    Args:
+        documents (list[str]): The list of documents to tokenize.
+        **kwargs: Additional keyword arguments for tokenization.
+
+    Returns:
+        dict[str, Any]: Tokenized representation of the documents.
+    """
+    return self.model.tokenize(documents, **kwargs)
```

**Option 2 (Alternative): Remove the override**

```diff
-def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
-    raise NotImplementedError("Tokenize method for sparse embeddings is not implemented yet.")
-
```
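To illustrate the delegation pattern the reviewer refers to, here is a minimal, self-contained sketch. The class names are invented for illustration; this is not fastembed's actual code:

```python
from typing import Any


class BaseEmbedding:
    """Interface: concrete models may or may not implement tokenize()."""

    def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
        raise NotImplementedError("Subclasses must implement this method")


class ConcreteModel(BaseEmbedding):
    def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
        return {"ids": [[len(d)] for d in documents]}  # stand-in tokenization


class Facade(BaseEmbedding):
    """Selects a concrete model at runtime and forwards public calls to it."""

    def __init__(self, model: BaseEmbedding) -> None:
        self.model = model

    def tokenize(self, documents: list[str], **kwargs: Any) -> dict[str, Any]:
        # Delegating (rather than re-raising NotImplementedError) keeps a
        # concrete model's tokenize() reachable through the facade.
        return self.model.tokenize(documents, **kwargs)


print(Facade(ConcreteModel()).tokenize(["hello"]))  # {'ids': [[5]]}
```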
```diff
@@ -68,15 +68,15 @@ def _load_onnx_model(
     def load_onnx_model(self) -> None:
         raise NotImplementedError("Subclasses must implement this method")

-    def tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
-        return self.tokenizer.encode_batch(documents)  # type: ignore[union-attr]
+    def _tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
+        return self.tokenizer.encode_batch(documents)  # type:ignore[union-attr]

     def onnx_embed(
         self,
         documents: list[str],
         **kwargs: Any,
     ) -> OnnxOutputContext:
-        encoded = self.tokenize(documents, **kwargs)
+        encoded = self._tokenize(documents, **kwargs)
         input_ids = np.array([e.ids for e in encoded])
         attention_mask = np.array([e.attention_mask for e in encoded])
         input_names = {node.name for node in self.model.get_inputs()}  # type: ignore[union-attr]
```
**Comment on lines +71 to +72**

Forward `kwargs` to enable tokenization customization. As in the previous file, the `**kwargs` parameter is accepted by `_tokenize` but never forwarded to `encode_batch`, leaving it flagged by Ruff as unused.

Apply this diff:

```diff
 def _tokenize(self, documents: list[str], **kwargs: Any) -> list[Encoding]:
-    return self.tokenizer.encode_batch(documents)  # type:ignore[union-attr]
+    return self.tokenizer.encode_batch(documents, **kwargs)  # type:ignore[union-attr]
```

🧰 Ruff (0.14.3): `71-71: Unused method argument (ARG002)`
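As a sketch of what forwarding `**kwargs` would enable, assuming the Hugging Face `tokenizers` API that these classes wrap: `Tokenizer.encode_batch` already accepts keyword arguments such as `add_special_tokens` and `is_pretokenized`, so forwarded kwargs would reach them directly. The model name below is an assumption for illustration:

```python
# Once **kwargs reaches encode_batch, callers can pass the keyword
# arguments tokenizers.Tokenizer.encode_batch already accepts.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")  # assumed model

with_special = tokenizer.encode_batch(["query one", "query two"])
without_special = tokenizer.encode_batch(
    ["query one", "query two"],
    add_special_tokens=False,  # e.g. when a caller adds markers itself
)
print(with_special[0].tokens)     # ['[CLS]', 'query', 'one', '[SEP]']
print(without_special[0].tokens)  # ['query', 'one']
```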
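For readers unfamiliar with the surrounding code, a small sketch of how `onnx_embed` turns the `Encoding` list into ONNX inputs. The `Encoding` objects are mocked here rather than taken from fastembed, and the batch is assumed to be padded to equal length, as a tokenizer with padding enabled would produce:

```python
# Mocked stand-in for tokenizers.Encoding, exposing only the two
# attributes the diff reads (ids and attention_mask).
import numpy as np


class FakeEncoding:
    def __init__(self, ids: list[int], attention_mask: list[int]) -> None:
        self.ids = ids
        self.attention_mask = attention_mask


encoded = [
    FakeEncoding([101, 7592, 102], [1, 1, 1]),
    FakeEncoding([101, 2088, 102], [1, 1, 1]),
]

# Same shape-building as in onnx_embed: one row per document.
input_ids = np.array([e.ids for e in encoded])
attention_mask = np.array([e.attention_mask for e in encoded])
print(input_ids.shape)  # (2, 3)
```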