feat: switch LocalEmbedder to sentence-transformers backend (#508)

amaciaszek-dsai · web-flow · commit 4fe30250a906 · 2025-04-17T08:19:39.000Z
diff --git a/packages/ragbits-core/CHANGELOG.md b/packages/ragbits-core/CHANGELOG.md
@@ -7,6 +7,7 @@
 - Add new fusion strategies for the hybrid vector store: RRF and DBSF (#413)
 - move sources from ragbits-document-search to ragbits-core (#496)
 - adding connection check to Azure get_blob_service (#502)
+- modify LocalEmbedder to use sentence-transformers instead of calling torch/transformers directly (#508)
 
 ## 0.13.0 (2025-04-02)
 - Make the score in VectorStoreResult consistent (always bigger is better)
diff --git a/packages/ragbits-core/pyproject.toml b/packages/ragbits-core/pyproject.toml
@@ -51,6 +51,7 @@ chroma = [
     "chromadb>=0.6.3,<1.0.0",
 ]
 local = [
+    "sentence-transformers>=4.0.2,<5.0.0",
     "torch>=2.2.1,<3.0.0",
     "transformers>=4.44.2,<5.0.0",
     "numpy>=1.26.0,<2.0.0"
diff --git a/packages/ragbits-core/src/ragbits/core/embeddings/local.py b/packages/ragbits-core/src/ragbits/core/embeddings/local.py
@@ -1,13 +1,12 @@
-from collections.abc import Iterator
+from dataclasses import field
+from typing import Any
 
 from ragbits.core.audit import trace
 from ragbits.core.embeddings import Embedder
 from ragbits.core.options import Options
 
 try:
-    import torch
-    import torch.nn.functional as F
-    from transformers import AutoModel, AutoTokenizer
+    from sentence_transformers import SentenceTransformer
 
     HAS_LOCAL_EMBEDDINGS = True
 except ImportError:
@@ -19,30 +18,31 @@ class LocalEmbedderOptions(Options):
     Dataclass that represents available call options for the LocalEmbedder client.
     """
 
-    batch_size: int = 1
+    encode_kwargs: dict = field(default_factory=dict)
 
 
 class LocalEmbedder(Embedder[LocalEmbedderOptions]):
     """
     Class for interaction with any encoder available in HuggingFace.
 
-    Note: Local implementation is not dedicated for production. Use it only in experiments / evaluation
+    Note: Local implementation is not dedicated for production. Use it only in experiments / evaluation.
     """
 
     options_cls = LocalEmbedderOptions
 
     def __init__(
         self,
         model_name: str,
-        api_key: str | None = None,
         default_options: LocalEmbedderOptions | None = None,
+        **model_kwargs: Any,  # noqa: ANN401
     ) -> None:
-        """Constructs a new local LLM instance.
+        """
+        Constructs a new local embedder instance.
 
         Args:
             model_name: Name of the model to use.
-            api_key: The API key for Hugging Face authentication.
             default_options: Default options for the embedding model.
+            model_kwargs: Additional arguments to pass to the SentenceTransformer.
 
         Raises:
             ImportError: If the 'local' extra requirements are not installed.
@@ -52,15 +52,12 @@ def __init__(
 
         super().__init__(default_options=default_options)
 
-        self.hf_api_key = api_key
         self.model_name = model_name
-
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model = AutoModel.from_pretrained(self.model_name, token=self.hf_api_key).to(self.device)
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, token=self.hf_api_key)
+        self.model = SentenceTransformer(self.model_name, **model_kwargs)
 
     async def embed_text(self, data: list[str], options: LocalEmbedderOptions | None = None) -> list[list[float]]:
-        """Calls the appropriate encoder endpoint with the given data and options.
+        """
+        Calls the appropriate encoder endpoint with the given data and options.
 
         Args:
             data: List of strings to get embeddings for.
@@ -74,36 +71,7 @@ async def embed_text(self, data: list[str], options: LocalEmbedderOptions | None
             data=data,
             model_name=self.model_name,
             model_obj=repr(self.model),
-            tokenizer=repr(self.tokenizer),
-            device=self.device,
             options=merged_options.dict(),
         ) as outputs:
-            embeddings = []
-            for batch in self._batch(data, merged_options.batch_size):
-                batch_dict = self.tokenizer(
-                    batch,
-                    max_length=self.tokenizer.model_max_length,
-                    padding=True,
-                    truncation=True,
-                    return_tensors="pt",
-                ).to(self.device)
-                with torch.no_grad():
-                    model_outputs = self.model(**batch_dict)
-                    batch_embeddings = self._average_pool(model_outputs.last_hidden_state, batch_dict["attention_mask"])
-                    batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
-                embeddings.extend(batch_embeddings.to("cpu").tolist())
-
-            torch.cuda.empty_cache()
-            outputs.embeddings = embeddings
-        return embeddings
-
-    @staticmethod
-    def _batch(data: list[str], batch_size: int) -> Iterator[list[str]]:
-        length = len(data)
-        for ndx in range(0, length, batch_size):
-            yield data[ndx : min(ndx + batch_size, length)]
-
-    @staticmethod
-    def _average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
-        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+            outputs.embeddings = self.model.encode(data, **merged_options.encode_kwargs).tolist()
+        return outputs.embeddings
diff --git a/packages/ragbits-core/tests/unit/embeddings/test_fastembed.py b/packages/ragbits-core/tests/unit/embeddings/test_fastembed.py
diff --git a/packages/ragbits-core/tests/unit/embeddings/test_local.py b/packages/ragbits-core/tests/unit/embeddings/test_local.py
@@ -0,0 +1,48 @@
+import pickle
+
+import numpy as np
+
+from ragbits.core.embeddings.local import LocalEmbedder, LocalEmbedderOptions
+
+
+async def test_local_embedder_embed_text():
+    embedder = LocalEmbedder("sentence-transformers/all-MiniLM-L6-v2")
+
+    result = await embedder.embed_text(["test text"])
+
+    # Check that embeddings have the expected shape
+    assert len(result) == 1
+    assert len(result[0]) == 384  # This dimension depends on the model
+
+
+async def test_local_embedder_with_custom_encode_kwargs():
+    # Test with custom encode parameters
+    embedder = LocalEmbedder(
+        "BAAI/bge-small-en-v1.5",
+        prompts={
+            "classification": "Classify the following text: ",
+            "retrieval": "Retrieve semantically similar text: ",
+            "clustering": "Identify the topic or theme based on the text: ",
+        },
+    )
+    options = LocalEmbedderOptions(encode_kwargs={"prompt_name": "retrieval"})
+    result = await embedder.embed_text(["test text"], options=options)
+
+    assert len(result) == 1
+    assert len(result[0]) > 0
+
+    embedder = LocalEmbedder("BAAI/bge-small-en-v1.5")
+    result_no_prompt = await embedder.embed_text(["test text"])
+
+    # Check that the embeddings with custom prompt are different from the default ones
+    assert not np.array_equal(result[0], result_no_prompt[0])
+
+
+def test_local_embedder_pickling():
+    embedder = LocalEmbedder("sentence-transformers/all-MiniLM-L6-v2")
+    pickled = pickle.dumps(embedder)
+    unpickled = pickle.loads(pickled)  # noqa: S301
+
+    assert isinstance(unpickled, LocalEmbedder)
+    assert unpickled.model_name == "sentence-transformers/all-MiniLM-L6-v2"
+    assert unpickled.default_options == embedder.default_options
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -51,6 +51,7 @@ chroma = [`
`51`	`51`	`"chromadb>=0.6.3,<1.0.0",`
`52`	`52`	`]`
`53`	`53`	`local = [`
	`54`	`+ "sentence-transformers>=4.0.2,<5.0.0",`
`54`	`55`	`"torch>=2.2.1,<3.0.0",`
`55`	`56`	`"transformers>=4.44.2,<5.0.0",`
`56`	`57`	`"numpy>=1.26.0,<2.0.0"`