
Commit 272e1d4: Add embeddings caching (#70)
1 parent f0b7885

14 files changed (+209, -10 lines)

autointent/__init__.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -1,6 +1,7 @@
 from ._embedder import Embedder
+from ._hash import Hasher
 from .context import Context
 from .context.data_handler import Dataset
 from .pipeline import Pipeline
 
-__all__ = ["Context", "Dataset", "Embedder", "Pipeline"]
+__all__ = ["Context", "Dataset", "Embedder", "Hasher", "Pipeline"]
```

autointent/_embedder.py

Lines changed: 53 additions & 2 deletions
```diff
@@ -12,8 +12,27 @@
 
 import numpy as np
 import numpy.typing as npt
+from appdirs import user_cache_dir
 from sentence_transformers import SentenceTransformer
 
+from ._hash import Hasher
+
+
+def get_embeddings_path(filename: str) -> Path:
+    """
+    Get the path to the embeddings file.
+
+    This function constructs the full path to an embeddings file stored
+    in a dedicated directory under the user's cache directory. The embeddings
+    file is named based on the provided filename, with the `.npy` extension
+    added.
+
+    :param filename: The name of the embeddings file (without extension).
+
+    :return: The full path to the embeddings file.
+    """
+    return Path(user_cache_dir("autointent")) / "embeddings" / f"{filename}.npy"
+
 
 class EmbedderDumpMetadata(TypedDict):
     """Metadata for saving and loading an Embedder instance."""
@@ -41,6 +60,7 @@ def __init__(
         device: str = "cpu",
         batch_size: int = 32,
         max_length: int | None = None,
+        use_cache: bool = False,
     ) -> None:
         """
         Initialize the Embedder.
@@ -49,11 +69,13 @@
         :param device: Device to run the model on (e.g., "cpu", "cuda").
         :param batch_size: Batch size for embedding calculations.
         :param max_length: Maximum sequence length for the embedding model.
+        :param use_cache: Flag indicating whether to cache intermediate embeddings.
         """
         self.model_name = model_name
         self.device = device
         self.batch_size = batch_size
         self.max_length = max_length
+        self.use_cache = use_cache
 
         if Path(model_name).exists():
             self.load(model_name)
@@ -62,6 +84,18 @@
 
         self.logger = logging.getLogger(__name__)
 
+    def __hash__(self) -> int:
+        """
+        Compute a hash value for the Embedder.
+
+        :returns: The hash value of the Embedder.
+        """
+        hasher = Hasher()
+        for parameter in self.embedding_model.parameters():
+            hasher.update(parameter.detach().cpu().numpy())
+        hasher.update(self.max_length)
+        return hasher.intdigest()
+
     def clear_ram(self) -> None:
         """Move the embedding model to CPU and delete it from memory."""
         self.logger.debug("Clearing embedder %s from memory", self.model_name)
@@ -114,18 +148,35 @@ def embed(self, utterances: list[str]) -> npt.NDArray[np.float32]:
         :param utterances: List of input texts to calculate embeddings for.
         :return: A numpy array of embeddings.
         """
+        if self.use_cache:
+            hasher = Hasher()
+            hasher.update(self)
+            hasher.update(utterances)
+
+            embeddings_path = get_embeddings_path(hasher.hexdigest())
+            if embeddings_path.exists():
+                return np.load(embeddings_path)  # type: ignore[no-any-return]
+
         self.logger.debug(
             "Calculating embeddings with model %s, batch_size=%d, max_seq_length=%s, device=%s",
             self.model_name,
             self.batch_size,
             str(self.max_length),
             self.device,
         )
+
         if self.max_length is not None:
             self.embedding_model.max_seq_length = self.max_length
-        return self.embedding_model.encode(
+
+        embeddings = self.embedding_model.encode(
             utterances,
             convert_to_numpy=True,
             batch_size=self.batch_size,
             normalize_embeddings=True,
-        )  # type: ignore[return-value]
+        )
+
+        if self.use_cache:
+            embeddings_path.parent.mkdir(parents=True, exist_ok=True)
+            np.save(embeddings_path, embeddings)
+
+        return embeddings  # type: ignore[return-value]
```
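Taken together, the new `use_cache` path keys the on-disk cache by a hash of the model weights, `max_length`, and the input utterances, so a repeated `embed` call with identical inputs is served from a `.npy` file instead of being recomputed. A minimal sketch of that behavior, using an arbitrary sentence-transformers checkpoint for illustration:

```python
from autointent import Embedder

# Any sentence-transformers checkpoint works here; this one is just an example.
embedder = Embedder(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    device="cpu",
    use_cache=True,
)

utterances = ["turn on the lights", "what's the weather like"]

# First call: embeddings are computed and written under
# user_cache_dir("autointent")/embeddings/<hexdigest>.npy.
first = embedder.embed(utterances)

# Second call with identical inputs: the hash matches, the file exists,
# and the embeddings are loaded from disk instead of recomputed.
second = embedder.embed(utterances)
```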

autointent/_hash.py

Lines changed: 72 additions & 0 deletions
```diff
@@ -0,0 +1,72 @@
+"""This module provides functionality for hashing data using the xxhash algorithm."""
+
+import pickle
+from typing import Any
+
+import xxhash
+
+
+class Hasher:
+    """
+    A class that provides methods for hashing data using xxhash.
+
+    This class supports both a class-level method for generating hashes from
+    any given value, as well as an instance-level method for progressively
+    updating a hash state with new values.
+    """
+
+    def __init__(self) -> None:
+        """
+        Initialize the Hasher instance and set up the internal xxhash state.
+
+        This state will be used for progressively hashing values using the
+        `update` method and obtaining the final digest using `hexdigest`.
+        """
+        self._state = xxhash.xxh64()
+
+    @classmethod
+    def hash(cls, value: Any) -> int:  # noqa: ANN401
+        """
+        Generate a hash for the given value using xxhash.
+
+        :param value: The value to be hashed. This can be any Python object.
+
+        :return: The resulting hash digest as an integer.
+        """
+        if hasattr(value, "__hash__") and value.__hash__ not in {None, object.__hash__}:
+            return hash(value)
+        return xxhash.xxh64(pickle.dumps(value)).intdigest()
+
+    def update(self, value: Any) -> None:  # noqa: ANN401
+        """
+        Update the internal hash state with the provided value.
+
+        This method will first hash the type of the value, then hash the value
+        itself, and update the internal state accordingly.
+
+        :param value: The value to update the hash state with.
+        """
+        self._state.update(str(type(value)).encode())
+        self._state.update(str(self.hash(value)).encode())
+
+    def hexdigest(self) -> str:
+        """
+        Return the current hash digest as a hexadecimal string.
+
+        This method should be called after one or more `update` calls to get
+        the final hash result.
+
+        :return: The resulting hash digest as a hexadecimal string.
+        """
+        return self._state.hexdigest()
+
+    def intdigest(self) -> int:
+        """
+        Return the current hash digest as an integer.
+
+        This method should be called after one or more `update` calls to get
+        the final hash result.
+
+        :return: The resulting hash digest as an integer.
+        """
+        return self._state.intdigest()
```
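The class supports both usage modes described in its docstring: one-shot hashing via the classmethod and incremental hashing via `update`. A short sketch with arbitrarily chosen values:

```python
from autointent import Hasher

# One-shot hashing: objects with a real __hash__ use Python's built-in hash();
# everything else (lists, dicts, numpy arrays, ...) is pickled and fed to xxh64.
Hasher.hash("a string")         # delegates to the built-in hash()
Hasher.hash(["a", "list", 42])  # list has no __hash__, so pickled + xxh64

# Incremental hashing: each update() mixes in the value's type and its hash.
hasher = Hasher()
hasher.update("sentence-transformers/all-MiniLM-L6-v2")
hasher.update(["turn on the lights", "what's the weather like"])
print(hasher.hexdigest())  # hex string; Embedder uses this as the cache filename
print(hasher.intdigest())  # the same state digested as an integer
```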

autointent/configs/_optimization_cli.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -107,6 +107,8 @@ class EmbedderConfig:
     """Batch size for the embedder"""
     max_length: int | None = None
     """Max length for the embedder. If None, the max length will be taken from model config"""
+    use_cache: bool = False
+    """Flag indicating whether to cache embeddings for reuse, improving performance in repeated operations."""
 
 
 @dataclass
```
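With the new field in place, caching can be switched on at configuration time. A hypothetical sketch, assuming `EmbedderConfig` is importable from `autointent.configs` and that fields not shown in this diff have defaults:

```python
from autointent.configs import EmbedderConfig

# Only the fields visible in this diff are spelled out; any other
# EmbedderConfig fields are assumed to keep their default values.
embedder_config = EmbedderConfig(
    batch_size=32,
    max_length=None,
    use_cache=True,  # the new flag: persist embeddings to the on-disk cache
)
```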

autointent/context/_context.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -68,6 +68,7 @@ def configure_vector_index(self, config: VectorIndexConfig, embedder_config: Emb
             self.vector_index_config.db_dir,
             self.embedder_config.batch_size,
             self.embedder_config.max_length,
+            self.embedder_config.use_cache,
         )
 
     def configure_data(self, config: DataConfig) -> None:
@@ -190,6 +191,14 @@ def get_max_length(self) -> int | None:
         """
         return self.vector_index_client.embedder_max_length
 
+    def get_use_cache(self) -> bool:
+        """
+        Check if caching is enabled for the embedder.
+
+        :return: True if caching is enabled, False otherwise.
+        """
+        return self.vector_index_client.embedder_use_cache
+
     def get_dump_dir(self) -> Path | None:
         """
         Get the directory for saving dumped modules.
```

autointent/context/vector_index_client/_vector_index.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -31,6 +31,7 @@
         device: str,
         embedder_batch_size: int = 32,
         embedder_max_length: int | None = None,
+        embedder_use_cache: bool = False,
     ) -> None:
         """
         Initialize the vector index.
@@ -39,13 +40,15 @@
         :param device: Device for running the embedding model (e.g., "cpu", "cuda").
         :param embedder_batch_size: Batch size for the embedder.
         :param embedder_max_length: Maximum sequence length for the embedder.
+        :param embedder_use_cache: Flag indicating whether to cache intermediate embeddings.
         """
         self.model_name = model_name
         self.embedder = Embedder(
             model_name=model_name,
             batch_size=embedder_batch_size,
             device=device,
             max_length=embedder_max_length,
+            use_cache=embedder_use_cache,
         )
         self.device = device
 
```

autointent/context/vector_index_client/_vector_index_client.py

Lines changed: 17 additions & 2 deletions
```diff
@@ -32,6 +32,7 @@
         db_dir: str | Path | None,
         embedder_batch_size: int = 32,
         embedder_max_length: int | None = None,
+        embedder_use_cache: bool = False,
     ) -> None:
         """
         Initialize the VectorIndexClient.
@@ -40,12 +41,14 @@
         :param db_dir: Directory for storing vector indexes. Defaults to a cache directory.
         :param embedder_batch_size: Batch size for the embedding model.
         :param embedder_max_length: Maximum sequence length for the embedding model.
+        :param embedder_use_cache: Flag indicating whether to cache intermediate embeddings.
         """
         self._logger = logging.getLogger(__name__)
         self.device = device
         self.db_dir = get_db_dir(db_dir)
         self.embedder_batch_size = embedder_batch_size
         self.embedder_max_length = embedder_max_length
+        self.embedder_use_cache = embedder_use_cache
 
     def create_index(
         self,
@@ -64,7 +67,13 @@
         """
         self._logger.info("Creating index for model: %s", model_name)
 
-        index = VectorIndex(model_name, self.device, self.embedder_batch_size, self.embedder_max_length)
+        index = VectorIndex(
+            model_name,
+            self.device,
+            self.embedder_batch_size,
+            self.embedder_max_length,
+            self.embedder_use_cache,
+        )
         if utterances is not None and labels is not None:
             index.add(utterances, labels)
         self.dump(index)
@@ -165,7 +174,13 @@ def get_index(self, model_name: str) -> VectorIndex:
         """
         dirpath = self._get_index_dirpath(model_name)
         if dirpath is not None:
-            index = VectorIndex(model_name, self.device, self.embedder_batch_size, self.embedder_max_length)
+            index = VectorIndex(
+                model_name,
+                self.device,
+                self.embedder_batch_size,
+                self.embedder_max_length,
+                self.embedder_use_cache,
+            )
             index.load(dirpath)
             return index
 
```
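The client stores the flag once and now forwards it to every `VectorIndex` it constructs, in both `create_index` and `get_index`. A sketch of direct use, assuming `VectorIndexClient` is exported from the `vector_index_client` package and that integer labels are valid here:

```python
from autointent.context.vector_index_client import VectorIndexClient

client = VectorIndexClient(
    device="cpu",
    db_dir=None,  # None falls back to the default cache directory via get_db_dir
    embedder_use_cache=True,
)

# The flag travels with the client: this index's Embedder is constructed
# with use_cache=True and reuses on-disk embeddings when it can.
index = client.create_index(
    "sentence-transformers/all-MiniLM-L6-v2",
    ["turn on the lights", "what's the weather like"],
    [0, 1],
)
```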

autointent/modules/retrieval/_vectordb.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -70,6 +70,7 @@
         device: str = "cpu",
         batch_size: int = 32,
         max_length: int | None = None,
+        embedder_use_cache: bool = False,
     ) -> None:
         """
         Initialize the VectorDBModule.
@@ -80,12 +81,14 @@
         :param device: Device to run operations on, e.g., "cpu" or "cuda".
         :param batch_size: Batch size for embedding generation.
         :param max_length: Maximum sequence length for embeddings. None if not set.
+        :param embedder_use_cache: Flag indicating whether to cache intermediate embeddings.
         """
         self.embedder_name = embedder_name
         self.device = device
         self._db_dir = db_dir
         self.batch_size = batch_size
         self.max_length = max_length
+        self.embedder_use_cache = embedder_use_cache
 
         super().__init__(k=k)
 
@@ -111,6 +114,7 @@ def from_context(
             device=context.get_device(),
             batch_size=context.get_batch_size(),
             max_length=context.get_max_length(),
+            embedder_use_cache=context.get_use_cache(),
         )
 
     @property
@@ -136,6 +140,7 @@ def fit(self, utterances: list[str], labels: list[LabelType]) -> None:
             self.db_dir,
             embedder_batch_size=self.batch_size,
             embedder_max_length=self.max_length,
+            embedder_use_cache=self.embedder_use_cache,
         )
         self.vector_index = vector_index_client.create_index(self.embedder_name, utterances, labels)
 
@@ -209,6 +214,7 @@ def load(self, path: str) -> None:
             db_dir=self.metadata["db_dir"],
             embedder_batch_size=self.metadata["batch_size"],
             embedder_max_length=self.metadata["max_length"],
+            embedder_use_cache=self.embedder_use_cache,
        )
         self.vector_index = vector_index_client.get_index(self.embedder_name)
 
```