Commit 0566f40

resolve conflicts

2 parents a90050c + cca9c0d

34 files changed: +290 −90 lines changed

Makefile

Lines changed: 6 additions & 0 deletions
@@ -38,5 +38,11 @@ test-docs: docs
 serve-docs: docs
 	$(poetry) python -m http.server -d docs/build/html 8333
 
+.PHONY: clean-docs
+clean-docs:
+	rm -rf docs/build
+	rm -rf docs/source/autoapi
+	rm -rf docs/source/tutorials
+
 .PHONY: all
 all: lint
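
With this target in place, wiping all generated documentation (the Sphinx build plus the generated autoapi and tutorials sources) before a fresh build is a one-liner; `docs` is the existing build target that `test-docs` and `serve-docs` already depend on:

make clean-docs docs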

autointent/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 from ._embedder import Embedder
 from ._dataset import Dataset
+from ._hash import Hasher
 from .context import Context
 from ._pipeline import Pipeline
 
-__all__ = ["Context", "Dataset", "Embedder", "Pipeline"]
+__all__ = ["Context", "Dataset", "Embedder", "Hasher", "Pipeline"]

autointent/_dataset/_dataset.py

Lines changed: 2 additions & 3 deletions
@@ -7,7 +7,6 @@
 
 from datasets import ClassLabel, Sequence, concatenate_datasets, get_dataset_config_names, load_dataset
 from datasets import Dataset as HFDataset
-from typing_extensions import Self
 
 from autointent.custom_types import LabelType, Split
 from autointent.schemas import Intent, Tag
@@ -122,7 +121,7 @@ def dump(self) -> dict[str, list[dict[str, Any]]]:
         """
         return {split_name: split.to_list() for split_name, split in self.items()}
 
-    def encode_labels(self) -> Self:
+    def encode_labels(self) -> "Dataset":
         """
         Encode dataset labels into one-hot or multilabel format.
 
@@ -133,7 +132,7 @@ def encode_labels(self) -> Self:
         self._encoded_labels = True
         return self
 
-    def to_multilabel(self) -> Self:
+    def to_multilabel(self) -> "Dataset":
         """
         Convert dataset labels to multilabel format.
 
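
The `Self` → string forward reference swap above repeats in `_validation.py` and `_pipeline/_pipeline.py` below; it drops the runtime import of `typing_extensions`, which was only needed because `typing.Self` first appeared in Python 3.11. A minimal sketch of the pattern (the trivial class body is illustrative, not the real `Dataset`):

# Before: requires the typing_extensions backport on Python < 3.11.
#
#     from typing_extensions import Self
#
#     def encode_labels(self) -> Self: ...

# After: a plain string forward reference, no extra import needed.
class Dataset:
    def encode_labels(self) -> "Dataset":
        return self

One trade-off worth noting: unlike `Self`, the string form pins the return annotation to `Dataset` itself, so type checkers will report the base class even when the method is called on a subclass.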

autointent/_dataset/_validation.py

Lines changed: 3 additions & 4 deletions
@@ -1,7 +1,6 @@
 """File with definitions of DatasetReader and DatasetValidator."""
 
 from pydantic import BaseModel, model_validator
-from typing_extensions import Self
 
 from autointent.schemas import Intent, Sample
 
@@ -21,7 +20,7 @@ class DatasetReader(BaseModel):
     intents: list[Intent] = []
 
     @model_validator(mode="after")
-    def validate_dataset(self) -> Self:
+    def validate_dataset(self) -> "DatasetReader":
         """
         Validate the dataset by ensuring intents and data splits are consistent.
 
@@ -33,7 +32,7 @@ def validate_dataset(self) -> Self:
             self._validate_split(split)
         return self
 
-    def _validate_intents(self) -> Self:
+    def _validate_intents(self) -> "DatasetReader":
         """
         Validate the intents by checking their IDs for sequential order.
 
@@ -52,7 +51,7 @@ def _validate_intents(self) -> Self:
             raise ValueError(message)
         return self
 
-    def _validate_split(self, split: list[Sample]) -> Self:
+    def _validate_split(self, split: list[Sample]) -> "DatasetReader":
         """
         Validate a dataset split to ensure all sample labels reference valid intent IDs.
 

autointent/_embedder.py

Lines changed: 53 additions & 2 deletions
@@ -12,8 +12,27 @@
 
 import numpy as np
 import numpy.typing as npt
+from appdirs import user_cache_dir
 from sentence_transformers import SentenceTransformer
 
+from ._hash import Hasher
+
+
+def get_embeddings_path(filename: str) -> Path:
+    """
+    Get the path to the embeddings file.
+
+    This function constructs the full path to an embeddings file stored
+    in a dedicated directory under the user's cache directory. The embeddings
+    file is named after the provided filename, with the `.npy` extension
+    added.
+
+    :param filename: The name of the embeddings file (without extension).
+
+    :return: The full path to the embeddings file.
+    """
+    return Path(user_cache_dir("autointent")) / "embeddings" / f"{filename}.npy"
+
 
 class EmbedderDumpMetadata(TypedDict):
     """Metadata for saving and loading an Embedder instance."""
@@ -41,6 +60,7 @@ def __init__(
         device: str = "cpu",
         batch_size: int = 32,
         max_length: int | None = None,
+        use_cache: bool = False,
     ) -> None:
         """
         Initialize the Embedder.
@@ -49,11 +69,13 @@
         :param device: Device to run the model on (e.g., "cpu", "cuda").
         :param batch_size: Batch size for embedding calculations.
         :param max_length: Maximum sequence length for the embedding model.
+        :param use_cache: Flag indicating whether to cache intermediate embeddings.
         """
         self.model_name = model_name
         self.device = device
         self.batch_size = batch_size
         self.max_length = max_length
+        self.use_cache = use_cache
 
         if Path(model_name).exists():
             self.load(model_name)
@@ -62,6 +84,18 @@ def __init__(
 
         self.logger = logging.getLogger(__name__)
 
+    def __hash__(self) -> int:
+        """
+        Compute a hash value for the Embedder.
+
+        :returns: The hash value of the Embedder.
+        """
+        hasher = Hasher()
+        for parameter in self.embedding_model.parameters():
+            hasher.update(parameter.detach().cpu().numpy())
+        hasher.update(self.max_length)
+        return hasher.intdigest()
+
     def clear_ram(self) -> None:
         """Move the embedding model to CPU and delete it from memory."""
         self.logger.debug("Clearing embedder %s from memory", self.model_name)
@@ -114,18 +148,35 @@ def embed(self, utterances: list[str]) -> npt.NDArray[np.float32]:
         :param utterances: List of input texts to calculate embeddings for.
         :return: A numpy array of embeddings.
         """
+        if self.use_cache:
+            hasher = Hasher()
+            hasher.update(self)
+            hasher.update(utterances)
+
+            embeddings_path = get_embeddings_path(hasher.hexdigest())
+            if embeddings_path.exists():
+                return np.load(embeddings_path)  # type: ignore[no-any-return]
+
         self.logger.debug(
             "Calculating embeddings with model %s, batch_size=%d, max_seq_length=%s, device=%s",
             self.model_name,
             self.batch_size,
             str(self.max_length),
             self.device,
         )
+
         if self.max_length is not None:
             self.embedding_model.max_seq_length = self.max_length
-        return self.embedding_model.encode(
+
+        embeddings = self.embedding_model.encode(
             utterances,
             convert_to_numpy=True,
             batch_size=self.batch_size,
             normalize_embeddings=True,
-        )  # type: ignore[return-value]
+        )
+
+        if self.use_cache:
+            embeddings_path.parent.mkdir(parents=True, exist_ok=True)
+            np.save(embeddings_path, embeddings)
+
+        return embeddings  # type: ignore[return-value]
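
Taken together, these pieces give `embed()` a transparent on-disk cache: the key combines the model weights and `max_length` (via the new `__hash__`) with the exact utterance list, and hits are served from a `.npy` file under the user cache directory. A minimal usage sketch (the model name is illustrative, and the first call still has to run the model):

from autointent import Embedder

embedder = Embedder(
    "sentence-transformers/all-MiniLM-L6-v2",
    device="cpu",
    use_cache=True,
)

utterances = ["hello world", "goodbye world"]
first = embedder.embed(utterances)   # computed, then saved under <user_cache_dir>/autointent/embeddings/
second = embedder.embed(utterances)  # identical key: loaded straight from the .npy file
assert (first == second).all()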

autointent/_hash.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+"""This module provides functionality for hashing data using the xxhash algorithm."""
+
+import pickle
+from typing import Any
+
+import xxhash
+
+
+class Hasher:
+    """
+    A class that provides methods for hashing data using xxhash.
+
+    This class supports both a class-level method for generating hashes from
+    any given value, as well as an instance-level method for progressively
+    updating a hash state with new values.
+    """
+
+    def __init__(self) -> None:
+        """
+        Initialize the Hasher instance and set up the internal xxhash state.
+
+        This state will be used for progressively hashing values using the
+        `update` method and obtaining the final digest using `hexdigest`.
+        """
+        self._state = xxhash.xxh64()
+
+    @classmethod
+    def hash(cls, value: Any) -> int:  # noqa: ANN401
+        """
+        Generate a hash for the given value using xxhash.
+
+        :param value: The value to be hashed. This can be any Python object.
+
+        :return: The resulting hash digest as an integer.
+        """
+        if hasattr(value, "__hash__") and value.__hash__ not in {None, object.__hash__}:
+            return hash(value)
+        return xxhash.xxh64(pickle.dumps(value)).intdigest()
+
+    def update(self, value: Any) -> None:  # noqa: ANN401
+        """
+        Update the internal hash state with the provided value.
+
+        This method will first hash the type of the value, then hash the value
+        itself, and update the internal state accordingly.
+
+        :param value: The value to update the hash state with.
+        """
+        self._state.update(str(type(value)).encode())
+        self._state.update(str(self.hash(value)).encode())
+
+    def hexdigest(self) -> str:
+        """
+        Return the current hash digest as a hexadecimal string.
+
+        This method should be called after one or more `update` calls to get
+        the final hash result.
+
+        :return: The resulting hash digest as a hexadecimal string.
+        """
+        return self._state.hexdigest()
+
+    def intdigest(self) -> int:
+        """
+        Return the current hash digest as an integer.
+
+        This method should be called after one or more `update` calls to get
+        the final hash result.
+
+        :return: The resulting hash digest as an integer.
+        """
+        return self._state.intdigest()
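
A short sketch of the two hashing modes: values with a real `__hash__` go through Python's built-in `hash()`, everything else (lists, dicts, numpy arrays) is pickled and fed to xxh64, and `update()` also folds in each value's type so that, for example, `1` and `"1"` yield different digests:

from autointent import Hasher

# One-shot hashing: a list has no usable __hash__, so this takes the
# pickle + xxh64 path and is deterministic across processes.
print(Hasher.hash(["hello", 1]))

# Incremental hashing: each update() mixes type + value hash into one state.
hasher = Hasher()
hasher.update(["hello", "world"])
hasher.update(512)
print(hasher.hexdigest())  # hex digest, used above to name embedding cache files
print(hasher.intdigest())  # the same state, as an integer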

autointent/_pipeline/_pipeline.py

Lines changed: 5 additions & 6 deletions
@@ -8,7 +8,6 @@
 import numpy as np
 import numpy.typing as npt
 import yaml
-from typing_extensions import Self
 
 from autointent import Context, Dataset
 from autointent.configs import EmbedderConfig, InferenceNodeConfig, LoggingConfig, VectorIndexConfig
@@ -60,7 +59,7 @@ def set_config(self, config: LoggingConfig | VectorIndexConfig | EmbedderConfig)
         raise TypeError(msg)
 
     @classmethod
-    def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str) -> Self:
+    def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str) -> "Pipeline":
         """
         Create pipeline optimizer from dictionary search space.
 
@@ -73,7 +72,7 @@ def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str) -> S
         return cls(nodes)
 
     @classmethod
-    def default_optimizer(cls, multilabel: bool) -> Self:
+    def default_optimizer(cls, multilabel: bool) -> "Pipeline":
         """
         Create pipeline optimizer with default search space for given classification task.
 
@@ -137,7 +136,7 @@ def fit(self, dataset: Dataset, force_multilabel: bool = False, init_for_inferen
         return context
 
     @classmethod
-    def from_dict_config(cls, nodes_configs: list[dict[str, Any]]) -> Self:
+    def from_dict_config(cls, nodes_configs: list[dict[str, Any]]) -> "Pipeline":
         """
         Create inference pipeline from dictionary config.
 
@@ -147,7 +146,7 @@ def from_dict_config(cls, nodes_configs: list[dict[str, Any]]) -> Self:
         return cls.from_config([InferenceNodeConfig(**cfg) for cfg in nodes_configs])
 
     @classmethod
-    def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> Self:
+    def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> "Pipeline":
         """
         Create inference pipeline from config.
 
@@ -157,7 +156,7 @@ def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> Self:
         return cls(nodes)
 
     @classmethod
-    def load(cls, path: str | Path) -> Self:
+    def load(cls, path: str | Path) -> "Pipeline":
         """
         Load pipeline in inference mode.
 

autointent/configs/_optimization_cli.py

Lines changed: 2 additions & 0 deletions
@@ -107,6 +107,8 @@ class EmbedderConfig:
     """Batch size for the embedder"""
     max_length: int | None = None
     """Max length for the embedder. If None, the max length will be taken from model config"""
+    use_cache: bool = False
+    """Flag indicating whether to cache embeddings for reuse, improving performance in repeated operations."""
 
 
 @dataclass
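
A hedged sketch of turning the flag on through the config layer, assuming `EmbedderConfig`'s remaining fields all have defaults; `set_config` accepting an `EmbedderConfig` is visible in the `_pipeline.py` diff above:

from autointent import Pipeline
from autointent.configs import EmbedderConfig

pipeline = Pipeline.default_optimizer(multilabel=False)
pipeline.set_config(EmbedderConfig(use_cache=True))  # embeddings are now cached across repeated embed() calls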

autointent/context/_context.py

Lines changed: 9 additions & 0 deletions
@@ -69,6 +69,7 @@ def configure_vector_index(self, config: VectorIndexConfig, embedder_config: Emb
             self.vector_index_config.db_dir,
             self.embedder_config.batch_size,
             self.embedder_config.max_length,
+            self.embedder_config.use_cache,
         )
 
     def configure_data(self, config: DataConfig) -> None:
@@ -189,6 +190,14 @@ def get_max_length(self) -> int | None:
         """
         return self.vector_index_client.embedder_max_length
 
+    def get_use_cache(self) -> bool:
+        """
+        Check if caching is enabled for the embedder.
+
+        :return: True if caching is enabled, False otherwise.
+        """
+        return self.vector_index_client.embedder_use_cache
+
     def get_dump_dir(self) -> Path | None:
         """
         Get the directory for saving dumped modules.

autointent/context/vector_index_client/_vector_index.py

Lines changed: 3 additions & 0 deletions
@@ -31,6 +31,7 @@ def __init__(
         device: str,
         embedder_batch_size: int = 32,
         embedder_max_length: int | None = None,
+        embedder_use_cache: bool = False,
     ) -> None:
         """
         Initialize the vector index.
@@ -39,13 +40,15 @@ def __init__(
         :param device: Device for running the embedding model (e.g., "cpu", "cuda").
         :param embedder_batch_size: Batch size for the embedder.
         :param embedder_max_length: Maximum sequence length for the embedder.
+        :param embedder_use_cache: Flag indicating whether to cache intermediate embeddings.
         """
         self.model_name = model_name
         self.embedder = Embedder(
             model_name=model_name,
             batch_size=embedder_batch_size,
             device=device,
             max_length=embedder_max_length,
+            use_cache=embedder_use_cache,
         )
         self.device = device
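
The flag is thus threaded through the full stack (EmbedderConfig → Context → vector index → Embedder), so enabling it at any level reaches the underlying model. A sketch at this level, assuming the class defined in `_vector_index.py` is named `VectorIndex` and takes no other required arguments:

# Import path and class name are assumptions based on the file path above.
from autointent.context.vector_index_client._vector_index import VectorIndex

index = VectorIndex(
    model_name="sentence-transformers/all-MiniLM-L6-v2",  # illustrative model
    device="cpu",
    embedder_use_cache=True,  # forwarded to Embedder(use_cache=True)
)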
