Commit 6231290

Refactor/modules dumping and loading (#93)
* outline the idea
* refactor a little bit and add function implementations
* wrap into Dumper class
* stage progress
* refactor vector index dumping and loading
* refactor retrieval embedding module
* refactor embedder dumping and loading
* stage progress on implementing `_dump_tools.py`
* add TODO
* minor cleaning
* refactor mlknn scorer
* minor cleaning
* remove vector index client
* remove vector index client from modules implementation
* add proper dumping and loading for list[Tag]
* `NLITransformer` -> `CrossEncoder`
* add cross encoder handling
* DescriptionScorer: remove load/dump methods; properly define class attributes
* fix codestyle and typing
* DNNCScorer: proper attributes names
* DescriptionScorer: proper attributes names
* KNNScorer: properly define attributes
* some cleaning
* RerankScorer: decouple embedder and cross-encoder params
* minor cleaning
* MLKnnScorer: properly define class attributes
* LinearScorer: remove load/dump methods; properly define class attributes
* finish decoupling embedder and cross encoder params
* minor cleaning
* fix circular import issue
* minor test fix
* remove `db_dir` argument everywhere
* fix codestyle
* some bug fix
* some bug fix
* remove references to vector index client and add cross encoder config to pipeline and context
* add cross_encoder_config to CLI optimization
* `CrossEncoder` -> `Ranker`
1 parent 2bf20ec commit 6231290

44 files changed: +573 -1107 lines changed
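Taken together, these commits replace per-module dump/load implementations with a single shared Dumper (the new autointent/_dump_tools.py below) that serializes a module's attributes by type, rename the cross-encoder abstraction (NLITransformer -> CrossEncoder -> Ranker), and remove the vector index client. A minimal sketch of the resulting pattern; MyScorer is a hypothetical module invented here for illustration:

from pathlib import Path

import numpy as np

from autointent._dump_tools import Dumper


class MyScorer:
    """Hypothetical module: plain attributes, no custom dump/load methods."""

    def __init__(self) -> None:
        self.n_neighbors = 5               # simple attribute, goes to simple_attrs.json
        self.centroids = np.zeros((3, 8))  # ndarray, goes to arrays.npz


scorer = MyScorer()
Dumper.dump(scorer, Path("dumps/my_scorer"))    # writes typed subdirectories and files

restored = MyScorer()
Dumper.load(restored, Path("dumps/my_scorer"))  # repopulates restored.__dict__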

autointent/__init__.py

Lines changed: 4 additions & 1 deletion
@@ -1,9 +1,12 @@
 """This is AutoIntent API reference."""
 
+from ._ranker import Ranker
 from ._embedder import Embedder
+from ._vector_index import VectorIndex
 from ._dataset import Dataset
 from ._hash import Hasher
 from .context import Context
 from ._pipeline import Pipeline
 
-__all__ = ["Context", "Dataset", "Embedder", "Hasher", "Pipeline"]
+
+__all__ = ["Context", "Dataset", "Embedder", "Hasher", "Pipeline", "Ranker", "VectorIndex"]
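With this change, Ranker (the renamed CrossEncoder) and VectorIndex become part of the public API:

from autointent import Embedder, Ranker, VectorIndex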

autointent/_dump_tools.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+import json
+import logging
+from pathlib import Path
+from typing import Any, TypeAlias
+
+import joblib
+import numpy as np
+import numpy.typing as npt
+from sklearn.base import BaseEstimator
+
+from autointent import Embedder, Ranker, VectorIndex
+from autointent.schemas import TagsList
+
+ModuleSimpleAttributes = None | str | int | float | bool | list  # type: ignore[type-arg]
+
+ModuleAttributes: TypeAlias = (
+    ModuleSimpleAttributes | TagsList | np.ndarray | Embedder | VectorIndex | BaseEstimator | Ranker  # type: ignore[type-arg]
+)
+
+logger = logging.getLogger(__name__)
+
+
+class Dumper:
+    tags = "tags"
+    simple_attrs = "simple_attrs.json"
+    arrays = "arrays.npz"
+    embedders = "embedders"
+    indexes = "vector_indexes"
+    estimators = "estimators"
+    cross_encoders = "cross_encoders"
+
+    @staticmethod
+    def make_subdirectories(path: Path) -> None:
+        subdirectories = [
+            path / Dumper.tags,
+            path / Dumper.embedders,
+            path / Dumper.indexes,
+            path / Dumper.estimators,
+            path / Dumper.cross_encoders,
+        ]
+        for subdir in subdirectories:
+            subdir.mkdir(parents=True, exist_ok=True)
+
+    @staticmethod
+    def dump(obj: Any, path: Path) -> None:  # noqa: ANN401
+        """Dump module's attributes to the file system."""
+        attrs: dict[str, ModuleAttributes] = vars(obj)
+        simple_attrs = {}
+        arrays: dict[str, npt.NDArray[Any]] = {}
+
+        Dumper.make_subdirectories(path)
+
+        for key, val in attrs.items():
+            if isinstance(val, TagsList):
+                val.dump(path / Dumper.tags / key)
+            elif isinstance(val, ModuleSimpleAttributes):
+                simple_attrs[key] = val
+            elif isinstance(val, np.ndarray):
+                arrays[key] = val
+            elif isinstance(val, Embedder):
+                val.dump(path / Dumper.embedders / key)
+            elif isinstance(val, VectorIndex):
+                val.dump(path / Dumper.indexes / key)
+            elif isinstance(val, BaseEstimator):
+                joblib.dump(val, path / Dumper.estimators / key)
+            elif isinstance(val, Ranker):
+                val.save(str(path / Dumper.cross_encoders / key))
+            else:
+                msg = f"Attribute {key} of type {type(val)} cannot be dumped to file system."
+                logger.error(msg)
+
+        with (path / Dumper.simple_attrs).open("w") as file:
+            json.dump(simple_attrs, file, ensure_ascii=False, indent=4)
+
+        np.savez(path / Dumper.arrays, allow_pickle=False, **arrays)
+
+    @staticmethod
+    def load(obj: Any, path: Path) -> None:  # noqa: ANN401
+        """Load attributes from the file system."""
+        for child in path.iterdir():
+            if child.name == Dumper.tags:
+                tags = {tags_dump.name: TagsList.load(tags_dump) for tags_dump in child.iterdir()}
+            elif child.name == Dumper.simple_attrs:
+                with child.open() as file:
+                    simple_attrs = json.load(file)
+            elif child.name == Dumper.arrays:
+                arrays = dict(np.load(child))
+            elif child.name == Dumper.embedders:
+                # TODO propagate custom loading params (such as device, batch size, etc.) to this line
+                embedders = {embedder_dump.name: Embedder.load(embedder_dump) for embedder_dump in child.iterdir()}
+            elif child.name == Dumper.indexes:
+                indexes = {index_dump.name: VectorIndex.load(index_dump) for index_dump in child.iterdir()}
+            elif child.name == Dumper.estimators:
+                estimators = {estimator_dump.name: joblib.load(estimator_dump) for estimator_dump in child.iterdir()}
+            elif child.name == Dumper.cross_encoders:
+                cross_encoders = {
+                    cross_encoder_dump.name: Ranker.load(cross_encoder_dump) for cross_encoder_dump in child.iterdir()
+                }
+            else:
+                msg = f"Found unexpected child {child}"
+                logger.error(msg)
+        obj.__dict__.update(tags | simple_attrs | arrays | embedders | indexes | estimators | cross_encoders)
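Dumper.dump always creates every subdirectory and always writes simple_attrs.json and arrays.npz, so Dumper.load can simply iterate path.iterdir() without existence checks. For a module holding one Embedder and one sklearn estimator, the resulting layout would look roughly like this (illustrative; _embedder and _clf are hypothetical attribute names, which Dumper uses as file names):

dumps/my_module/
    simple_attrs.json
    arrays.npz
    tags/
    embedders/
        _embedder/
    vector_indexes/
    estimators/
        _clf
    cross_encoders/

On load, each category is collected into a dict keyed by attribute name, and all dicts are merged into the target object's __dict__ in a single update.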

autointent/_embedder.py

Lines changed: 25 additions & 19 deletions
@@ -37,10 +37,16 @@ def get_embeddings_path(filename: str) -> Path:
 class EmbedderDumpMetadata(TypedDict):
     """Metadata for saving and loading an Embedder instance."""
 
+    model_name_or_path: str
+    """Name of the hugging face model or a local path to sentence transformers dump."""
+    device: str
+    """Torch notation for CPU or CUDA."""
     batch_size: int
     """Batch size used for embedding calculations."""
     max_length: int | None
     """Maximum sequence length for the embedding model."""
+    use_cache: bool
+    """Whether to use embeddings caching."""
 
 
 class Embedder:
@@ -51,12 +57,11 @@ class Embedder:
     embedding models, as well as calculating embeddings for input texts.
     """
 
-    embedder_subdir: str = "sentence_transformers"
     metadata_dict_name: str = "metadata.json"
 
     def __init__(
         self,
-        model_name: str | Path,
+        model_name_or_path: str | Path,
         device: str = "cpu",
         batch_size: int = 32,
         max_length: int | None = None,
@@ -71,16 +76,13 @@ def __init__(
         :param max_length: Maximum sequence length for the embedding model.
         :param use_cache: Flag indicating whether to cache intermediate embeddings.
         """
-        self.model_name = model_name
+        self.model_name = model_name_or_path
         self.device = device
         self.batch_size = batch_size
         self.max_length = max_length
         self.use_cache = use_cache
 
-        if Path(model_name).exists():
-            self.load(model_name)
-        else:
-            self.embedding_model = SentenceTransformer(str(model_name), device=device)
+        self.embedding_model = SentenceTransformer(str(model_name_or_path), device=device)
 
         self.logger = logging.getLogger(__name__)
 
@@ -105,10 +107,7 @@ def clear_ram(self) -> None:
     def delete(self) -> None:
         """Delete the embedding model and its associated directory."""
         self.clear_ram()
-        shutil.rmtree(
-            self.dump_dir,
-            ignore_errors=True,
-        )  # TODO: `ignore_errors=True` is workaround for PermissionError: [WinError 5] Access is denied
+        shutil.rmtree(self.dump_dir)
 
     def dump(self, path: Path) -> None:
         """
@@ -118,28 +117,35 @@ def dump(self, path: Path) -> None:
         """
         self.dump_dir = path
         metadata = EmbedderDumpMetadata(
+            model_name_or_path=str(self.model_name),
+            device=self.device,
             batch_size=self.batch_size,
             max_length=self.max_length,
+            use_cache=self.use_cache,
         )
         path.mkdir(parents=True, exist_ok=True)
-        self.embedding_model.save(str(path / self.embedder_subdir))
         with (path / self.metadata_dict_name).open("w") as file:
             json.dump(metadata, file, indent=4)
 
-    def load(self, path: Path | str) -> None:
+    @classmethod
+    def load(
+        cls, path: Path | str, batch_size: int | None = None, use_cache: bool | None = None, device: str | None = None
+    ) -> "Embedder":
         """
         Load the embedding model and metadata from disk.
 
         :param path: Path to the directory where the model is stored.
         """
-        self.dump_dir = Path(path)
-        path = Path(path)
-        with (path / self.metadata_dict_name).open() as file:
+        with (Path(path) / cls.metadata_dict_name).open() as file:
            metadata: EmbedderDumpMetadata = json.load(file)
-        self.batch_size = metadata["batch_size"]
-        self.max_length = metadata["max_length"]
 
-        self.embedding_model = SentenceTransformer(str(path / self.embedder_subdir), device=self.device)
+        return cls(
+            model_name_or_path=metadata["model_name_or_path"],
+            device=device or metadata["device"],
+            batch_size=batch_size or metadata["batch_size"],
+            max_length=metadata["max_length"],
+            use_cache=use_cache or metadata["use_cache"],
+        )
 
     def embed(self, utterances: list[str]) -> npt.NDArray[np.float32]:
         """

autointent/_pipeline/_cli_endpoint.py

Lines changed: 1 addition & 1 deletion
@@ -27,14 +27,14 @@ def optimize(cfg: OptimizationConfig) -> None:
 
     logger.debug("Run Name: %s", cfg.logs.run_name)
     logger.debug("logs and assets: %s", cfg.logs.dirpath)
-    logger.debug("Vector index path: %s", cfg.vector_index.db_dir)
 
     # create shared objects for a whole pipeline
    context = Context(cfg.seed)
     cfg.logs.clear_ram = True
     context.configure_logging(cfg.logs)
     context.configure_vector_index(cfg.vector_index, cfg.embedder)
     context.configure_data(cfg.data)
+    context.configure_cross_encoder(cfg.cross_encoder)
 
     # run optimization
     search_space_config = load_config(cfg.task.search_space_path, context.is_multilabel(), logger)

autointent/_pipeline/_pipeline.py

Lines changed: 7 additions & 3 deletions
@@ -10,7 +10,7 @@
 import yaml
 
 from autointent import Context, Dataset
-from autointent.configs import EmbedderConfig, InferenceNodeConfig, LoggingConfig, VectorIndexConfig
+from autointent.configs import CrossEncoderConfig, EmbedderConfig, InferenceNodeConfig, LoggingConfig, VectorIndexConfig
 from autointent.custom_types import NodeType
 from autointent.metrics import PREDICTION_METRICS_MULTILABEL
 from autointent.nodes import InferenceNode, NodeOptimizer
@@ -38,11 +38,12 @@ def __init__(
             self.logging_config = LoggingConfig(dump_dir=None)
             self.vector_index_config = VectorIndexConfig()
             self.embedder_config = EmbedderConfig()
+            self.cross_encoder_config = CrossEncoderConfig()
         elif not isinstance(nodes[0], InferenceNode):
             msg = "Pipeline should be initialized with list of NodeOptimizers or InferenceNodes"
             raise TypeError(msg)
 
-    def set_config(self, config: LoggingConfig | VectorIndexConfig | EmbedderConfig) -> None:
+    def set_config(self, config: LoggingConfig | VectorIndexConfig | EmbedderConfig | CrossEncoderConfig) -> None:
         """
         Set configuration for the optimizer.
 
@@ -54,6 +55,8 @@ def set_config(self, config: LoggingConfig | VectorIndexConfig | EmbedderConfig)
             self.vector_index_config = config
         elif isinstance(config, EmbedderConfig):
             self.embedder_config = config
+        elif isinstance(config, CrossEncoderConfig):
+            self.cross_encoder_config = config
         else:
             msg = "unknown config type"
             raise TypeError(msg)
@@ -97,7 +100,7 @@ def _fit(self, context: Context) -> None:
             node_optimizer.fit(context)  # type: ignore[union-attr]
         if not context.vector_index_config.save_db:
             self._logger.info("removing vector database from file system...")
-            context.vector_index_client.delete_db()
+            # TODO clear cache from appdirs
         self.context.callback_handler.end_run()
 
     def _is_inference(self) -> bool:
@@ -124,6 +127,7 @@ def fit(self, dataset: Dataset, force_multilabel: bool = False) -> Context:
         context.set_dataset(dataset, force_multilabel)
         context.configure_logging(self.logging_config)
         context.configure_vector_index(self.vector_index_config, self.embedder_config)
+        context.configure_cross_encoder(self.cross_encoder_config)
 
         self._fit(context)
 