Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 0 additions & 35 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,41 +50,6 @@ make lint

![](assets/dependency-graph.png)

## Настройка логгера
Чтобы видеть debug строчки у вас есть несколько опций:

1. Включить весь debug output через опцию командной строки:
```bash
autointent hydra.verbose=true
```
2. Включить debug output только для определенных модулей, пример для autointent.pipeline.optimization.cli_endpoint и самой hydra:
```bash
autointent hydra.verbose=[hydra,autointent/pipeline/optimization/cli_endpoint] hydra.job_logging.root.level=DEBUG
```

Само конфигурирование логгера сделано в autointent.configs.optimization_cli.logger_config. Вы можете изменить любой параметр логгера через командную строку. Вот пример, как поменять уровень логгера на ERROR:
```bash
autointent hydra.job_logging.root.level=ERROR
```

Еще можно изменить параметры логгера через yaml файлы:
1. Создадим папку с конфигурационными файлами: test_config
2. test_config/config.yaml:
```yaml
defaults:
- optimization_config
- _self_
- override hydra/job_logging: custom

# set your config params for optimization here
embedder_batch_size: 32
```
3. Поместите конфигурацию логгера в test_config/hydra/job_logging/custom.yaml (параметры см. [здесь](https://docs.python.org/3/howto/logging.html))
4. Запускаем с конфиг файлом config.yaml:
```bash
autointent --config-path FULL_PATH/test_config --config-name config
```

## Построение документации

Построить html версию в папке `docs/build`:
Expand Down
68 changes: 0 additions & 68 deletions autointent/_pipeline/_cli_endpoint.py

This file was deleted.

20 changes: 11 additions & 9 deletions autointent/_pipeline/_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,17 @@ class Pipeline:
def __init__(
self,
nodes: list[NodeOptimizer] | list[InferenceNode],
seed: int = 42,
) -> None:
"""
Initialize the pipeline optimizer.

:param nodes: list of nodes
:param seed: random seed
"""
self._logger = logging.getLogger(__name__)
self.nodes = {node.node_type: node for node in nodes}
self.seed = seed

if isinstance(nodes[0], NodeOptimizer):
self.logging_config = LoggingConfig(dump_dir=None)
Expand Down Expand Up @@ -62,7 +65,7 @@ def set_config(self, config: LoggingConfig | VectorIndexConfig | EmbedderConfig
raise TypeError(msg)

@classmethod
def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str) -> "Pipeline":
def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str, seed: int = 42) -> "Pipeline":
"""
Create pipeline optimizer from dictionary search space.

Expand All @@ -71,16 +74,16 @@ def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str) -> "
if isinstance(search_space, Path | str):
search_space = load_search_space(search_space)
nodes = [NodeOptimizer(**node) for node in search_space]
return cls(nodes)
return cls(nodes=nodes, seed=seed)

@classmethod
def default_optimizer(cls, multilabel: bool) -> "Pipeline":
def default_optimizer(cls, multilabel: bool, seed: int = 42) -> "Pipeline":
"""
Create pipeline optimizer with default search space for given classification task.

:param multilabel: Whether the task multi-label, or single-label.
"""
return cls.from_search_space(load_default_search_space(multilabel))
return cls.from_search_space(search_space=load_default_search_space(multilabel), seed=seed)

def _fit(self, context: Context) -> None:
"""
Expand All @@ -91,8 +94,8 @@ def _fit(self, context: Context) -> None:
self.context = context
self._logger.info("starting pipeline optimization...")
self.context.callback_handler.start_run(
run_name=self.context.logging_config.get_run_name(),
dirpath=self.context.logging_config.get_dirpath(),
run_name=self.context.logging_config.run_name,
dirpath=self.context.logging_config.dirpath,
)
for node_type in NodeType:
node_optimizer = self.nodes.get(node_type, None)
Expand All @@ -111,20 +114,19 @@ def _is_inference(self) -> bool:
"""
return isinstance(self.nodes[NodeType.scoring], InferenceNode)

def fit(self, dataset: Dataset, force_multilabel: bool = False) -> Context:
def fit(self, dataset: Dataset) -> Context:
"""
Optimize the pipeline from dataset.

:param dataset: Dataset for optimization
:param force_multilabel: Whether to force multilabel or not
:return: Context
"""
if self._is_inference():
msg = "Pipeline in inference mode cannot be fitted"
raise RuntimeError(msg)

context = Context()
context.set_dataset(dataset, force_multilabel)
context.set_dataset(dataset)
context.configure_logging(self.logging_config)
context.configure_vector_index(self.vector_index_config, self.embedder_config)
context.configure_cross_encoder(self.cross_encoder_config)
Expand Down
4 changes: 1 addition & 3 deletions autointent/configs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
"""Dataclasses for the configuration of the :class:`autointent.Embedder` and other objects."""

from ._inference_node import InferenceNodeConfig
from ._optimization_cli import (
from ._optimization import (
CrossEncoderConfig,
DataConfig,
EmbedderConfig,
LoggingConfig,
OptimizationConfig,
TaskConfig,
VectorIndexConfig,
)
Expand All @@ -18,7 +17,6 @@
"InferenceNodeConfig",
"InferenceNodeConfig",
"LoggingConfig",
"OptimizationConfig",
"TaskConfig",
"VectorIndexConfig",
]
98 changes: 98 additions & 0 deletions autointent/configs/_optimization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""Configuration for the optimization process."""

from pathlib import Path

from pydantic import BaseModel, Field

from ._name import get_run_name


class DataConfig(BaseModel):
    """
    Configuration for the data used in the optimization process.

    Holds the location of the training dataset consumed by the pipeline optimizer.
    """

    # NOTE(review): "HF repo" presumably means a Hugging Face dataset repo id — confirm against the loader.
    train_path: str | Path
    """Path to the training data. Can be local path or HF repo."""


class TaskConfig(BaseModel):
    """
    Configuration for the task to optimize.

    Selects which search space the optimizer explores; leaving the path unset
    means the built-in default search space is used.
    """

    search_space_path: Path | None = None
    """Path to the search space configuration file. If None, the default search space will be used"""


class LoggingConfig(BaseModel):
    """
    Configuration for the logging.

    Derives the run output directory (:attr:`dirpath`) and the module-dump
    directory (:attr:`dump_dir`) from ``project_dir`` and ``run_name``.
    """

    project_dir: Path = Field(default_factory=lambda: Path.cwd() / "runs")
    """Path to the directory with different runs."""
    run_name: str = Field(default_factory=get_run_name)
    """Name of the run. A random name is generated by default."""
    dump_modules: bool = False
    """Whether to dump the modules or not"""
    clear_ram: bool = False
    """Whether to clear the RAM after dumping the modules"""
    report_to: list[str] | None = None
    """List of callbacks to report to. If None, no callbacks will be used"""

    @property
    def dirpath(self) -> Path:
        """Path to the directory where the logs will be saved."""
        # Computed directly from the model fields. The previous lazy cache
        # assigned an undeclared private attribute (self._dirpath), which
        # pydantic's __setattr__ rejects (v1 raises "object has no field";
        # v2 requires PrivateAttr). Caching buys nothing here anyway: the
        # value is a pure function of two fields.
        return self.project_dir / self.run_name

    @property
    def dump_dir(self) -> Path:
        """Path to the directory where the modules will be dumped."""
        return self.dirpath / "modules_dumps"


class VectorIndexConfig(BaseModel):
    """
    Configuration for the vector index.

    Controls persistence of the vector index built during optimization.
    """

    save_db: bool = False
    """Whether to save the vector index database or not"""


class TransformerConfig(BaseModel):
    """
    Base class for configuration for the transformer.

    Transformer is used under the hood in :py:class:`autointent.Embedder` and :py:class:`autointent.Ranker`.
    """

    batch_size: int = 32
    """Batch size used when encoding inputs with the transformer"""
    max_length: int | None = None
    """Max sequence length for the transformer. If None, the max length will be taken from model config"""
    device: str = "cpu"
    """Device to run the transformer on. Can be 'cpu', 'cuda', 'cuda:0', 'mps', etc."""


class EmbedderConfig(TransformerConfig):
    """
    Configuration for the embedder.

    The embedder is used to embed the data before training the model. These parameters
    will be applied to the embedder used in the optimization process in the vector database.
    Only one model can be used globally.
    """

    use_cache: bool = True
    """Whether to cache embeddings for reuse, improving performance in repeated operations."""


class CrossEncoderConfig(TransformerConfig):
    """
    Configuration for the cross-encoder.

    The cross-encoder (see :py:class:`autointent.Ranker`) scores query/candidate
    pairs during the optimization process. These parameters are applied to the
    cross-encoder used globally; only one model can be used.
    """
    # NOTE: the original docstring was copy-pasted from EmbedderConfig and
    # described the embedder instead of the cross-encoder.

    train_head: bool = False
    """Whether to train the ranking head of a cross encoder."""
Loading
Loading