deeppavlov
diff --git a/‎autointent/__init__.py‎
Lines changed: 3 additions & 1 deletion b/‎autointent/__init__.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎autointent/nodes/nodes_info/_retrieval.py‎
Lines changed: 1 addition & 1 deletion b/‎autointent/nodes/nodes_info/_retrieval.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎autointent/pipeline/inference/_inference_pipeline.py‎
Lines changed: 8 additions & 0 deletions b/‎autointent/pipeline/inference/_inference_pipeline.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎autointent/pipeline/optimization/_cli_endpoint.py‎
Lines changed: 2 additions & 2 deletions b/‎autointent/pipeline/optimization/_cli_endpoint.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎autointent/pipeline/optimization/_pipeline_optimizer.py‎
Lines changed: 16 additions & 6 deletions b/‎autointent/pipeline/optimization/_pipeline_optimizer.py‎
Lines changed: 16 additions & 6 deletions
diff --git a/‎autointent/utils.py‎
Lines changed: 19 additions & 0 deletions b/‎autointent/utils.py‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎docs/source/guides/search_space_configuration.rst‎
Lines changed: 168 additions & 1 deletion b/‎docs/source/guides/search_space_configuration.rst‎
Lines changed: 168 additions & 1 deletion
diff --git a/‎docs/source/index.rst‎
Lines changed: 18 additions & 12 deletions b/‎docs/source/index.rst‎
Lines changed: 18 additions & 12 deletions
diff --git a/‎docs/source/learn/greedy_optimization.rst‎
Lines changed: 0 additions & 4 deletions b/‎docs/source/learn/greedy_optimization.rst‎
Lines changed: 0 additions & 4 deletions
diff --git a/‎docs/source/learn/optimization.rst‎
Lines changed: 46 additions & 0 deletions b/‎docs/source/learn/optimization.rst‎
Lines changed: 46 additions & 0 deletions
@@ -1,4 +1,6 @@
 from ._embedder import Embedder
 from .context import Context
+from .context.data_handler import Dataset
+from .pipeline import InferencePipeline, PipelineOptimizer
 
-__all__ = ["Context", "Embedder"]
+__all__ = ["Context", "Dataset", "Embedder", "InferencePipeline", "PipelineOptimizer"]
@@ -22,7 +22,7 @@ class RetrievalNodeInfo(NodeInfo):
     )
 
     modules_available: ClassVar[Mapping[str, type[Module]]] = (
-        RETRIEVAL_MODULES_MULTICLASS | RETRIEVAL_MODULES_MULTILABEL
+        RETRIEVAL_MODULES_MULTICLASS | RETRIEVAL_MODULES_MULTILABEL  # type: ignore[has-type]
     )
 
     node_type = NodeType.retrieval
@@ -1,7 +1,9 @@
 """Inference pipeline for prediction."""
 
+from pathlib import Path
 from typing import Any
 
+import yaml
 from pydantic import BaseModel
 from typing_extensions import Self
 
@@ -63,6 +65,12 @@ def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> Self:
         nodes = [InferenceNode.from_config(cfg) for cfg in nodes_configs]
         return cls(nodes)
 
+    @classmethod
+    def load(cls, path: str | Path) -> Self:
+        """
+        Load an inference pipeline from a directory on disk.
+
+        :param path: Directory that contains the ``inference_config.yaml`` file
+        :return: Pipeline built from the ``nodes_configs`` entry of that file
+        """
+        # Read as UTF-8 explicitly so loading does not depend on the platform default encoding.
+        with (Path(path) / "inference_config.yaml").open(encoding="utf-8") as file:
+            inference_dict_config = yaml.safe_load(file)
+        return cls.from_dict_config(inference_dict_config["nodes_configs"])
+
     def predict(self, utterances: list[str]) -> list[LabelType]:
         """
         Predict the labels for the utterances.
 
@@ -33,8 +33,8 @@ def main(cfg: OptimizationConfig) -> None:
 
     # run optimization
     search_space_config = load_config(cfg.task.search_space_path, context.is_multilabel(), logger)
-    pipeline = PipelineOptimizer.from_dict_config(search_space_config)
-    pipeline.optimize(context)
+    pipeline = PipelineOptimizer.from_dict(search_space_config)
+    pipeline._fit(context)  # noqa: SLF001
 
     # save results
     context.dump()
@@ -14,6 +14,7 @@
 from autointent.context.data_handler import Dataset
 from autointent.custom_types import NodeType
 from autointent.nodes import NodeOptimizer
+from autointent.utils import load_default_search_space
 
 
 class PipelineOptimizer:
@@ -52,15 +53,24 @@ def set_config(self, config: LoggingConfig | VectorIndexConfig | EmbedderConfig)
             raise TypeError(msg)
 
     @classmethod
-    def from_dict_config(cls, config: dict[str, Any]) -> Self:
+    def from_dict(cls, search_space: dict[str, Any]) -> Self:
         """
-        Create pipeline optimizer from dictionary config.
+        Create pipeline optimizer from dictionary search space.
 
         :param search_space: Dictionary search space
         """
-        return instantiate(PipelineOptimizerConfig, **config)  # type: ignore[no-any-return]
+        return instantiate(PipelineOptimizerConfig, **search_space)  # type: ignore[no-any-return]
 
-    def optimize(self, context: Context) -> None:
+    @classmethod
+    def default(cls, multilabel: bool) -> Self:
+        """
+        Create pipeline optimizer with default search space for given classification task.
+
+        :param multilabel: Whether the task is multi-label or single-label.
+        """
+        return cls.from_dict(load_default_search_space(multilabel))
+
+    def _fit(self, context: Context) -> None:
         """
         Optimize the pipeline.
 
@@ -74,7 +84,7 @@ def optimize(self, context: Context) -> None:
             self._logger.info("removing vector database from file system...")
             context.vector_index_client.delete_db()
 
-    def optimize_from_dataset(self, dataset: Dataset, force_multilabel: bool = False) -> Context:
+    def fit(self, dataset: Dataset, force_multilabel: bool = False) -> Context:
         """
         Optimize the pipeline from dataset.
 
@@ -87,7 +97,7 @@ def optimize_from_dataset(self, dataset: Dataset, force_multilabel: bool = False
         context.configure_logging(self.logging_config)
         context.configure_vector_index(self.vector_index_config, self.embedder_config)
 
-        self.optimize(context)
+        self._fit(context)
         self.inference_config = context.optimization_info.get_inference_nodes_config()
         return context
 
 
@@ -0,0 +1,19 @@
+"""AutoIntent utilities."""
+
+import importlib.resources as ires
+from typing import Any
+
+import yaml
+
+
+def load_default_search_space(multilabel: bool) -> dict[str, Any]:
+    """
+    Load the default search space bundled with the package.
+
+    :param multilabel: Whether to load the multi-label config instead of the multi-class one
+    :return: Search space parsed from the packaged YAML file
+    """
+    config_name = "default-multilabel-config.yaml" if multilabel else "default-multiclass-config.yaml"
+    # Read as UTF-8 explicitly so parsing does not depend on the platform default encoding.
+    with ires.files("autointent.datafiles").joinpath(config_name).open(encoding="utf-8") as file:
+        return yaml.safe_load(file)  # type: ignore[no-any-return]
@@ -1,4 +1,171 @@
 Search Space Configuration
 ==========================
 
-В этом гайде вы узнаете как настраивать кастомное пространство поиска гипепараметров.
+In this guide, you will learn how to configure a custom hyperparameter search space.
+
+Python API
+##########
+
+.. note::
+
+    Before reading this guide, we recommend familiarizing yourself with the sections :doc:`../concepts` and :doc:`../learn/optimization`.
+
+Optimization Module
+-------------------
+
+To set up the optimization module, you need to create the following dictionary:
+
+.. code-block:: python
+
+    knn_module = {
+        "module_type": "knn",
+        "k": [1, 5, 10, 50],
+        "embedder_name": [
+            "avsolatorio/GIST-small-Embedding-v0",
+            "infgrad/stella-base-en-v2"
+        ]
+    }
+
+The ``module_type`` field specifies the name of the module. You can find the names, for example, in :py:data:`autointent.modules.SCORING_MODULES_MULTICLASS`.
+
+All fields except ``module_type`` are lists that define the search space for each hyperparameter. If you omit them, the default set of hyperparameters will be used during auto-configuration:
+
+.. code-block:: python
+
+    linear_module = {"module_type": "linear"}
+
+Optimization Node
+-----------------
+
+To set up the optimization node, you need to create a list of modules and specify the metric for optimization:
+
+.. code-block:: python
+
+    scoring_node = {
+        "node_type": "scoring",
+        "metric": "scoring_roc_auc",
+        "search_space": [
+            knn_module,
+            linear_module,
+        ]
+    }
+
+Search Space
+------------
+
+The search space for the entire pipeline looks approximately like this:
+
+.. code-block:: python
+
+    search_space = [
+        {
+            "node_type": "retrieval",
+            "metric": "retrieval_hit_rate",
+            "search_space": [
+                {
+                    "module_type": "vector_db",
+                    "k": [10],
+                    "embedder_name": [
+                        "avsolatorio/GIST-small-Embedding-v0",
+                        "infgrad/stella-base-en-v2"
+                    ]
+                }
+            ]
+        },
+        {
+            "node_type": "scoring",
+            "metric": "scoring_roc_auc",
+            "search_space": [
+                {
+                    "module_type": "knn",
+                    "k": [1, 3, 5, 10],
+                    "weights": ["uniform", "distance", "closest"]
+                },
+                {
+                    "module_type": "linear"
+                },
+                {
+                    "module_type": "dnnc",
+                    "cross_encoder_name": [
+                        "BAAI/bge-reranker-base",
+                        "cross-encoder/ms-marco-MiniLM-L-6-v2"
+                    ],
+                    "k": [1, 3, 5, 10]
+                }
+            ]
+        },
+        {
+            "node_type": "prediction",
+            "metric": "prediction_accuracy",
+            "search_space": [
+                {
+                    "module_type": "threshold",
+                    "thresh": [0.5]
+                },
+                {
+                    "module_type": "argmax"
+                }
+            ]
+        }
+    ]
+
+Start Auto Configuration
+------------------------
+
+.. code-block:: python
+
+    from autointent.pipeline import PipelineOptimizer
+
+    pipeline_optimizer = PipelineOptimizer.from_dict(search_space)
+    pipeline_optimizer.fit(dataset)
+
+CLI
+###
+
+YAML Format
+-----------
+
+YAML (YAML Ain't Markup Language) is a human-readable data serialization standard that is often used for configuration files and data exchange between languages with different data structures. It serves similar purposes as JSON but is much easier to read.
+
+Here's an example YAML file:
+
+.. code-block:: yaml
+
+    database:
+      host: localhost
+      port: 5432
+      username: admin
+      # this is a comment
+      password: secret
+
+    counts:
+    - 10
+    - 20
+    - 30
+
+    literal_counts: [10, 20, 30]
+
+    users:
+    - name: Alice
+      age: 30
+      email: alice@example.com
+    - name: Bob
+      age: 25
+      email: bob@example.com
+
+    settings:
+    debug: true
+    timeout: 30
+
+Explanation:
+
+- the whole file represents a dictionary with keys ``database``, ``counts``, ``literal_counts``, ``users``, ``settings``, ``debug``, ``timeout``
+- ``database`` itself is a dictionary with keys ``host``, ``port``, and so on
+- ``counts`` is a list (Python ``[10, 20, 30]``)
+- ``literal_counts`` is a list too
+- ``users`` is a list of dictionaries
+
+Start Auto Configuration
+------------------------
+
+To set up the search space for optimization from the command line, you need to...
@@ -1,26 +1,32 @@
-.. AutoIntent documentation master file, created by
-   sphinx-quickstart on Fri Nov 15 10:59:47 2024.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
 AutoIntent documentation
 ========================
 
 **AutoIntent** is an open source tool for automatic configuration of a text classification pipeline for intent prediction.
 
-.. `See us on GitHub! <https://github.com/deeppavlov/AutoIntent>`_
-
-.. Check out the :doc:`usage` section to begin with, including :ref:`installation <installation>` section.
-
 .. note::
 
    This project is under active development.
 
-Задача распознавания интентов является одной из основных подзадач создания задачеориентированных диалоговых систем наряду с написанием сценария и заполнением слотов. Проект AutoIntent предлагает пользователям следующее:
+The task of intent detection is one of the main subtasks in creating task-oriented dialogue systems, along with scriptwriting and slot filling. The AutoIntent project offers users the following:
+
+- A convenient library of methods for intent classification that can be used in a sklearn-like "fit-predict" format.
+- An AutoML approach to creating classifiers, where the only thing needed is to upload a set of labeled data.
+
+Example of building an intent classifier in a couple of lines of code:
+
+.. code-block:: python
+
+   from autointent import PipelineOptimizer, InferencePipeline, Dataset
+
+   dataset = Dataset.from_json("/path/to/json")
+   pipeline_optimizer = PipelineOptimizer.default(multilabel=False)
+   pipeline_optimizer.fit(dataset)
+   pipeline_optimizer.dump()
 
-- удобная библиотека методов для классификации интентов, с которыми можно работать в sklearn-like формате "fit-predict".
-- AutoML-подход к созданию классификаторов, при котором достаточно лишь загрузить небольшой набор размеченных данных
+   inference_pipeline = InferencePipeline.load("/path/to/run")
+   inference_pipeline.predict(["Hello, World!"])
 
+We recommend starting your exploration of our library with the :doc:`quickstart` page.
 
 .. toctree::
    :maxdepth: 1
 
@@ -0,0 +1,46 @@
+Optimization
+============
+
+In this section, you will learn how hyperparameter optimization works in our library.
+
+Pipeline
+--------
+
+The entire process of configuring a classifier in our library is divided into sequential steps:
+
+1. Selecting an embedder (EmbeddingNode)
+2. Selecting a classifier (ScoringNode)
+3. Selecting a decision rule (PredictionNode)
+
+Each step has its own set of hyperparameters. To theoretically guarantee finding the ideal configuration through exhaustive search, it is necessary to check every element of the Cartesian product of the hyperparameter sets of these steps (grid search). In practice, achieving this is usually impossible because the number of combinations is too large.
+
+Greedy Strategy
+---------------
+
+This is one of the ways to solve the problem of an overwhelming number of combinations. In our case, the greedy optimization algorithm is as follows:
+
+1. Iterate through the hyperparameters of the embedder and fix the best one.
+2. Iterate through the hyperparameters of the classifier and fix the best one.
+3. Iterate through the hyperparameters of the decision rule and fix the best one.
+
+This algorithm checks fewer combinations, which speeds up the process. To implement such an algorithm, it is necessary to be able to evaluate the quality of not only the final prediction of the entire pipeline but also its intermediate predictions. The main drawback of this approach is that the decisions made are optimal only locally, not globally. The metrics for evaluating intermediate predictions are only a proxy signal for the quality of the final prediction.
+
+This approach has been available in our library since release v0.0.1.
+
+Random Search
+-------------
+
+A simpler strategy is to take a random subset of the full search space (random grid search). A straightforward strategy is to iterate through all combinations in random order until a certain time budget is exhausted.
+
+This approach is less intelligent than the greedy strategy because, at any moment during the random combination search, poor embedders or any other bad parameters might keep appearing, even though they have already been tested. The greedy strategy would have eliminated such embedders at the beginning and not revisited them. On the other hand, random search, by its nature, does not rely on any local decisions.
+
+The implementation of this optimization method is planned for release v0.1.0.
+
+Bayesian Optimization
+---------------------
+
+This is similar to random search over a subset, but during the search, we attempt to model the probabilistic space of hyperparameters. This allows us to avoid repeating hyperparameter values that have previously performed poorly. The search itself aims to balance exploration and exploitation.
+
+This approach is more sophisticated and can lead to better results by intelligently exploring the hyperparameter space.
+
+The implementation of Bayesian optimization is planned for release v0.1.0.
Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@ class RetrievalNodeInfo(NodeInfo):`
`22`	`22`	`)`
`23`	`23`
`24`	`24`	`modules_available: ClassVar[Mapping[str, type[Module]]] = (`
`25`		`- RETRIEVAL_MODULES_MULTICLASS \| RETRIEVAL_MODULES_MULTILABEL`
	`25`	`+ RETRIEVAL_MODULES_MULTICLASS \| RETRIEVAL_MODULES_MULTILABEL # type: ignore[has-type]`
`26`	`26`	`)`
`27`	`27`
`28`	`28`	`node_type = NodeType.retrieval`