Skip to content

Commit 31205c4

Browse files
authored
Docs/guides (#56)
1 parent bf2ec3e commit 31205c4

File tree

14 files changed

+365
-53
lines changed

14 files changed

+365
-53
lines changed

autointent/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from ._embedder import Embedder
22
from .context import Context
3+
from .context.data_handler import Dataset
4+
from .pipeline import InferencePipeline, PipelineOptimizer
35

4-
__all__ = ["Context", "Embedder"]
6+
__all__ = ["Context", "Dataset", "Embedder", "InferencePipeline", "PipelineOptimizer"]

autointent/nodes/nodes_info/_retrieval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class RetrievalNodeInfo(NodeInfo):
2222
)
2323

2424
modules_available: ClassVar[Mapping[str, type[Module]]] = (
25-
RETRIEVAL_MODULES_MULTICLASS | RETRIEVAL_MODULES_MULTILABEL
25+
RETRIEVAL_MODULES_MULTICLASS | RETRIEVAL_MODULES_MULTILABEL # type: ignore[has-type]
2626
)
2727

2828
node_type = NodeType.retrieval

autointent/pipeline/inference/_inference_pipeline.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
"""Inference pipeline for prediction."""
22

3+
from pathlib import Path
34
from typing import Any
45

6+
import yaml
57
from pydantic import BaseModel
68
from typing_extensions import Self
79

@@ -63,6 +65,12 @@ def from_config(cls, nodes_configs: list[InferenceNodeConfig]) -> Self:
6365
nodes = [InferenceNode.from_config(cfg) for cfg in nodes_configs]
6466
return cls(nodes)
6567

68+
@classmethod
69+
def load(cls, path: str | Path) -> Self:
70+
with (Path(path) / "inference_config.yaml").open() as file:
71+
inference_dict_config = yaml.safe_load(file)
72+
return cls.from_dict_config(inference_dict_config["nodes_configs"])
73+
6674
def predict(self, utterances: list[str]) -> list[LabelType]:
6775
"""
6876
Predict the labels for the utterances.

autointent/pipeline/optimization/_cli_endpoint.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ def main(cfg: OptimizationConfig) -> None:
3333

3434
# run optimization
3535
search_space_config = load_config(cfg.task.search_space_path, context.is_multilabel(), logger)
36-
pipeline = PipelineOptimizer.from_dict_config(search_space_config)
37-
pipeline.optimize(context)
36+
pipeline = PipelineOptimizer.from_dict(search_space_config)
37+
pipeline._fit(context) # noqa: SLF001
3838

3939
# save results
4040
context.dump()

autointent/pipeline/optimization/_pipeline_optimizer.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from autointent.context.data_handler import Dataset
1515
from autointent.custom_types import NodeType
1616
from autointent.nodes import NodeOptimizer
17+
from autointent.utils import load_default_search_space
1718

1819

1920
class PipelineOptimizer:
@@ -52,15 +53,24 @@ def set_config(self, config: LoggingConfig | VectorIndexConfig | EmbedderConfig)
5253
raise TypeError(msg)
5354

5455
@classmethod
55-
def from_dict_config(cls, config: dict[str, Any]) -> Self:
56+
def from_dict(cls, search_space: dict[str, Any]) -> Self:
5657
"""
57-
Create pipeline optimizer from dictionary config.
58+
Create pipeline optimizer from dictionary search space.
5859
5960
:param search_space: Dictionary describing the search space
6061
"""
61-
return instantiate(PipelineOptimizerConfig, **config) # type: ignore[no-any-return]
62+
return instantiate(PipelineOptimizerConfig, **search_space) # type: ignore[no-any-return]
6263

63-
def optimize(self, context: Context) -> None:
64+
@classmethod
65+
def default(cls, multilabel: bool) -> Self:
66+
"""
67+
Create pipeline optimizer with default search space for given classification task.
68+
69+
:param multilabel: Whether the task is multi-label or single-label.
70+
"""
71+
return cls.from_dict(load_default_search_space(multilabel))
72+
73+
def _fit(self, context: Context) -> None:
6474
"""
6575
Optimize the pipeline.
6676
@@ -74,7 +84,7 @@ def optimize(self, context: Context) -> None:
7484
self._logger.info("removing vector database from file system...")
7585
context.vector_index_client.delete_db()
7686

77-
def optimize_from_dataset(self, dataset: Dataset, force_multilabel: bool = False) -> Context:
87+
def fit(self, dataset: Dataset, force_multilabel: bool = False) -> Context:
7888
"""
7989
Optimize the pipeline from dataset.
8090
@@ -87,7 +97,7 @@ def optimize_from_dataset(self, dataset: Dataset, force_multilabel: bool = False
8797
context.configure_logging(self.logging_config)
8898
context.configure_vector_index(self.vector_index_config, self.embedder_config)
8999

90-
self.optimize(context)
100+
self._fit(context)
91101
self.inference_config = context.optimization_info.get_inference_nodes_config()
92102
return context
93103

autointent/utils.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
"""AutoIntent utilities."""
2+
3+
import importlib.resources as ires
4+
from typing import Any
5+
6+
import yaml
7+
8+
9+
def load_default_search_space(multilabel: bool) -> dict[str, Any]:
10+
"""
11+
Load the default search space configuration for the given task type.
12+
13+
:param multilabel: Whether to use multilabel or not
14+
:return: Default search space parsed from the bundled YAML file
15+
"""
16+
config_name = "default-multilabel-config.yaml" if multilabel else "default-multiclass-config.yaml"
17+
with ires.files("autointent.datafiles").joinpath(config_name).open() as file:
18+
file_content = file.read()
19+
return yaml.safe_load(file_content) # type: ignore[no-any-return]
Lines changed: 168 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,171 @@
11
Search Space Configuration
22
==========================
33

4-
В этом гайде вы узнаете как настраивать кастомное пространство поиска гипепараметров.
4+
In this guide, you will learn how to configure a custom hyperparameter search space.
5+
6+
Python API
7+
##########
8+
9+
.. note::
10+
11+
Before reading this guide, we recommend familiarizing yourself with the sections :doc:`../concepts` and :doc:`../learn/optimization`.
12+
13+
Optimization Module
14+
-------------------
15+
16+
To set up the optimization module, you need to create the following dictionary:
17+
18+
.. code-block:: python
19+
20+
knn_module = {
21+
"module_type": "knn",
22+
"k": [1, 5, 10, 50],
23+
"embedder_name": [
24+
"avsolatorio/GIST-small-Embedding-v0",
25+
"infgrad/stella-base-en-v2"
26+
]
27+
}
28+
29+
The ``module_type`` field specifies the name of the module. You can find the names, for example, in :py:data:`autointent.modules.SCORING_MODULES_MULTICLASS`.
30+
31+
All fields except ``module_type`` are lists that define the search space for each hyperparameter. If you omit them, the default set of hyperparameters will be used during auto-configuration:
32+
33+
.. code-block:: python
34+
35+
linear_module = {"module_type": "linear"}
36+
37+
Optimization Node
38+
-----------------
39+
40+
To set up the optimization node, you need to create a list of modules and specify the metric for optimization:
41+
42+
.. code-block:: python
43+
44+
scoring_node = {
45+
"node_type": "scoring",
46+
"metric": "scoring_roc_auc",
47+
"search_space": [
48+
knn_module,
49+
linear_module,
50+
]
51+
}
52+
53+
Search Space
54+
------------
55+
56+
The search space for the entire pipeline looks approximately like this:
57+
58+
.. code-block:: python
59+
60+
search_space = [
61+
{
62+
"node_type": "retrieval",
63+
"metric": "retrieval_hit_rate",
64+
"search_space": [
65+
{
66+
"module_type": "vector_db",
67+
"k": [10],
68+
"embedder_name": [
69+
"avsolatorio/GIST-small-Embedding-v0",
70+
"infgrad/stella-base-en-v2"
71+
]
72+
}
73+
]
74+
},
75+
{
76+
"node_type": "scoring",
77+
"metric": "scoring_roc_auc",
78+
"search_space": [
79+
{
80+
"module_type": "knn",
81+
"k": [1, 3, 5, 10],
82+
"weights": ["uniform", "distance", "closest"]
83+
},
84+
{
85+
"module_type": "linear"
86+
},
87+
{
88+
"module_type": "dnnc",
89+
"cross_encoder_name": [
90+
"BAAI/bge-reranker-base",
91+
"cross-encoder/ms-marco-MiniLM-L-6-v2"
92+
],
93+
"k": [1, 3, 5, 10]
94+
}
95+
]
96+
},
97+
{
98+
"node_type": "prediction",
99+
"metric": "prediction_accuracy",
100+
"search_space": [
101+
{
102+
"module_type": "threshold",
103+
"thresh": [0.5]
104+
},
105+
{
106+
"module_type": "argmax"
107+
}
108+
]
109+
}
110+
]
111+
112+
Start Auto Configuration
113+
------------------------
114+
115+
.. code-block:: python
116+
117+
from autointent.pipeline import PipelineOptimizer
118+
119+
pipeline_optimizer = PipelineOptimizer.from_dict(search_space)
120+
pipeline_optimizer.fit(dataset)
121+
122+
CLI
123+
###
124+
125+
Yaml Format
126+
-----------
127+
128+
YAML (YAML Ain't Markup Language) is a human-readable data serialization standard that is often used for configuration files and data exchange between languages with different data structures. It serves similar purposes as JSON but is much easier to read.
129+
130+
Here's an example YAML file:
131+
132+
.. code-block:: yaml
133+
134+
database:
135+
host: localhost
136+
port: 5432
137+
username: admin
138+
# this is a comment
139+
password: secret
140+
141+
counts:
142+
- 10
143+
- 20
144+
- 30
145+
146+
literal_counts: [10, 20, 30]
147+
148+
users:
149+
- name: Alice
150+
age: 30
151+
152+
- name: Bob
153+
age: 25
154+
155+
156+
settings:
157+
debug: true
158+
timeout: 30
159+
160+
Explanation:
161+
162+
- the whole file represents a dictionary with keys ``database``, ``counts``, ``literal_counts``, ``users``, ``settings``; ``debug`` and ``timeout`` are keys nested inside ``settings``
163+
- ``database`` itself is a dictionary with keys ``host``, ``port``, and so on
164+
- ``counts`` is a list (Python ``[10, 20, 30]``)
165+
- ``literal_counts`` is a list too
166+
- ``users`` is a list of dictionaries
167+
168+
Start Auto Configuration
169+
------------------------
170+
171+
To set up the search space for optimization from the command line, you need to...

docs/source/index.rst

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,32 @@
1-
.. AutoIntent documentation master file, created by
2-
sphinx-quickstart on Fri Nov 15 10:59:47 2024.
3-
You can adapt this file completely to your liking, but it should at least
4-
contain the root `toctree` directive.
5-
61
AutoIntent documentation
72
========================
83

94
**AutoIntent** is an open source tool for automatic configuration of a text classification pipeline for intent prediction.
105

11-
.. `See us on GitHub! <https://github.com/deeppavlov/AutoIntent>`_
12-
13-
.. Check out the :doc:`usage` section to begin with, including :ref:`installation <installation>` section.
14-
156
.. note::
167

178
This project is under active development.
189

19-
Задача распознавания интентов является одной из основных подзадач создания задачеориентированных диалоговых систем наряду с написанием сценария и заполнением слотов. Проект AutoIntent предлагает пользователям следующее:
10+
The task of intent detection is one of the main subtasks in creating task-oriented dialogue systems, along with scriptwriting and slot filling. The AutoIntent project offers users the following:
11+
12+
- A convenient library of methods for intent classification that can be used in a sklearn-like "fit-predict" format.
13+
- An AutoML approach to creating classifiers, where the only thing needed is to upload a set of labeled data.
14+
15+
Example of building an intent classifier in a couple of lines of code:
16+
17+
.. code-block:: python
18+
19+
from autointent import PipelineOptimizer, InferencePipeline, Dataset
20+
21+
dataset = Dataset.from_json("/path/to/json")
22+
pipeline_optimizer = PipelineOptimizer.default(multilabel=False)
23+
pipeline_optimizer.fit(dataset)
24+
pipeline_optimizer.dump()
2025
21-
- удобная библиотека методов для классификации интентов, с которыми можно работать в sklearn-like формате "fit-predict".
22-
- AutoML-подход к созданию классификаторов, при котором достаточно лишь загрузить небольшой набор размеченных данных
26+
inference_pipeline = InferencePipeline.load("/path/to/run")
27+
inference_pipeline.predict(["Hello, World!"])
2328
29+
We recommend that you begin exploring our library with the :doc:`quickstart` page.
2430

2531
.. toctree::
2632
:maxdepth: 1

docs/source/learn/greedy_optimization.rst

Lines changed: 0 additions & 4 deletions
This file was deleted.

docs/source/learn/optimization.rst

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
Optimization
2+
============
3+
4+
In this section, you will learn how hyperparameter optimization works in our library.
5+
6+
Pipeline
7+
--------
8+
9+
The entire process of configuring a classifier in our library is divided into sequential steps:
10+
11+
1. Selecting an embedder (EmbeddingNode)
12+
2. Selecting a classifier (ScoringNode)
13+
3. Selecting a decision rule (PredictionNode)
14+
15+
Each step has its own set of hyperparameters. To theoretically guarantee finding the ideal configuration through exhaustive search, it is necessary to check every element of the Cartesian product of the hyperparameter sets of these steps (grid search). In practice, achieving this is usually impossible because the number of combinations is too large.
16+
17+
Greedy Strategy
18+
---------------
19+
20+
This is one of the ways to solve the problem of an overwhelming number of combinations. In our case, the greedy optimization algorithm is as follows:
21+
22+
1. Iterate through the hyperparameters of the embedder and fix the best one.
23+
2. Iterate through the hyperparameters of the classifier and fix the best one.
24+
3. Iterate through the hyperparameters of the decision rule and fix the best one.
25+
26+
This algorithm checks fewer combinations, which speeds up the process. To implement such an algorithm, it is necessary to be able to evaluate the quality of not only the final prediction of the entire pipeline but also its intermediate predictions. The main drawback of this approach is that the decisions made are optimal only locally, not globally. The metrics for evaluating intermediate predictions are only a proxy signal for the quality of the final prediction.
27+
28+
This approach has been available in our library since release v0.0.1.
29+
30+
Random Search
31+
-------------
32+
33+
A simpler strategy is to take a random subset of the full search space (random grid search). A straightforward strategy is to iterate through all combinations in random order until a certain time budget is exhausted.
34+
35+
This approach is less intelligent than the greedy strategy because, at any moment during the random combination search, poor embedders or any other bad parameters might keep appearing, even though they have already been tested. The greedy strategy would have eliminated such embedders at the beginning and not revisited them. On the other hand, random search, by its nature, does not rely on any local decisions.
36+
37+
The implementation of this optimization method is planned for release v0.1.0.
38+
39+
Bayesian Optimization
40+
---------------------
41+
42+
This is similar to random search over a subset, but during the search, we attempt to model the probabilistic space of hyperparameters. This allows us to avoid repeating hyperparameter values that have previously performed poorly. The search itself aims to balance exploration and exploitation.
43+
44+
This approach is more sophisticated and can lead to better results by intelligently exploring the hyperparameter space.
45+
46+
The implementation of Bayesian optimization is planned for release v0.1.0.

0 commit comments

Comments
 (0)