Commit 0956f13

Refactor/remove CLI (#97)
* allow setting random seed for a pipeline from python api
* remove CLI
* fix typing & default value of OptimizationConfig
* upd tests (remove outdated `force_multilabel` argument)
* remove CLI tutorials
* refactor configs to pydantic
* fix typing
* finally configure post initialization
* upd docstrings
* remove CLI from docs
* remove hydra from poetry
* remove omegaconf
* remove file that was added accidentally
* remove optimization config
* refactor `LoggingConfig`
* remove `test_path` and `force_multilabel` params everywhere
* refactor logs directory structure
* remove unnecessary `setup_environment()` calls
* fix codestyle
* fix docs building
* fix docs building
* fix codestyle
1 parent 461c5b0 commit 0956f13

35 files changed: +193 -699 lines changed
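
For orientation, the removed CLI workflow maps onto the Python API touched in this commit. A minimal sketch is below; the import paths and the dataset placeholder are assumptions, while `seed`, `set_config`, and the new `fit` signature come from the diffs that follow:

```python
from autointent import Dataset, Pipeline  # assumed public import paths
from autointent.configs import LoggingConfig

# The random seed is now passed through the Python API (added in this commit)
# instead of being set via the removed CLI / Hydra overrides.
pipeline = Pipeline.default_optimizer(multilabel=False, seed=42)

# Logging is configured with the pydantic LoggingConfig.
pipeline.set_config(LoggingConfig(run_name="demo-run", dump_modules=True))

dataset: Dataset = ...  # load your dataset here (loading API is not part of this diff)
context = pipeline.fit(dataset)  # the `force_multilabel` argument was removed
```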

CONTRIBUTING.md

Lines changed: 0 additions & 35 deletions
@@ -50,41 +50,6 @@ make lint
 
 ![](assets/dependency-graph.png)
 
-## Logger setup
-To see debug lines, you have several options:
-
-1. Enable all debug output via a command-line option:
-```bash
-autointent hydra.verbose=true
-```
-2. Enable debug output only for specific modules, an example for autointent.pipeline.optimization.cli_endpoint and hydra itself:
-```bash
-autointent hydra.verbose=[hydra,autointent/pipeline/optimization/cli_endpoint] hydra.job_logging.root.level=DEBUG
-```
-
-The logger configuration itself is defined in autointent.configs.optimization_cli.logger_config. You can change any logger parameter from the command line. Here is an example of how to change the logger level to ERROR:
-```bash
-autointent hydra.job_logging.root.level=ERROR
-```
-
-You can also change logger parameters via yaml files:
-1. Create a folder with config files: test_config
-2. test_config/config.yaml:
-```yaml
-defaults:
-  - optimization_config
-  - _self_
-  - override hydra/job_logging: custom
-
-# set your config params for optimization here
-embedder_batch_size: 32
-```
-3. Put the logger configuration in test_config/hydra/job_logging/custom.yaml (see [here](https://docs.python.org/3/howto/logging.html) for the parameters)
-4. Run with the config file config.yaml:
-```bash
-autointent --config-path FULL_PATH/test_config --config-name config
-```
-
 ## Building the documentation
 
 Build the html version in the `docs/build` folder:
autointent/_pipeline/_cli_endpoint.py

Lines changed: 0 additions & 68 deletions
This file was deleted.

autointent/_pipeline/_pipeline.py

Lines changed: 11 additions & 9 deletions
@@ -25,14 +25,17 @@ class Pipeline:
     def __init__(
         self,
         nodes: list[NodeOptimizer] | list[InferenceNode],
+        seed: int = 42,
     ) -> None:
         """
         Initialize the pipeline optimizer.
 
         :param nodes: list of nodes
+        :param seed: random seed
         """
         self._logger = logging.getLogger(__name__)
         self.nodes = {node.node_type: node for node in nodes}
+        self.seed = seed
 
         if isinstance(nodes[0], NodeOptimizer):
             self.logging_config = LoggingConfig(dump_dir=None)
@@ -62,7 +65,7 @@ def set_config(self, config: LoggingConfig | VectorIndexConfig | EmbedderConfig
             raise TypeError(msg)
 
     @classmethod
-    def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str) -> "Pipeline":
+    def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str, seed: int = 42) -> "Pipeline":
         """
         Create pipeline optimizer from dictionary search space.
 
@@ -71,16 +74,16 @@ def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str) -> "
         if isinstance(search_space, Path | str):
             search_space = load_search_space(search_space)
         nodes = [NodeOptimizer(**node) for node in search_space]
-        return cls(nodes)
+        return cls(nodes=nodes, seed=seed)
 
     @classmethod
-    def default_optimizer(cls, multilabel: bool) -> "Pipeline":
+    def default_optimizer(cls, multilabel: bool, seed: int = 42) -> "Pipeline":
         """
         Create pipeline optimizer with default search space for given classification task.
 
         :param multilabel: Whether the task multi-label, or single-label.
         """
-        return cls.from_search_space(load_default_search_space(multilabel))
+        return cls.from_search_space(search_space=load_default_search_space(multilabel), seed=seed)
 
     def _fit(self, context: Context) -> None:
         """
@@ -91,8 +94,8 @@ def _fit(self, context: Context) -> None:
         self.context = context
         self._logger.info("starting pipeline optimization...")
         self.context.callback_handler.start_run(
-            run_name=self.context.logging_config.get_run_name(),
-            dirpath=self.context.logging_config.get_dirpath(),
+            run_name=self.context.logging_config.run_name,
+            dirpath=self.context.logging_config.dirpath,
         )
         for node_type in NodeType:
             node_optimizer = self.nodes.get(node_type, None)
@@ -111,20 +114,19 @@ def _is_inference(self) -> bool:
         """
         return isinstance(self.nodes[NodeType.scoring], InferenceNode)
 
-    def fit(self, dataset: Dataset, force_multilabel: bool = False) -> Context:
+    def fit(self, dataset: Dataset) -> Context:
         """
         Optimize the pipeline from dataset.
 
         :param dataset: Dataset for optimization
-        :param force_multilabel: Whether to force multilabel or not
         :return: Context
         """
         if self._is_inference():
             msg = "Pipeline in inference mode cannot be fitted"
             raise RuntimeError(msg)
 
         context = Context()
-        context.set_dataset(dataset, force_multilabel)
+        context.set_dataset(dataset)
         context.configure_logging(self.logging_config)
         context.configure_vector_index(self.vector_index_config, self.embedder_config)
         context.configure_cross_encoder(self.cross_encoder_config)
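
A short usage sketch of the updated signatures; the search-space file name is a placeholder and the import path is assumed:

```python
from autointent import Pipeline  # assumed import path

# The seed is forwarded from both constructors into the optimizer.
pipeline = Pipeline.from_search_space("my_search_space.yaml", seed=0)
# or, with the built-in default search space:
pipeline = Pipeline.default_optimizer(multilabel=True, seed=0)
```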

autointent/configs/__init__.py

Lines changed: 1 addition & 3 deletions
@@ -1,12 +1,11 @@
 """Dataclasses for the configuration of the :class:`autointent.Embedder` and other objects."""
 
 from ._inference_node import InferenceNodeConfig
-from ._optimization_cli import (
+from ._optimization import (
     CrossEncoderConfig,
     DataConfig,
     EmbedderConfig,
     LoggingConfig,
-    OptimizationConfig,
     TaskConfig,
     VectorIndexConfig,
 )
@@ -18,7 +17,6 @@
     "InferenceNodeConfig",
     "InferenceNodeConfig",
     "LoggingConfig",
-    "OptimizationConfig",
     "TaskConfig",
     "VectorIndexConfig",
 ]
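
After this change, the public config classes are imported directly from `autointent.configs`; `OptimizationConfig` is gone:

```python
from autointent.configs import (
    CrossEncoderConfig,
    DataConfig,
    EmbedderConfig,
    LoggingConfig,
    TaskConfig,
    VectorIndexConfig,
)
```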
autointent/configs/_optimization.py

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+"""Configuration for the optimization process."""
+
+from pathlib import Path
+
+from pydantic import BaseModel, Field
+
+from ._name import get_run_name
+
+
+class DataConfig(BaseModel):
+    """Configuration for the data used in the optimization process."""
+
+    train_path: str | Path
+    """Path to the training data. Can be a local path or an HF repo."""
+
+
+class TaskConfig(BaseModel):
+    """Configuration for the task to optimize."""
+
+    search_space_path: Path | None = None
+    """Path to the search space configuration file. If None, the default search space will be used."""
+
+
+class LoggingConfig(BaseModel):
+    """Configuration for logging."""
+
+    project_dir: Path = Field(default_factory=lambda: Path.cwd() / "runs")
+    """Path to the directory with different runs."""
+    run_name: str = Field(default_factory=get_run_name)
+    """Name of the run. If None, a random name will be generated."""
+    dump_modules: bool = False
+    """Whether to dump the modules or not."""
+    clear_ram: bool = False
+    """Whether to clear the RAM after dumping the modules."""
+    report_to: list[str] | None = None
+    """List of callbacks to report to. If None, no callbacks will be used."""
+
+    @property
+    def dirpath(self) -> Path:
+        """Path to the directory where the logs will be saved."""
+        if not hasattr(self, "_dirpath"):
+            self._dirpath = self.project_dir / self.run_name
+        return self._dirpath
+
+    @property
+    def dump_dir(self) -> Path:
+        """Path to the directory where the modules will be dumped."""
+        if not hasattr(self, "_dump_dir"):
+            self._dump_dir = self.dirpath / "modules_dumps"
+        return self._dump_dir
+
+
+class VectorIndexConfig(BaseModel):
+    """Configuration for the vector index."""
+
+    save_db: bool = False
+    """Whether to save the vector index database or not."""
+
+
+class TransformerConfig(BaseModel):
+    """
+    Base class for the transformer configuration.
+
+    A transformer is used under the hood in :py:class:`autointent.Embedder` and :py:class:`autointent.Ranker`.
+    """
+
+    batch_size: int = 32
+    """Batch size for the transformer."""
+    max_length: int | None = None
+    """Max length for the transformer. If None, the max length will be taken from the model config."""
+    device: str = "cpu"
+    """Device to run the transformer on. Can be 'cpu', 'cuda', 'cuda:0', 'mps', etc."""
+
+
+class EmbedderConfig(TransformerConfig):
+    """
+    Configuration for the embedder.
+
+    The embedder is used to embed the data before training the model. These parameters
+    will be applied to the embedder used in the optimization process in the vector db.
+    Only one model can be used globally.
+    """
+
+    use_cache: bool = True
+    """Whether to cache embeddings for reuse, improving performance in repeated operations."""
+
+
+class CrossEncoderConfig(TransformerConfig):
+    """
+    Configuration for the cross-encoder.
+
+    The cross-encoder is used under the hood in :py:class:`autointent.Ranker`. These parameters
+    will be applied to the cross-encoder used in the optimization process.
+    Only one model can be used globally.
+    """
+
+    train_head: bool = False
+    """Whether to train the ranking head of a cross encoder."""
