Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 0 additions & 35 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,41 +50,6 @@ make lint

![](assets/dependency-graph.png)

## Настройка логгера
Чтобы видеть debug строчки у вас есть несколько опций:

1. Включить весь debug output через опцию командной строки:
```bash
autointent hydra.verbose=true
```
2. Включить debug output только для определенных модулей, пример для autointent.pipeline.optimization.cli_endpoint и самой hydra:
```bash
autointent hydra.verbose=[hydra,autointent/pipeline/optimization/cli_endpoint] hydra.job_logging.root.level=DEBUG
```

Само конфигурирование логгера сделано в autointent.configs.optimization_cli.logger_config. Вы можете изменить любой параметр логгера через командную строку. Вот пример, как поменять уровень логгера на ERROR:
```bash
autointent hydra.job_logging.root.level=ERROR
```

Еще можно изменить параметры логгера через yaml файлы:
1. Создадим папку с конфигурационными файлами: test_config
2. test_config/config.yaml:
```yaml
defaults:
- optimization_config
- _self_
- override hydra/job_logging: custom

# set your config params for optimization here
embedder_batch_size: 32
```
3. Поместите конфигурацию логгера в test_config/hydra/job_logging/custom.yaml (параметры см. [здесь](https://docs.python.org/3/howto/logging.html))
4. Запускаем с конфиг файлом config.yaml:
```bash
autointent --config-path FULL_PATH/test_config --config-name config
```

## Построение документации

Построить html версию в папке `docs/build`:
Expand Down
68 changes: 0 additions & 68 deletions autointent/_pipeline/_cli_endpoint.py

This file was deleted.

20 changes: 11 additions & 9 deletions autointent/_pipeline/_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,17 @@ class Pipeline:
def __init__(
self,
nodes: list[NodeOptimizer] | list[InferenceNode],
seed: int = 42,
) -> None:
"""
Initialize the pipeline optimizer.

:param nodes: list of nodes
:param seed: random seed
"""
self._logger = logging.getLogger(__name__)
self.nodes = {node.node_type: node for node in nodes}
self.seed = seed

if isinstance(nodes[0], NodeOptimizer):
self.logging_config = LoggingConfig(dump_dir=None)
Expand Down Expand Up @@ -62,7 +65,7 @@ def set_config(self, config: LoggingConfig | VectorIndexConfig | EmbedderConfig
raise TypeError(msg)

@classmethod
def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str) -> "Pipeline":
def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str, seed: int = 42) -> "Pipeline":
"""
Create pipeline optimizer from dictionary search space.

Expand All @@ -71,16 +74,16 @@ def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str) -> "
if isinstance(search_space, Path | str):
search_space = load_search_space(search_space)
nodes = [NodeOptimizer(**node) for node in search_space]
return cls(nodes)
return cls(nodes=nodes, seed=seed)

@classmethod
def default_optimizer(cls, multilabel: bool) -> "Pipeline":
def default_optimizer(cls, multilabel: bool, seed: int = 42) -> "Pipeline":
"""
Create pipeline optimizer with default search space for given classification task.

:param multilabel: Whether the task multi-label, or single-label.
"""
return cls.from_search_space(load_default_search_space(multilabel))
return cls.from_search_space(search_space=load_default_search_space(multilabel), seed=seed)

def _fit(self, context: Context) -> None:
"""
Expand All @@ -91,8 +94,8 @@ def _fit(self, context: Context) -> None:
self.context = context
self._logger.info("starting pipeline optimization...")
self.context.callback_handler.start_run(
run_name=self.context.logging_config.get_run_name(),
dirpath=self.context.logging_config.get_dirpath(),
run_name=self.context.logging_config.run_name,
dirpath=self.context.logging_config.dirpath,
)
for node_type in NodeType:
node_optimizer = self.nodes.get(node_type, None)
Expand All @@ -111,20 +114,19 @@ def _is_inference(self) -> bool:
"""
return isinstance(self.nodes[NodeType.scoring], InferenceNode)

def fit(self, dataset: Dataset, force_multilabel: bool = False) -> Context:
def fit(self, dataset: Dataset) -> Context:
"""
Optimize the pipeline from dataset.

:param dataset: Dataset for optimization
:param force_multilabel: Whether to force multilabel or not
:return: Context
"""
if self._is_inference():
msg = "Pipeline in inference mode cannot be fitted"
raise RuntimeError(msg)

context = Context()
context.set_dataset(dataset, force_multilabel)
context.set_dataset(dataset)
context.configure_logging(self.logging_config)
context.configure_vector_index(self.vector_index_config, self.embedder_config)
context.configure_cross_encoder(self.cross_encoder_config)
Expand Down
4 changes: 1 addition & 3 deletions autointent/configs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
"""Dataclasses for the configuration of the :class:`autointent.Embedder` and other objects."""

from ._inference_node import InferenceNodeConfig
from ._optimization_cli import (
from ._optimization import (
CrossEncoderConfig,
DataConfig,
EmbedderConfig,
LoggingConfig,
OptimizationConfig,
TaskConfig,
VectorIndexConfig,
)
Expand All @@ -18,7 +17,6 @@
"InferenceNodeConfig",
"InferenceNodeConfig",
"LoggingConfig",
"OptimizationConfig",
"TaskConfig",
"VectorIndexConfig",
]
98 changes: 98 additions & 0 deletions autointent/configs/_optimization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""Configuration for the optimization process."""

from pathlib import Path

from pydantic import BaseModel, Field

from ._name import get_run_name


class DataConfig(BaseModel):
    """
    Configuration for the data used in the optimization process.

    Holds the location of the training dataset consumed by the pipeline optimizer.
    """

    # NOTE(review): "HF repo" presumably means a Hugging Face dataset repo id — confirm against the loader.
    train_path: str | Path
    """Path to the training data. Can be local path or HF repo."""


class TaskConfig(BaseModel):
    """
    Configuration for the task to optimize.

    Selects which search space the optimizer explores; leaving the path unset
    means the built-in default search space is used.
    """

    search_space_path: Path | None = None
    """Path to the search space configuration file. If None, the default search space will be used"""


class LoggingConfig(BaseModel):
    """
    Configuration for the logging.

    Derives the run output directory (:attr:`dirpath`) and the module-dump
    directory (:attr:`dump_dir`) from ``project_dir`` and ``run_name``.
    """

    project_dir: Path = Field(default_factory=lambda: Path.cwd() / "runs")
    """Path to the directory with different runs."""
    run_name: str = Field(default_factory=get_run_name)
    """Name of the run. A random name is generated by default."""
    dump_modules: bool = False
    """Whether to dump the modules or not"""
    clear_ram: bool = False
    """Whether to clear the RAM after dumping the modules"""
    report_to: list[str] | None = None
    """List of callbacks to report to. If None, no callbacks will be used"""

    @property
    def dirpath(self) -> Path:
        """Path to the directory where the logs will be saved."""
        # Computed directly from the model fields. The previous lazy cache
        # assigned an undeclared private attribute (self._dirpath), which
        # pydantic's __setattr__ rejects (v1 raises "object has no field";
        # v2 requires PrivateAttr). Caching buys nothing here anyway: the
        # value is a pure function of two fields.
        return self.project_dir / self.run_name

    @property
    def dump_dir(self) -> Path:
        """Path to the directory where the modules will be dumped."""
        return self.dirpath / "modules_dumps"


class VectorIndexConfig(BaseModel):
    """
    Configuration for the vector index.

    Controls persistence of the vector index built during optimization.
    """

    save_db: bool = False
    """Whether to save the vector index database or not"""


class TransformerConfig(BaseModel):
    """
    Base class for configuration for the transformer.

    Transformer is used under the hood in :py:class:`autointent.Embedder` and :py:class:`autointent.Ranker`.
    """

    batch_size: int = 32
    """Batch size used when encoding inputs with the transformer"""
    max_length: int | None = None
    """Max sequence length for the transformer. If None, the max length will be taken from model config"""
    device: str = "cpu"
    """Device to run the transformer on. Can be 'cpu', 'cuda', 'cuda:0', 'mps', etc."""


class EmbedderConfig(TransformerConfig):
    """
    Configuration for the embedder.

    The embedder is used to embed the data before training the model. These parameters
    will be applied to the embedder used in the optimization process in the vector database.
    Only one model can be used globally.
    """

    use_cache: bool = True
    """Whether to cache embeddings for reuse, improving performance in repeated operations."""


class CrossEncoderConfig(TransformerConfig):
    """
    Configuration for the cross-encoder.

    The cross-encoder (see :py:class:`autointent.Ranker`) scores query/candidate
    pairs during the optimization process. These parameters are applied to the
    cross-encoder used globally; only one model can be used.
    """
    # NOTE: the original docstring was copy-pasted from EmbedderConfig and
    # described the embedder instead of the cross-encoder.

    train_head: bool = False
    """Whether to train the ranking head of a cross encoder."""
Loading
Loading