From 2d41329090436702523c5fc0170fbfe051059709 Mon Sep 17 00:00:00 2001 From: voorhs Date: Thu, 16 Jan 2025 15:52:59 +0300 Subject: [PATCH 01/22] allow setting random seed for a pipeline from python api --- autointent/_pipeline/_pipeline.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/autointent/_pipeline/_pipeline.py b/autointent/_pipeline/_pipeline.py index 632e1ff10..6f709f5a9 100644 --- a/autointent/_pipeline/_pipeline.py +++ b/autointent/_pipeline/_pipeline.py @@ -25,14 +25,17 @@ class Pipeline: def __init__( self, nodes: list[NodeOptimizer] | list[InferenceNode], + seed: int = 42, ) -> None: """ Initialize the pipeline optimizer. :param nodes: list of nodes + :param seed: random seed """ self._logger = logging.getLogger(__name__) self.nodes = {node.node_type: node for node in nodes} + self.seed = seed if isinstance(nodes[0], NodeOptimizer): self.logging_config = LoggingConfig(dump_dir=None) @@ -62,7 +65,7 @@ def set_config(self, config: LoggingConfig | VectorIndexConfig | EmbedderConfig raise TypeError(msg) @classmethod - def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str) -> "Pipeline": + def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str, seed: int = 42) -> "Pipeline": """ Create pipeline optimizer from dictionary search space. @@ -71,16 +74,16 @@ def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str) -> " if isinstance(search_space, Path | str): search_space = load_search_space(search_space) nodes = [NodeOptimizer(**node) for node in search_space] - return cls(nodes) + return cls(nodes=nodes, seed=seed) @classmethod - def default_optimizer(cls, multilabel: bool) -> "Pipeline": + def default_optimizer(cls, multilabel: bool, seed: int = 42) -> "Pipeline": """ Create pipeline optimizer with default search space for given classification task. :param multilabel: Whether the task multi-label, or single-label. """ - return cls.from_search_space(load_default_search_space(multilabel)) + return cls.from_search_space(search_space=load_default_search_space(multilabel), seed=seed) def _fit(self, context: Context) -> None: """ @@ -111,7 +114,7 @@ def _is_inference(self) -> bool: """ return isinstance(self.nodes[NodeType.scoring], InferenceNode) - def fit(self, dataset: Dataset, force_multilabel: bool = False) -> Context: + def fit(self, dataset: Dataset) -> Context: """ Optimize the pipeline from dataset. @@ -124,7 +127,7 @@ def fit(self, dataset: Dataset, force_multilabel: bool = False) -> Context: raise RuntimeError(msg) context = Context() - context.set_dataset(dataset, force_multilabel) + context.set_dataset(dataset) context.configure_logging(self.logging_config) context.configure_vector_index(self.vector_index_config, self.embedder_config) context.configure_cross_encoder(self.cross_encoder_config) From fcc4be36a1711c01d68518ed1ce2da804e63893c Mon Sep 17 00:00:00 2001 From: voorhs Date: Thu, 16 Jan 2025 15:57:58 +0300 Subject: [PATCH 02/22] remove CLI --- autointent/_pipeline/_cli_endpoint.py | 68 ------------------- autointent/configs/__init__.py | 2 +- ...{_optimization_cli.py => _optimization.py} | 63 +---------------- tests/nodes/conftest.py | 2 +- tests/pipeline/test_optimization.py | 27 -------- 5 files changed, 4 insertions(+), 158 deletions(-) delete mode 100644 autointent/_pipeline/_cli_endpoint.py rename autointent/configs/{_optimization_cli.py => _optimization.py} (74%) diff --git a/autointent/_pipeline/_cli_endpoint.py b/autointent/_pipeline/_cli_endpoint.py deleted file mode 100644 index ddee7dc7a..000000000 --- a/autointent/_pipeline/_cli_endpoint.py +++ /dev/null @@ -1,68 +0,0 @@ -"""Cli endpoint.""" - -import importlib.resources as ires -import logging -from logging import Logger -from pathlib import Path -from typing import Any - -import hydra -import yaml - -from autointent import Context -from autointent.configs._optimization_cli import OptimizationConfig - -from ._pipeline import Pipeline - - -@hydra.main(config_name="optimization_config", config_path=".", version_base=None) -def optimize(cfg: OptimizationConfig) -> None: - """ - Run the optimization pipeline. - - :param cfg: Configuration for the optimization pipeline - :return: - """ - logger = logging.getLogger(__name__) - - logger.debug("Run Name: %s", cfg.logs.run_name) - logger.debug("logs and assets: %s", cfg.logs.dirpath) - - # create shared objects for a whole pipeline - context = Context(cfg.seed) - cfg.logs.clear_ram = True - context.configure_logging(cfg.logs) - context.configure_vector_index(cfg.vector_index, cfg.embedder) - context.configure_data(cfg.data) - context.configure_cross_encoder(cfg.cross_encoder) - - # run optimization - search_space_config = load_config(cfg.task.search_space_path, context.is_multilabel(), logger) - pipeline = Pipeline.from_search_space(search_space_config) - pipeline._fit(context) # noqa: SLF001 - - # save results - context.dump() - - -def load_config(config_path: str | Path | None, multilabel: bool, logger: Logger | None = None) -> list[dict[str, Any]]: - """ - Load configuration from the given path or load default configuration. - - :param config_path: Path to the configuration file - :param multilabel: Whether to use multilabel or not - :param logger: Logger - :return: - """ - if config_path is not None: - if logger is not None: - logger.debug("loading optimization search space config from %s...)", config_path) - with Path(config_path).open() as file: - file_content = file.read() - else: - if logger is not None: - logger.debug("loading default optimization search space config...") - config_name = "default-multilabel-config.yaml" if multilabel else "default-multiclass-config.yaml" - with ires.files("autointent._datafiles").joinpath(config_name).open() as file: - file_content = file.read() - return yaml.safe_load(file_content) # type: ignore[no-any-return] diff --git a/autointent/configs/__init__.py b/autointent/configs/__init__.py index 92d8fc47e..6eaba133b 100644 --- a/autointent/configs/__init__.py +++ b/autointent/configs/__init__.py @@ -1,7 +1,7 @@ """Dataclasses for the configuration of the :class:`autointent.Embedder` and other objects.""" from ._inference_node import InferenceNodeConfig -from ._optimization_cli import ( +from ._optimization import ( CrossEncoderConfig, DataConfig, EmbedderConfig, diff --git a/autointent/configs/_optimization_cli.py b/autointent/configs/_optimization.py similarity index 74% rename from autointent/configs/_optimization_cli.py rename to autointent/configs/_optimization.py index 1d8548e01..4313a6f00 100644 --- a/autointent/configs/_optimization_cli.py +++ b/autointent/configs/_optimization.py @@ -2,10 +2,6 @@ from dataclasses import dataclass, field from pathlib import Path -from typing import Any - -from hydra.core.config_store import ConfigStore -from omegaconf import MISSING from ._name import get_run_name @@ -14,8 +10,8 @@ class DataConfig: """Configuration for the data used in the optimization process.""" - train_path: str | Path = MISSING - """Path to the training data""" + train_path: str | Path + """Path to the training data. Can be local path or HF repo.""" test_path: Path | None = None """Path to the testing data. If None, no testing data will be used""" force_multilabel: bool = False @@ -155,58 +151,3 @@ class OptimizationConfig: """Configuration for the embedder""" cross_encoder: CrossEncoderConfig = field(default_factory=CrossEncoderConfig) """Configuration for the cross encoder""" - - defaults: list[Any] = field( - default_factory=lambda: [ - "_self_", - {"override hydra/job_logging": "autointent_standard_job_logger"}, - {"override hydra/help": "autointent_help"}, - ], - ) - - -logger_config = { - "version": 1, - "formatters": {"simple": {"format": "%(asctime)s - %(name)s [%(levelname)s] %(message)s"}}, - "handlers": { - "console": { - "class": "logging.StreamHandler", - "formatter": "simple", - "stream": "ext://sys.stdout", - }, - "file": { - "class": "logging.FileHandler", - "formatter": "simple", - "filename": "${hydra.runtime.output_dir}/${hydra.job.name}.log", - }, - }, - "root": {"level": "WARN", "handlers": ["console", "file"]}, - "disable_existing_loggers": "false", -} - -help_config = { - "app_name": "AutoIntent", - "header": "== ${hydra.help.app_name} ==", - "footer": """ -Powered by Hydra (https://hydra.cc) -Use --hydra-help to view Hydra specific help""", - "template": """ - ${hydra.help.header} - - This is ${hydra.help.app_name}! - == Config == - This is the config generated for this run. - You can override everything, for example: - python my_app.py db.user=foo db.pass=bar - ------- - $CONFIG - ------- - - ${hydra.help.footer}""", -} - - -cs = ConfigStore.instance() -cs.store(name="optimization_config", node=OptimizationConfig) -cs.store(name="autointent_standard_job_logger", group="hydra/job_logging", node=logger_config) -cs.store(name="autointent_help", group="hydra/help", node=help_config) diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index 1ec8997b1..9f046e7b9 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -1,7 +1,7 @@ import pytest from autointent import Context -from autointent.configs._optimization_cli import ( +from autointent.configs import ( CrossEncoderConfig, DataConfig, EmbedderConfig, diff --git a/tests/pipeline/test_optimization.py b/tests/pipeline/test_optimization.py index 91f96642d..3eb751b6a 100644 --- a/tests/pipeline/test_optimization.py +++ b/tests/pipeline/test_optimization.py @@ -6,13 +6,9 @@ import pytest from autointent import Pipeline -from autointent._pipeline._cli_endpoint import optimize from autointent.configs import ( - DataConfig, EmbedderConfig, LoggingConfig, - OptimizationConfig, - TaskConfig, VectorIndexConfig, ) from autointent.utils import load_search_space @@ -84,26 +80,3 @@ def test_dump_modules(dataset, task_type): context.dump() assert os.listdir(dump_dir) - - -@pytest.mark.parametrize( - "task_type", - ["multiclass", "multilabel", "description"], -) -def test_optimization_pipeline_cli(task_type): - dump_dir, logs_dir = setup_environment() - config = OptimizationConfig( - data=DataConfig( - train_path=ires.files("tests.assets.data").joinpath("clinc_subset.json"), - force_multilabel=(task_type == "multilabel"), - ), - task=TaskConfig( - search_space_path=get_search_space_path(task_type), - ), - vector_index=VectorIndexConfig(), - logs=LoggingConfig( - dirpath=Path(logs_dir), - ), - embedder=EmbedderConfig(device="cpu"), - ) - optimize(config) From 4f87b73527162f1a9610b671b9267d7705128ca9 Mon Sep 17 00:00:00 2001 From: voorhs Date: Thu, 16 Jan 2025 16:03:28 +0300 Subject: [PATCH 03/22] fix typing & default calue of OptimizationConfig --- autointent/configs/_optimization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autointent/configs/_optimization.py b/autointent/configs/_optimization.py index 4313a6f00..1e4d88b2e 100644 --- a/autointent/configs/_optimization.py +++ b/autointent/configs/_optimization.py @@ -137,9 +137,7 @@ class CrossEncoderConfig(TransformerConfig): class OptimizationConfig: """Configuration for the optimization process.""" - seed: int = 0 - """Seed for the random number generator""" - data: DataConfig = field(default_factory=DataConfig) + data: DataConfig """Configuration for the data used in the optimization process""" task: TaskConfig = field(default_factory=TaskConfig) """Configuration for the task to optimize""" @@ -151,3 +149,5 @@ class OptimizationConfig: """Configuration for the embedder""" cross_encoder: CrossEncoderConfig = field(default_factory=CrossEncoderConfig) """Configuration for the cross encoder""" + seed: int = 0 + """Seed for the random number generator""" From 25271fc49c01b406acbd4203a79f40be5b460bea Mon Sep 17 00:00:00 2001 From: voorhs Date: Thu, 16 Jan 2025 16:13:36 +0300 Subject: [PATCH 04/22] upd tests (remove outdated `force_multilabel` argument) --- tests/pipeline/test_inference.py | 10 ++++++++-- tests/pipeline/test_optimization.py | 15 ++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/tests/pipeline/test_inference.py b/tests/pipeline/test_inference.py index 3a6730a0c..8fe9048d1 100644 --- a/tests/pipeline/test_inference.py +++ b/tests/pipeline/test_inference.py @@ -36,7 +36,10 @@ def test_inference_config(dataset, task_type): pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32, device="cpu")) pipeline_optimizer.set_config(CrossEncoderConfig()) - context = pipeline_optimizer.fit(dataset, force_multilabel=(task_type == "multilabel")) + if task_type == "multilabel": + dataset = dataset.to_multilabel() + + context = pipeline_optimizer.fit(dataset) inference_config = context.optimization_info.get_inference_nodes_config() inference_pipeline = Pipeline.from_config(inference_config) @@ -67,7 +70,10 @@ def test_inference_context(dataset, task_type): pipeline.set_config(VectorIndexConfig(save_db=True)) pipeline.set_config(EmbedderConfig(batch_size=16, max_length=32, device="cpu")) - context = pipeline.fit(dataset, force_multilabel=(task_type == "multilabel")) + if task_type == "multilabel": + dataset = dataset.to_multilabel() + + context = pipeline.fit(dataset) utterances = ["123", "hello world"] prediction = pipeline.predict(utterances) diff --git a/tests/pipeline/test_optimization.py b/tests/pipeline/test_optimization.py index 3eb751b6a..e11c92fc4 100644 --- a/tests/pipeline/test_optimization.py +++ b/tests/pipeline/test_optimization.py @@ -40,7 +40,10 @@ def test_no_context_optimization(dataset, task_type): pipeline_optimizer.set_config(VectorIndexConfig()) pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32, device="cpu")) - context = pipeline_optimizer.fit(dataset, force_multilabel=(task_type == "multilabel")) + if task_type == "multilabel": + dataset = dataset.to_multilabel() + + context = pipeline_optimizer.fit(dataset) context.dump() @@ -58,7 +61,10 @@ def test_save_db(dataset, task_type): pipeline_optimizer.set_config(VectorIndexConfig(save_db=True)) pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32, device="cpu")) - context = pipeline_optimizer.fit(dataset, force_multilabel=(task_type == "multilabel")) + if task_type == "multilabel": + dataset = dataset.to_multilabel() + + context = pipeline_optimizer.fit(dataset) context.dump() @@ -76,7 +82,10 @@ def test_dump_modules(dataset, task_type): pipeline_optimizer.set_config(VectorIndexConfig()) pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32, device="cpu")) - context = pipeline_optimizer.fit(dataset, force_multilabel=(task_type == "multilabel")) + if task_type == "multilabel": + dataset = dataset.to_multilabel() + + context = pipeline_optimizer.fit(dataset) context.dump() assert os.listdir(dump_dir) From 213ac5bcec755d6e73223a5c04078f85eabb9316 Mon Sep 17 00:00:00 2001 From: voorhs Date: Thu, 16 Jan 2025 16:16:48 +0300 Subject: [PATCH 05/22] remove CLI tutorials --- docs/source/conf.py | 1 - user_guides/cli/01_search_space.py | 80 -------------------- user_guides/cli/02_basic_usage.py | 117 ----------------------------- 3 files changed, 198 deletions(-) delete mode 100644 user_guides/cli/01_search_space.py delete mode 100644 user_guides/cli/02_basic_usage.py diff --git a/docs/source/conf.py b/docs/source/conf.py index ca0db02c6..825ce2431 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -179,7 +179,6 @@ def setup(app: Sphinx) -> None: "user_guides.advanced", "Advanced Usage", ), - ("user_guides.cli", "CLI Usage"), ], source="user_guides", destination=user_guids_dir, diff --git a/user_guides/cli/01_search_space.py b/user_guides/cli/01_search_space.py deleted file mode 100644 index 9d1d869cb..000000000 --- a/user_guides/cli/01_search_space.py +++ /dev/null @@ -1,80 +0,0 @@ -# %% [markdown] -""" -# Search Space as YAML - -If you want to use default search space, you can skip this tutorial. Here we discuss how to save your custom search space as YAML file in order to use it in CLI for pipeline auto-configuration. - -## YAML - -YAML (YAML Ain't Markup Language) is a human-readable data serialization standard that is often used for configuration files and data exchange between languages with different data structures. It serves similar purposes as JSON but is much easier to read. - -Here's an example YAML file: - - -```yaml -database: - host: localhost - port: 5432 - username: admin - # this is a comment - password: secret - -counts: -- 10 -- 20 -- 30 - -literal_counts: [10, 20, 30] - -users: -- name: Alice - age: 30 - email: alice@example.com -- name: Bob - age: 25 - email: bob@example.com - -settings: -debug: true -timeout: 30 -``` - -Explanation: - -- the whole file represents a dictionary with keys ``database``, ``counts``, ``users``, ``settings``, ``debug``, ``timeout`` -- ``database`` itself is a dictionary with keys ``host``, ``port``, and so on -- ``counts`` is a list (Python ``[10, 20, 30]``) -- ``literal_counts`` is a list too -- ``users`` is a list of dictionaries - -## Example Search Space - -```yaml -- node_type: embedding - metric: retrieval_hit_rate - search_space: - - module_name: retrieval - k: [10] - embedder_name: - - avsolatorio/GIST-small-Embedding-v0 - - infgrad/stella-base-en-v2 -- node_type: scoring - metric: scoring_roc_auc - search_space: - - module_name: knn - k: [1, 3, 5, 10] - weights: ["uniform", "distance", "closest"] - - module_name: linear - - module_name: dnnc - cross_encoder_name: - - BAAI/bge-reranker-base - - cross-encoder/ms-marco-MiniLM-L-6-v2 - k: [1, 3, 5, 10] -- node_type: decision - metric: decision_accuracy - search_space: - - module_name: threshold - thresh: [0.5] - - module_name: argmax -``` -""" diff --git a/user_guides/cli/02_basic_usage.py b/user_guides/cli/02_basic_usage.py deleted file mode 100644 index 43ea987b3..000000000 --- a/user_guides/cli/02_basic_usage.py +++ /dev/null @@ -1,117 +0,0 @@ -# %% [markdown] -""" -# Command Line Interface for Pipeline Auto Configuration - -## Data - -Just like with Python API, you can run an automatic pipeline configuration with just a prepared data set. - -You can use local JSON file: -```bash -autointent data.train_path="path/to/my.json" -``` - -Or dataset from Hugging Face hub: -```bash -autointent data.train_path="AutoIntent/banking77" -``` - -## Search Space - -You can provide custom search space, saved as YAML file (as explained in %mddoclink(notebook,cli.01_search_space)): -```bash -autointent data.train_path="AutoIntent/banking77" task.search_space_path="path/to/my/search/space.yaml" -``` - -## Logging Level - -AutoIntent provides comprehensive logs. You can enable it by changing default logging level: -```bash -autointent data.train_path="AutoIntent/banking77" hydra.job_logging.root.level=INFO -``` - -## All Options - -```yaml -data: -# Path to a json file with training data. Set to "default" to use AutoIntent/clinc150_subset from HF hub. - train_path: ??? - -# Path to a json file with test records. Skip this option if you want to use a random subset of the -# training sample as test data. - test_path: null - -# Set to true if your data is multiclass but you want to train the multilabel classifier. - force_multilabel: false - -task: -# Path to a yaml configuration file that defines the optimization search space. -# Omit this to use the default configuration. - search_space_path: null -logs: -# Name of the run prepended to optimization assets dirname (generated randomly if omitted) - run_name: "awful_hippo_10-30-2024_19-42-12" - -# Location where to save optimization logs that will be saved as `/_/logs.json`. -# Omit to use current working directory. <-- on Windows it is not correct - dirpath: "/home/user/AutoIntent/awful_hippo_10-30-2024_19-42-12" - - dump_dir: "/home/user/AutoIntent/runs/awful_hippo_10-30-2024_19-42-12/modules_dumps" - -vector_index: - -# Specify device in torch notation - device: cpu - -augmentation: -# Number of shots per intent to sample from regular expressions. This option extends sample utterance -# within multiclass intent records. - regex_sampling: 0 - -# Config string like "[20, 40, 20, 10]" means 20 one-label examples, 40 two-label examples, 20 three-label examples, -# 10 four-label examples. This option extends multilabel utterance records. - multilabel_generation_config: null - -embedder: -# batch size for embedding computation. - batch_size: 1 -# sentence length limit for embedding computation - max_length: null - -#Affects the randomness -seed: 0 - -# String from {DEBUG,INFO,WARNING,ERROR,CRITICAL}. Omit to use ERROR by default. -hydra.job_logging.root.level: "ERROR" -``` - -## Run from Config File - -Create a yaml file in a separate folder with the following structure **my_config.yaml**: -```yaml -defaults: -- optimization_config -- _self_ -- override hydra/job_logging: custom - -# put the configuration options you want to override here. The full structure is presented above. -# Here is just an example with the same options as for the command line variant above. -embedder: -embedder_batch_size: 32 -``` - -Launch AutoIntent: -```bash -autointent --config-path=/path/to/config/directory --config-name=my_config -``` - -Important: -* specify the full path in the config-path option. -* do not use tab in the yaml file. -* it is desirable that the file name differs from -optimization_config.yaml to avoid warnings from hydra - -You can use a combination of Option 1 and 2. Command line options have the highest priority. - -Example configs are stored in our GitHub repository in [example_configs](https://github.com/deeppavlov/AutoIntent/tree/dev/example_configs). -""" From b184516c22e02dedd96e8b71eaea854e41ffa4f6 Mon Sep 17 00:00:00 2001 From: voorhs Date: Thu, 16 Jan 2025 16:31:47 +0300 Subject: [PATCH 06/22] refactor configs to pydantic --- autointent/configs/_optimization.py | 81 +++++++++++------------------ 1 file changed, 30 insertions(+), 51 deletions(-) diff --git a/autointent/configs/_optimization.py b/autointent/configs/_optimization.py index 1e4d88b2e..8e206af54 100644 --- a/autointent/configs/_optimization.py +++ b/autointent/configs/_optimization.py @@ -1,13 +1,13 @@ """Configuration for the optimization process.""" -from dataclasses import dataclass, field from pathlib import Path +from pydantic import BaseModel, Field, ValidationInfo, field_validator + from ._name import get_run_name -@dataclass -class DataConfig: +class DataConfig(BaseModel): """Configuration for the data used in the optimization process.""" train_path: str | Path @@ -18,16 +18,14 @@ class DataConfig: """Force multilabel classification even if the data is multiclass""" -@dataclass -class TaskConfig: +class TaskConfig(BaseModel): """Configuration for the task to optimize.""" search_space_path: Path | None = None """Path to the search space configuration file. If None, the default search space will be used""" -@dataclass -class LoggingConfig: +class LoggingConfig(BaseModel): """Configuration for the logging.""" run_name: str | None = None @@ -44,53 +42,37 @@ class LoggingConfig: report_to: list[str] | None = None """List of callbacks to report to. If None, no callbacks will be used""" - def __post_init__(self) -> None: - """Define the run name, directory path and dump directory.""" - self.define_run_name() - self.define_dirpath() - self.define_dump_dir() - - def define_run_name(self) -> None: + @field_validator("run_name", mode="before") + @classmethod + def define_run_name(cls, v: str | None) -> str: """Define the run name. If None, a random name will be generated.""" - self.run_name = get_run_name(self.run_name) + return get_run_name(v) - def define_dirpath(self) -> None: + @field_validator("dirpath", mode="before") + @classmethod + def define_dirpath(cls, v: Path | None, info: ValidationInfo) -> Path: """Define the directory path. If None, the logs will be saved in the current working directory.""" - dirpath = Path.cwd() / "runs" if self.dirpath is None else self.dirpath - if self.run_name is None: - raise ValueError - self.dirpath = dirpath / self.run_name - - def get_dirpath(self) -> Path: - """Get the directory path.""" - if self.dirpath is None: - raise ValueError - return self.dirpath - - def get_run_name(self) -> str: - """Get the run name.""" - if self.run_name is None: - raise ValueError - return self.run_name - - def define_dump_dir(self) -> None: + if v is None: + v = Path.cwd() / "runs" + return v / info.data["run_name"] + + @field_validator("dump_dir", pre=True, always=True) + @classmethod + def define_dump_dir(cls, v: Path | None, info: ValidationInfo) -> Path: """Define the dump directory. If None, the modules will not be dumped.""" - if self.dump_dir is None: - if self.dirpath is None: - raise ValueError - self.dump_dir = self.dirpath / "modules_dumps" + if v is None: + v = info.data["dirpath"] / "modules_dumps" + return v -@dataclass -class VectorIndexConfig: +class VectorIndexConfig(BaseModel): """Configuration for the vector index.""" save_db: bool = False """Whether to save the vector index database or not""" -@dataclass -class TransformerConfig: +class TransformerConfig(BaseModel): """ Base class for configuration for the transformer. @@ -105,7 +87,6 @@ class TransformerConfig: """Device to use for the vector index. Can be 'cpu', 'cuda', 'cuda:0', 'mps', etc.""" -@dataclass class EmbedderConfig(TransformerConfig): """ Configuration for the embedder. @@ -119,7 +100,6 @@ class EmbedderConfig(TransformerConfig): """Whether to cache embeddings for reuse, improving performance in repeated operations.""" -@dataclass class CrossEncoderConfig(TransformerConfig): """ Configuration for the embedder. @@ -133,21 +113,20 @@ class CrossEncoderConfig(TransformerConfig): """Whether to train the ranking head of a cross encoder.""" -@dataclass -class OptimizationConfig: +class OptimizationConfig(BaseModel): """Configuration for the optimization process.""" data: DataConfig """Configuration for the data used in the optimization process""" - task: TaskConfig = field(default_factory=TaskConfig) + task: TaskConfig = Field(default_factory=TaskConfig) """Configuration for the task to optimize""" - logs: LoggingConfig = field(default_factory=LoggingConfig) + logs: LoggingConfig = Field(default_factory=LoggingConfig) """Configuration for the logging""" - vector_index: VectorIndexConfig = field(default_factory=VectorIndexConfig) + vector_index: VectorIndexConfig = Field(default_factory=VectorIndexConfig) """Configuration for the vector index""" - embedder: EmbedderConfig = field(default_factory=EmbedderConfig) + embedder: EmbedderConfig = Field(default_factory=EmbedderConfig) """Configuration for the embedder""" - cross_encoder: CrossEncoderConfig = field(default_factory=CrossEncoderConfig) + cross_encoder: CrossEncoderConfig = Field(default_factory=CrossEncoderConfig) """Configuration for the cross encoder""" seed: int = 0 """Seed for the random number generator""" From 9077bb8d4b0476a7999a2e777683413600d3930c Mon Sep 17 00:00:00 2001 From: voorhs Date: Thu, 16 Jan 2025 16:45:41 +0300 Subject: [PATCH 07/22] fix typing --- autointent/_pipeline/_pipeline.py | 4 ++-- autointent/configs/_optimization.py | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/autointent/_pipeline/_pipeline.py b/autointent/_pipeline/_pipeline.py index 6f709f5a9..f78a83838 100644 --- a/autointent/_pipeline/_pipeline.py +++ b/autointent/_pipeline/_pipeline.py @@ -94,8 +94,8 @@ def _fit(self, context: Context) -> None: self.context = context self._logger.info("starting pipeline optimization...") self.context.callback_handler.start_run( - run_name=self.context.logging_config.get_run_name(), - dirpath=self.context.logging_config.get_dirpath(), + run_name=self.context.logging_config.safe_run_name, + dirpath=self.context.logging_config.safe_dirpath, ) for node_type in NodeType: node_optimizer = self.nodes.get(node_type, None) diff --git a/autointent/configs/_optimization.py b/autointent/configs/_optimization.py index 8e206af54..71c565388 100644 --- a/autointent/configs/_optimization.py +++ b/autointent/configs/_optimization.py @@ -54,9 +54,9 @@ def define_dirpath(cls, v: Path | None, info: ValidationInfo) -> Path: """Define the directory path. If None, the logs will be saved in the current working directory.""" if v is None: v = Path.cwd() / "runs" - return v / info.data["run_name"] + return v / str(info.data["run_name"]) - @field_validator("dump_dir", pre=True, always=True) + @field_validator("dump_dir", mode="before") @classmethod def define_dump_dir(cls, v: Path | None, info: ValidationInfo) -> Path: """Define the dump directory. If None, the modules will not be dumped.""" @@ -64,6 +64,21 @@ def define_dump_dir(cls, v: Path | None, info: ValidationInfo) -> Path: v = info.data["dirpath"] / "modules_dumps" return v + @property + def safe_run_name(self) -> str: + # This property ensures that the type checker knows `run_name` is a `str` + if self.run_name is None: + msg = "run_name should not be None after validation" + raise ValueError(msg) + return self.run_name + + @property + def safe_dirpath(self) -> Path: + # This property ensures that the type checker knows `run_name` is a `str` + if self.dirpath is None: + msg = "dirpath should not be None after validation" + raise ValueError(msg) + return self.dirpath class VectorIndexConfig(BaseModel): """Configuration for the vector index.""" From 4a81fa97de434f122b44cbf67bc86d1d28233c0c Mon Sep 17 00:00:00 2001 From: voorhs Date: Thu, 16 Jan 2025 17:08:50 +0300 Subject: [PATCH 08/22] finally configure post initialization --- autointent/configs/_optimization.py | 38 ++++++++++++++++------------- tests/nodes/conftest.py | 2 +- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/autointent/configs/_optimization.py b/autointent/configs/_optimization.py index 71c565388..bc11d93ea 100644 --- a/autointent/configs/_optimization.py +++ b/autointent/configs/_optimization.py @@ -2,7 +2,7 @@ from pathlib import Path -from pydantic import BaseModel, Field, ValidationInfo, field_validator +from pydantic import BaseModel, Field, model_validator from ._name import get_run_name @@ -42,27 +42,30 @@ class LoggingConfig(BaseModel): report_to: list[str] | None = None """List of callbacks to report to. If None, no callbacks will be used""" - @field_validator("run_name", mode="before") - @classmethod - def define_run_name(cls, v: str | None) -> str: + @model_validator(mode="after") + def fill_nones(self) -> "LoggingConfig": + self.define_run_name() + self.define_dirpath() + self.define_dump_dir() + return self + + def define_run_name(self) -> None: """Define the run name. If None, a random name will be generated.""" - return get_run_name(v) + self.run_name = get_run_name(self.run_name) - @field_validator("dirpath", mode="before") - @classmethod - def define_dirpath(cls, v: Path | None, info: ValidationInfo) -> Path: + def define_dirpath(self) -> None: """Define the directory path. If None, the logs will be saved in the current working directory.""" - if v is None: - v = Path.cwd() / "runs" - return v / str(info.data["run_name"]) + dirpath = Path.cwd() / "runs" if self.dirpath is None else self.dirpath + if self.run_name is None: + raise ValueError + self.dirpath = dirpath / self.run_name - @field_validator("dump_dir", mode="before") - @classmethod - def define_dump_dir(cls, v: Path | None, info: ValidationInfo) -> Path: + def define_dump_dir(self) -> None: """Define the dump directory. If None, the modules will not be dumped.""" - if v is None: - v = info.data["dirpath"] / "modules_dumps" - return v + if self.dump_dir is None: + if self.dirpath is None: + raise ValueError + self.dump_dir = self.dirpath / "modules_dumps" @property def safe_run_name(self) -> str: @@ -80,6 +83,7 @@ def safe_dirpath(self) -> Path: raise ValueError(msg) return self.dirpath + class VectorIndexConfig(BaseModel): """Configuration for the vector index.""" diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index 9f046e7b9..b1e7e8e20 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -79,7 +79,7 @@ def get_context(multilabel): dump_dir, logs_dir = setup_environment() res = Context() - res.configure_data(DataConfig(get_dataset_path(), force_multilabel=multilabel)) + res.configure_data(DataConfig(train_path=get_dataset_path(), force_multilabel=multilabel)) res.configure_logging(LoggingConfig(dirpath=logs_dir, dump_dir=dump_dir, dump_modules=True)) res.configure_vector_index(VectorIndexConfig(), EmbedderConfig(device="cpu")) res.configure_cross_encoder(CrossEncoderConfig()) From 2ebfa4e03150a9e0259662a0949379a09f4cf959 Mon Sep 17 00:00:00 2001 From: voorhs Date: Thu, 16 Jan 2025 17:10:51 +0300 Subject: [PATCH 09/22] upd docstrings --- autointent/configs/_optimization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autointent/configs/_optimization.py b/autointent/configs/_optimization.py index bc11d93ea..d1c6142fb 100644 --- a/autointent/configs/_optimization.py +++ b/autointent/configs/_optimization.py @@ -69,7 +69,7 @@ def define_dump_dir(self) -> None: @property def safe_run_name(self) -> str: - # This property ensures that the type checker knows `run_name` is a `str` + """Use this method for type safety instead of :py:attr:`LoggingConfig.run_name`.""" if self.run_name is None: msg = "run_name should not be None after validation" raise ValueError(msg) @@ -77,7 +77,7 @@ def safe_run_name(self) -> str: @property def safe_dirpath(self) -> Path: - # This property ensures that the type checker knows `run_name` is a `str` + """Use this method for type safety instead of :py:attr:`LoggingConfig.dirpath`.""" if self.dirpath is None: msg = "dirpath should not be None after validation" raise ValueError(msg) From 56a635c3e3882e65192f720f3305a283685a72be Mon Sep 17 00:00:00 2001 From: voorhs Date: Thu, 16 Jan 2025 17:16:58 +0300 Subject: [PATCH 10/22] remove CLI from docs --- CONTRIBUTING.md | 35 ------------------- example_configs/example_1.yaml | 11 ------ example_configs/example_2.yaml | 15 -------- example_configs/example_3.yaml | 11 ------ example_configs/example_4.yaml | 9 ----- testbed.py | 63 ++++++++++++++++++++++++++++++++++ 6 files changed, 63 insertions(+), 81 deletions(-) delete mode 100644 example_configs/example_1.yaml delete mode 100644 example_configs/example_2.yaml delete mode 100644 example_configs/example_3.yaml delete mode 100644 example_configs/example_4.yaml create mode 100644 testbed.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bd8995f7d..9f8fb984b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -50,41 +50,6 @@ make lint ![](assets/dependency-graph.png) -## Настройка логгера -Чтобы видеть debug строчки у вас есть несколько опций: - -1. Включить весь debug output через опцию коммандной строки: -```bash -autointent hydra.verbose=true -``` -2. Включить debug output только для определенных модулей, пример для autointent.pipeline.optimization.cli_endpoint и самой hydra: -```bash -autointent hydra.verbose=[hydra,autointent/pipeline/optimization/cli_endpoint] hydra.job_logging.root.level=DEBUG -``` - -Само конфигурирование логгера сделано в autointent.configs.optimization_cli.logger_config. Вы можете изменить любой параметр логгера через коммандную строку. Вот пример, как поменять уровень логгера на ERROR: -```bash -autointent hydra.job_logging.root.level=ERROR -``` - -Еще можно изменить параметры логгера через yaml файлы: -1. Создадим папку с конфиг. файлами: test_config -2. test_config/config.yaml: -```yaml -defaults: - - optimization_config - - _self_ - - override hydra/job_logging: custom - -# set your config params for optimization here -embedder_batch_size: 32 -``` -3. Поместите конфигурацию логгера в test_config/hydra/job_logging/custom.yaml (параметры см. [здесь](https://docs.python.org/3/howto/logging.html)) -4. Запускаем с конфиг файлом config.yaml: -```bash -autointent --config-path FULL_PATH/test_config --config-name config -``` - ## Построение документации Построить html версию в папке `docs/build`: diff --git a/example_configs/example_1.yaml b/example_configs/example_1.yaml deleted file mode 100644 index 0f99562be..000000000 --- a/example_configs/example_1.yaml +++ /dev/null @@ -1,11 +0,0 @@ -defaults: - - optimization_config - - _self_ - -data: - train_path: "default-multilabel" - -hydra: - job_logging: - root: - level: "INFO" diff --git a/example_configs/example_2.yaml b/example_configs/example_2.yaml deleted file mode 100644 index 5ad97ec62..000000000 --- a/example_configs/example_2.yaml +++ /dev/null @@ -1,15 +0,0 @@ -defaults: - - optimization_config - - _self_ - -data: - train_path: "data/intent_records/ac_robotic_new.json" - force_multilabel: true - -logs: - dirpath: "experiments/multiclass_as_multilabel/" - run_name: "robotics_new_testing" - -augmentation: - regex_sampling: 10 - multilabel_generation_config: "[0, 4000, 1000]" diff --git a/example_configs/example_3.yaml b/example_configs/example_3.yaml deleted file mode 100644 index 3c8442305..000000000 --- a/example_configs/example_3.yaml +++ /dev/null @@ -1,11 +0,0 @@ -defaults: - - optimization_config - - _self_ - -data: - train_path: "data/intent_records/ac_robotic_new.json" - test_path: "data/intent_records/ac_robotic_val.json" - force_multilabel: true - -augmentation: - regex_sampling: 20 diff --git a/example_configs/example_4.yaml b/example_configs/example_4.yaml deleted file mode 100644 index 06b24883b..000000000 --- a/example_configs/example_4.yaml +++ /dev/null @@ -1,9 +0,0 @@ -defaults: - - optimization_config - - _self_ - -data: - train_path: "default-multiclass" - test_path: "data/intent_records/banking77_test.json" - -seed: 42 diff --git a/testbed.py b/testbed.py new file mode 100644 index 000000000..98d098b43 --- /dev/null +++ b/testbed.py @@ -0,0 +1,63 @@ +"""Sample experiment.""" + +from pathlib import Path + +from autointent import Dataset, Pipeline +from autointent.configs import EmbedderConfig, LoggingConfig + +search_space = [ + { + "node_type": "embedding", + "metric": "retrieval_hit_rate", + "search_space": [ + { + "module_name": "retrieval", + "k": [10], + "embedder_name": ["avsolatorio/GIST-small-Embedding-v0", "infgrad/stella-base-en-v2"], + } + ], + }, + { + "node_type": "scoring", + "metric": "scoring_roc_auc", + "search_space": [ + { + "module_name": "knn", + "k": [1, 3, 5, 10], + "weights": ["uniform", "distance", "closest"], + }, + {"module_name": "linear"}, + { + "module_name": "dnnc", + "cross_encoder_name": [ + "BAAI/bge-reranker-base", + "cross-encoder/ms-marco-MiniLM-L-6-v2", + ], + "k": [1, 3, 5], + }, + ], + }, + { + "node_type": "decision", + "metric": "decision_accuracy", + "search_space": [ + {"module_name": "threshold", "thresh": [0.5]}, + {"module_name": "argmax"}, + ], + }, +] + +log_config = LoggingConfig( + report_to=["tensorboard"], + dirpath=Path("clinc150_retrieval_hitrate_wandb"), + dump_modules=False, + run_name="bug_with_memory", + clear_ram=True, +) +emb_config = EmbedderConfig(batch_size=16, device="cuda") + +dataset = Dataset.from_hub("AutoIntent/clinc150_aug_qwen2.5-7b-awq") +pipeline_optimizer = Pipeline.from_search_space(search_space) +pipeline_optimizer.set_config(log_config) +pipeline_optimizer.set_config(emb_config) +pipeline_optimizer.fit(dataset) From 683538fd21f431fa703c2d62dffb9524e012717c Mon Sep 17 00:00:00 2001 From: voorhs Date: Thu, 16 Jan 2025 17:18:57 +0300 Subject: [PATCH 11/22] remove hydra from poetry --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5b1090840..2e12d2671 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,6 @@ dependencies = [ "openai (>=1.59.6,<2.0.0)", "datasets (>=3.2.0,<4.0.0)", "xxhash (>=3.5.0,<4.0.0)", - "hydra-core (>=1.3.2,<2.0.0)" ] [project.urls] From 7f04befa7d7c5b8768ae8c5b33d10d307a362ac2 Mon Sep 17 00:00:00 2001 From: voorhs Date: Thu, 16 Jan 2025 17:38:18 +0300 Subject: [PATCH 12/22] remove omegaconf --- autointent/context/_utils.py | 9 +++------ pyproject.toml | 1 - 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/autointent/context/_utils.py b/autointent/context/_utils.py index fd2ac99fd..ddf7ce9fa 100644 --- a/autointent/context/_utils.py +++ b/autointent/context/_utils.py @@ -9,22 +9,21 @@ from typing import Any import numpy as np -from omegaconf import ListConfig from autointent import Dataset class NumpyEncoder(json.JSONEncoder): """ - JSON encoder that handles numpy data types and OmegaConf ListConfig. + JSON encoder that handles numpy data types. This encoder extends the default `json.JSONEncoder` to serialize numpy - arrays, numpy data types, and OmegaConf ListConfig objects. + arrays, numpy data types. """ def default(self, obj: Any) -> str | int | float | list[Any] | Any: # noqa: ANN401 """ - Serialize objects with special handling for numpy and OmegaConf types. + Serialize objects with special handling for numpy. :param obj: Object to serialize. :return: JSON-serializable representation of the object. @@ -35,8 +34,6 @@ def default(self, obj: Any) -> str | int | float | list[Any] | Any: # noqa: ANN return float(obj) if isinstance(obj, np.ndarray): return obj.tolist() - if isinstance(obj, ListConfig): - return list(obj) return super().default(obj) diff --git a/pyproject.toml b/pyproject.toml index 2e12d2671..b0b3cc55e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,7 +181,6 @@ module = [ "appdirs", "sre_yield", "skmultilearn.model_selection", - "omegaconf", "hydra", "hydra.*", "transformers", From c68a08cac3bb2122e07b71ce355c9eb9c4cfbc88 Mon Sep 17 00:00:00 2001 From: voorhs Date: Thu, 16 Jan 2025 18:37:24 +0300 Subject: [PATCH 13/22] remove file that was added accidentally --- testbed.py | 63 ------------------------------------------------------ 1 file changed, 63 deletions(-) delete mode 100644 testbed.py diff --git a/testbed.py b/testbed.py deleted file mode 100644 index 98d098b43..000000000 --- a/testbed.py +++ /dev/null @@ -1,63 +0,0 @@ -"""Sample experiment.""" - -from pathlib import Path - -from autointent import Dataset, Pipeline -from autointent.configs import EmbedderConfig, LoggingConfig - -search_space = [ - { - "node_type": "embedding", - "metric": "retrieval_hit_rate", - "search_space": [ - { - "module_name": "retrieval", - "k": [10], - "embedder_name": ["avsolatorio/GIST-small-Embedding-v0", "infgrad/stella-base-en-v2"], - } - ], - }, - { - "node_type": "scoring", - "metric": "scoring_roc_auc", - "search_space": [ - { - "module_name": "knn", - "k": [1, 3, 5, 10], - "weights": ["uniform", "distance", "closest"], - }, - {"module_name": "linear"}, - { - "module_name": "dnnc", - "cross_encoder_name": [ - "BAAI/bge-reranker-base", - "cross-encoder/ms-marco-MiniLM-L-6-v2", - ], - "k": [1, 3, 5], - }, - ], - }, - { - "node_type": "decision", - "metric": "decision_accuracy", - "search_space": [ - {"module_name": "threshold", "thresh": [0.5]}, - {"module_name": "argmax"}, - ], - }, -] - -log_config = LoggingConfig( - report_to=["tensorboard"], - dirpath=Path("clinc150_retrieval_hitrate_wandb"), - dump_modules=False, - run_name="bug_with_memory", - clear_ram=True, -) -emb_config = EmbedderConfig(batch_size=16, device="cuda") - -dataset = Dataset.from_hub("AutoIntent/clinc150_aug_qwen2.5-7b-awq") -pipeline_optimizer = Pipeline.from_search_space(search_space) -pipeline_optimizer.set_config(log_config) -pipeline_optimizer.set_config(emb_config) -pipeline_optimizer.fit(dataset) From 03fd0cb95086f605f50063485613ae1e93cb31d1 Mon Sep 17 00:00:00 2001 From: voorhs Date: Mon, 20 Jan 2025 11:54:08 +0300 Subject: [PATCH 14/22] remove optimization config --- autointent/configs/__init__.py | 2 -- autointent/configs/_optimization.py | 21 +-------------------- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/autointent/configs/__init__.py b/autointent/configs/__init__.py index 6eaba133b..783ff1024 100644 --- a/autointent/configs/__init__.py +++ b/autointent/configs/__init__.py @@ -6,7 +6,6 @@ DataConfig, EmbedderConfig, LoggingConfig, - OptimizationConfig, TaskConfig, VectorIndexConfig, ) @@ -18,7 +17,6 @@ "InferenceNodeConfig", "InferenceNodeConfig", "LoggingConfig", - "OptimizationConfig", "TaskConfig", "VectorIndexConfig", ] diff --git a/autointent/configs/_optimization.py b/autointent/configs/_optimization.py index d1c6142fb..81816caf4 100644 --- a/autointent/configs/_optimization.py +++ b/autointent/configs/_optimization.py @@ -2,7 +2,7 @@ from pathlib import Path -from pydantic import BaseModel, Field, model_validator +from pydantic import BaseModel, model_validator from ._name import get_run_name @@ -130,22 +130,3 @@ class CrossEncoderConfig(TransformerConfig): train_head: bool = False """Whether to train the ranking head of a cross encoder.""" - - -class OptimizationConfig(BaseModel): - """Configuration for the optimization process.""" - - data: DataConfig - """Configuration for the data used in the optimization process""" - task: TaskConfig = Field(default_factory=TaskConfig) - """Configuration for the task to optimize""" - logs: LoggingConfig = Field(default_factory=LoggingConfig) - """Configuration for the logging""" - vector_index: VectorIndexConfig = Field(default_factory=VectorIndexConfig) - """Configuration for the vector index""" - embedder: EmbedderConfig = Field(default_factory=EmbedderConfig) - """Configuration for the embedder""" - cross_encoder: CrossEncoderConfig = Field(default_factory=CrossEncoderConfig) - """Configuration for the cross encoder""" - seed: int = 0 - """Seed for the random number generator""" From 12311ad04ddf5253ab89aea0c734ecf224a64f74 Mon Sep 17 00:00:00 2001 From: voorhs Date: Mon, 20 Jan 2025 12:05:04 +0300 Subject: [PATCH 15/22] refactor `LoggingConfig` --- autointent/_pipeline/_pipeline.py | 4 +-- autointent/configs/_optimization.py | 56 +++++++---------------------- 2 files changed, 14 insertions(+), 46 deletions(-) diff --git a/autointent/_pipeline/_pipeline.py b/autointent/_pipeline/_pipeline.py index f78a83838..f51e06d05 100644 --- a/autointent/_pipeline/_pipeline.py +++ b/autointent/_pipeline/_pipeline.py @@ -94,8 +94,8 @@ def _fit(self, context: Context) -> None: self.context = context self._logger.info("starting pipeline optimization...") self.context.callback_handler.start_run( - run_name=self.context.logging_config.safe_run_name, - dirpath=self.context.logging_config.safe_dirpath, + run_name=self.context.logging_config.run_name, + dirpath=self.context.logging_config.dirpath, ) for node_type in NodeType: node_optimizer = self.nodes.get(node_type, None) diff --git a/autointent/configs/_optimization.py b/autointent/configs/_optimization.py index 81816caf4..51fd6a542 100644 --- a/autointent/configs/_optimization.py +++ b/autointent/configs/_optimization.py @@ -2,7 +2,7 @@ from pathlib import Path -from pydantic import BaseModel, model_validator +from pydantic import BaseModel, Field from ._name import get_run_name @@ -28,13 +28,8 @@ class TaskConfig(BaseModel): class LoggingConfig(BaseModel): """Configuration for the logging.""" - run_name: str | None = None + run_name: str = Field(default_factory=get_run_name) """Name of the run. If None, a random name will be generated""" - dirpath: Path | None = None - """Path to the directory where the logs will be saved. - If None, the logs will be saved in the current working directory""" - dump_dir: Path | None = None - """Path to the directory where the modules will be dumped. If None, the modules will not be dumped""" dump_modules: bool = False """Whether to dump the modules or not""" clear_ram: bool = False @@ -42,46 +37,19 @@ class LoggingConfig(BaseModel): report_to: list[str] | None = None """List of callbacks to report to. If None, no callbacks will be used""" - @model_validator(mode="after") - def fill_nones(self) -> "LoggingConfig": - self.define_run_name() - self.define_dirpath() - self.define_dump_dir() - return self - - def define_run_name(self) -> None: - """Define the run name. If None, a random name will be generated.""" - self.run_name = get_run_name(self.run_name) - - def define_dirpath(self) -> None: - """Define the directory path. If None, the logs will be saved in the current working directory.""" - dirpath = Path.cwd() / "runs" if self.dirpath is None else self.dirpath - if self.run_name is None: - raise ValueError - self.dirpath = dirpath / self.run_name - - def define_dump_dir(self) -> None: - """Define the dump directory. If None, the modules will not be dumped.""" - if self.dump_dir is None: - if self.dirpath is None: - raise ValueError - self.dump_dir = self.dirpath / "modules_dumps" - @property - def safe_run_name(self) -> str: - """Use this method for type safety instead of :py:attr:`LoggingConfig.run_name`.""" - if self.run_name is None: - msg = "run_name should not be None after validation" - raise ValueError(msg) - return self.run_name + def dirpath(self) -> Path: + """Path to the directory where the logs will be saved.""" + if not hasattr(self, "_dirpath"): + self._dirpath = Path.cwd() / "runs" / self.run_name + return self._dirpath @property - def safe_dirpath(self) -> Path: - """Use this method for type safety instead of :py:attr:`LoggingConfig.dirpath`.""" - if self.dirpath is None: - msg = "dirpath should not be None after validation" - raise ValueError(msg) - return self.dirpath + def dump_dir(self) -> Path: + """Path to the directory where the modules will be dumped.""" + if not hasattr(self, "_dump_dir"): + self._dump_dir = self.dirpath / "modules_dumps" + return self._dump_dir class VectorIndexConfig(BaseModel): From eb0cd8057379eab43eff90e8d3ee24725049232e Mon Sep 17 00:00:00 2001 From: voorhs Date: Mon, 20 Jan 2025 12:17:09 +0300 Subject: [PATCH 16/22] remove `test_path` and `force_multilabel` params everywhere --- autointent/_pipeline/_pipeline.py | 1 - autointent/configs/_optimization.py | 4 ---- autointent/context/_context.py | 5 +---- autointent/context/data_handler/_data_handler.py | 7 +------ tests/_transformers/test_nli_transformer.py | 2 +- tests/modules/decision/conftest.py | 2 +- tests/modules/scoring/test_description.py | 4 +++- tests/modules/scoring/test_mlknn.py | 2 +- tests/nodes/conftest.py | 8 +++++--- 9 files changed, 13 insertions(+), 22 deletions(-) diff --git a/autointent/_pipeline/_pipeline.py b/autointent/_pipeline/_pipeline.py index f51e06d05..de120c6ce 100644 --- a/autointent/_pipeline/_pipeline.py +++ b/autointent/_pipeline/_pipeline.py @@ -119,7 +119,6 @@ def fit(self, dataset: Dataset) -> Context: Optimize the pipeline from dataset. :param dataset: Dataset for optimization - :param force_multilabel: Whether to force multilabel or not :return: Context """ if self._is_inference(): diff --git a/autointent/configs/_optimization.py b/autointent/configs/_optimization.py index 51fd6a542..d13a3402c 100644 --- a/autointent/configs/_optimization.py +++ b/autointent/configs/_optimization.py @@ -12,10 +12,6 @@ class DataConfig(BaseModel): train_path: str | Path """Path to the training data. Can be local path or HF repo.""" - test_path: Path | None = None - """Path to the testing data. If None, no testing data will be used""" - force_multilabel: bool = False - """Force multilabel classification even if the data is multiclass""" class TaskConfig(BaseModel): diff --git a/autointent/context/_context.py b/autointent/context/_context.py index 73cb171ae..ce14d2041 100644 --- a/autointent/context/_context.py +++ b/autointent/context/_context.py @@ -83,19 +83,16 @@ def configure_data(self, config: DataConfig) -> None: self.data_handler = DataHandler( dataset=load_data(config.train_path), random_seed=self.seed, - force_multilabel=config.force_multilabel, ) - def set_dataset(self, dataset: Dataset, force_multilabel: bool = False) -> None: + def set_dataset(self, dataset: Dataset) -> None: """ Set the datasets for training, validation and testing. :param dataset: Dataset. - :param force_multilabel: Whether to force multilabel classification. """ self.data_handler = DataHandler( dataset=dataset, - force_multilabel=force_multilabel, random_seed=self.seed, ) diff --git a/autointent/context/data_handler/_data_handler.py b/autointent/context/data_handler/_data_handler.py index 11fae5e4a..8d7f159c4 100644 --- a/autointent/context/data_handler/_data_handler.py +++ b/autointent/context/data_handler/_data_handler.py @@ -29,14 +29,11 @@ class RegexPatterns(TypedDict): class DataHandler: """Data handler class.""" - def __init__( - self, dataset: Dataset, force_multilabel: bool = False, random_seed: int = 0, split_train: bool = True - ) -> None: + def __init__(self, dataset: Dataset, random_seed: int = 0, split_train: bool = True) -> None: """ Initialize the data handler. :param dataset: Training dataset. - :param force_multilabel: If True, force the dataset to be multilabel. :param random_seed: Seed for random number generation. :param split_train: Perform or not splitting of train (default to split to be used in scoring and threshold search). @@ -44,8 +41,6 @@ def __init__( set_seed(random_seed) self.dataset = dataset - if force_multilabel: - self.dataset = self.dataset.to_multilabel() self.n_classes = self.dataset.n_classes diff --git a/tests/_transformers/test_nli_transformer.py b/tests/_transformers/test_nli_transformer.py index b41a3cc3c..d1d40bc7a 100644 --- a/tests/_transformers/test_nli_transformer.py +++ b/tests/_transformers/test_nli_transformer.py @@ -9,7 +9,7 @@ @pytest.fixture def data_handler(): data_path = ires.files("tests.assets.data").joinpath("clinc_subset.json") - return DataHandler(dataset=Dataset.from_json(data_path), random_seed=42, force_multilabel=False, split_train=False) + return DataHandler(dataset=Dataset.from_json(data_path), random_seed=42, split_train=False) def test_nli_transformer_predict_without_trained_head(data_handler): diff --git a/tests/modules/decision/conftest.py b/tests/modules/decision/conftest.py index b08b4dc3b..7bd415663 100644 --- a/tests/modules/decision/conftest.py +++ b/tests/modules/decision/conftest.py @@ -29,7 +29,7 @@ def multiclass_fit_data(dataset): def multilabel_fit_data(dataset): dump_dir, logs_dir = setup_environment() - data_handler = DataHandler(dataset, force_multilabel=True) + data_handler = DataHandler(dataset.to_multilabel()) knn_params = { "k": 3, diff --git a/tests/modules/scoring/test_description.py b/tests/modules/scoring/test_description.py index ba22f3e59..f3901a7cf 100644 --- a/tests/modules/scoring/test_description.py +++ b/tests/modules/scoring/test_description.py @@ -15,7 +15,9 @@ ) def test_description_scorer(dataset, expected_prediction, multilabel): dump_dir, logs_dir = setup_environment() - data_handler = DataHandler(dataset, force_multilabel=multilabel) + if multilabel: + dataset = dataset.to_multilabel() + data_handler = DataHandler(dataset) scorer = DescriptionScorer(embedder_name="sergeyzh/rubert-tiny-turbo", temperature=0.3, embedder_device="cpu") diff --git a/tests/modules/scoring/test_mlknn.py b/tests/modules/scoring/test_mlknn.py index 3cb18f1c2..33582235c 100644 --- a/tests/modules/scoring/test_mlknn.py +++ b/tests/modules/scoring/test_mlknn.py @@ -23,7 +23,7 @@ def test_base_mlknn(dataset): ], ) - data_handler = DataHandler(dataset, force_multilabel=True) + data_handler = DataHandler(dataset.to_multilabel()) scorer = MLKnnScorer(embedder_name="sergeyzh/rubert-tiny-turbo", k=3, embedder_device="cpu") scorer.fit(data_handler.train_utterances(0), data_handler.train_labels(0)) diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index b1e7e8e20..2966aa9e4 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -1,9 +1,8 @@ import pytest -from autointent import Context +from autointent import Context, Dataset from autointent.configs import ( CrossEncoderConfig, - DataConfig, EmbedderConfig, LoggingConfig, VectorIndexConfig, @@ -79,7 +78,10 @@ def get_context(multilabel): dump_dir, logs_dir = setup_environment() res = Context() - res.configure_data(DataConfig(train_path=get_dataset_path(), force_multilabel=multilabel)) + dataset = Dataset.from_json(get_dataset_path()) + if multilabel: + dataset = dataset.to_multilabel() + res.set_dataset(dataset) res.configure_logging(LoggingConfig(dirpath=logs_dir, dump_dir=dump_dir, dump_modules=True)) res.configure_vector_index(VectorIndexConfig(), EmbedderConfig(device="cpu")) res.configure_cross_encoder(CrossEncoderConfig()) From 80e3d1ed23bd2bc778b690f027cf4cce007c114d Mon Sep 17 00:00:00 2001 From: voorhs Date: Mon, 20 Jan 2025 12:34:44 +0300 Subject: [PATCH 17/22] refactor logs directory structure --- autointent/configs/_optimization.py | 4 +++- tests/callback/test_callback.py | 5 ++--- tests/conftest.py | 7 ++----- tests/nodes/conftest.py | 4 ++-- tests/pipeline/test_inference.py | 9 ++++----- tests/pipeline/test_optimization.py | 15 +++++++-------- 6 files changed, 20 insertions(+), 24 deletions(-) diff --git a/autointent/configs/_optimization.py b/autointent/configs/_optimization.py index d13a3402c..610eece5b 100644 --- a/autointent/configs/_optimization.py +++ b/autointent/configs/_optimization.py @@ -24,6 +24,8 @@ class TaskConfig(BaseModel): class LoggingConfig(BaseModel): """Configuration for the logging.""" + project_dir: Path = Field(default_factory=lambda: Path.cwd() / "runs") + """Path to the directory with different runs.""" run_name: str = Field(default_factory=get_run_name) """Name of the run. If None, a random name will be generated""" dump_modules: bool = False @@ -37,7 +39,7 @@ class LoggingConfig(BaseModel): def dirpath(self) -> Path: """Path to the directory where the logs will be saved.""" if not hasattr(self, "_dirpath"): - self._dirpath = Path.cwd() / "runs" / self.run_name + self._dirpath = self.project_dir / self.run_name return self._dirpath @property diff --git a/tests/callback/test_callback.py b/tests/callback/test_callback.py index cfc529d52..715640e5f 100644 --- a/tests/callback/test_callback.py +++ b/tests/callback/test_callback.py @@ -1,4 +1,3 @@ -from pathlib import Path from typing import Any import numpy as np @@ -43,7 +42,7 @@ def log_final_metrics(self, **kwargs: dict[str, Any]) -> None: def test_pipeline_callbacks(): - dump_dir, logs_dir = setup_environment() + project_dir = setup_environment() dataset = Dataset.from_hub("AutoIntent/clinc150_subset") search_space = [ @@ -76,7 +75,7 @@ def test_pipeline_callbacks(): context = Context() context.configure_vector_index(VectorIndexConfig(save_db=True)) context.configure_logging( - LoggingConfig(run_name="dummy_run_name", dirpath=Path(logs_dir).resolve(), dump_modules=False) + LoggingConfig(run_name="dummy_run_name", project_dir=project_dir, dump_modules=False) ) context.callback_handler = CallbackHandler([DummyCallback]) context.set_dataset(dataset) diff --git a/tests/conftest.py b/tests/conftest.py index 91edd194d..75c0fc31a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,16 +1,13 @@ import importlib.resources as ires from pathlib import Path -from uuid import uuid4 import pytest from autointent import Dataset -def setup_environment() -> tuple[Path, Path, Path]: - logs_dir = ires.files("tests").joinpath("logs") / str(uuid4()) - dump_dir = logs_dir / "modules_dump" - return dump_dir, logs_dir +def setup_environment() -> Path: + return ires.files("tests").joinpath("logs") def get_dataset_path(): diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index 2966aa9e4..9a0d6a7f7 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -75,14 +75,14 @@ def scoring_optimizer_multilabel(embedding_optimizer_multilabel): def get_context(multilabel): - dump_dir, logs_dir = setup_environment() + project_dir = setup_environment() res = Context() dataset = Dataset.from_json(get_dataset_path()) if multilabel: dataset = dataset.to_multilabel() res.set_dataset(dataset) - res.configure_logging(LoggingConfig(dirpath=logs_dir, dump_dir=dump_dir, dump_modules=True)) + res.configure_logging(LoggingConfig(project_dir=project_dir, dump_modules=True)) res.configure_vector_index(VectorIndexConfig(), EmbedderConfig(device="cpu")) res.configure_cross_encoder(CrossEncoderConfig()) return res diff --git a/tests/pipeline/test_inference.py b/tests/pipeline/test_inference.py index 8fe9048d1..ee1d176c6 100644 --- a/tests/pipeline/test_inference.py +++ b/tests/pipeline/test_inference.py @@ -1,5 +1,4 @@ import importlib.resources as ires -from pathlib import Path from typing import Literal import pytest @@ -26,12 +25,12 @@ def get_search_space(task_type: TaskType): ["multiclass", "multilabel", "description"], ) def test_inference_config(dataset, task_type): - dump_dir, logs_dir = setup_environment() + project_dir = setup_environment() search_space = get_search_space(task_type) pipeline_optimizer = Pipeline.from_search_space(search_space) - pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_modules=True, clear_ram=True)) + pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True)) pipeline_optimizer.set_config(VectorIndexConfig(save_db=True)) pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32, device="cpu")) pipeline_optimizer.set_config(CrossEncoderConfig()) @@ -61,12 +60,12 @@ def test_inference_config(dataset, task_type): ["multiclass", "multilabel", "description"], ) def test_inference_context(dataset, task_type): - dump_dir, logs_dir = setup_environment() + project_dir = setup_environment() search_space = get_search_space(task_type) pipeline = Pipeline.from_search_space(search_space) - pipeline.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_modules=False, clear_ram=False)) + pipeline.set_config(LoggingConfig(project_dir=project_dir, dump_modules=False, clear_ram=False)) pipeline.set_config(VectorIndexConfig(save_db=True)) pipeline.set_config(EmbedderConfig(batch_size=16, max_length=32, device="cpu")) diff --git a/tests/pipeline/test_optimization.py b/tests/pipeline/test_optimization.py index e11c92fc4..07ca1867a 100644 --- a/tests/pipeline/test_optimization.py +++ b/tests/pipeline/test_optimization.py @@ -1,6 +1,5 @@ import importlib.resources as ires import os -from pathlib import Path from typing import Literal import pytest @@ -31,12 +30,12 @@ def get_search_space(task_type: TaskType): ["multiclass", "multilabel", "description"], ) def test_no_context_optimization(dataset, task_type): - dump_dir, logs_dir = setup_environment() + project_dir = setup_environment() search_space = get_search_space(task_type) pipeline_optimizer = Pipeline.from_search_space(search_space) - pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_modules=False)) + pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=False)) pipeline_optimizer.set_config(VectorIndexConfig()) pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32, device="cpu")) @@ -52,12 +51,12 @@ def test_no_context_optimization(dataset, task_type): ["multiclass", "multilabel", "description"], ) def test_save_db(dataset, task_type): - dump_dir, logs_dir = setup_environment() + project_dir = setup_environment() search_space = get_search_space(task_type) pipeline_optimizer = Pipeline.from_search_space(search_space) - pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_modules=False)) + pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=False)) pipeline_optimizer.set_config(VectorIndexConfig(save_db=True)) pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32, device="cpu")) @@ -73,12 +72,12 @@ def test_save_db(dataset, task_type): ["multiclass", "multilabel", "description"], ) def test_dump_modules(dataset, task_type): - dump_dir, logs_dir = setup_environment() + project_dir = setup_environment() search_space = get_search_space(task_type) pipeline_optimizer = Pipeline.from_search_space(search_space) - pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_dir=dump_dir, dump_modules=True)) + pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True)) pipeline_optimizer.set_config(VectorIndexConfig()) pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32, device="cpu")) @@ -88,4 +87,4 @@ def test_dump_modules(dataset, task_type): context = pipeline_optimizer.fit(dataset) context.dump() - assert os.listdir(dump_dir) + assert os.listdir(pipeline_optimizer.logging_config.dump_dir) From 7e991e0ca269d143d105b07830e4b833ef981570 Mon Sep 17 00:00:00 2001 From: voorhs Date: Mon, 20 Jan 2025 12:35:11 +0300 Subject: [PATCH 18/22] remove unnecessary `setup_environment()` calls --- tests/context/datahandler/test_multilabel_generation.py | 2 -- tests/context/test_vector_index.py | 2 -- tests/modules/decision/conftest.py | 5 ----- tests/modules/retrieval/test_vectordb.py | 2 -- tests/modules/scoring/test_description.py | 2 -- tests/modules/scoring/test_dnnc.py | 3 --- tests/modules/scoring/test_knn.py | 3 --- tests/modules/scoring/test_linear.py | 2 -- tests/modules/scoring/test_mlknn.py | 3 --- tests/modules/scoring/test_rerank_scorer.py | 3 --- 10 files changed, 27 deletions(-) diff --git a/tests/context/datahandler/test_multilabel_generation.py b/tests/context/datahandler/test_multilabel_generation.py index 9a4dd94c8..e5497812c 100644 --- a/tests/context/datahandler/test_multilabel_generation.py +++ b/tests/context/datahandler/test_multilabel_generation.py @@ -4,7 +4,6 @@ from autointent import VectorIndex from autointent.context.data_handler import DataHandler -from tests.conftest import setup_environment @pytest.fixture @@ -16,6 +15,5 @@ def mock_data_handler(): def test_vector_index_initialization(): - dump_dir, logs_dir = setup_environment() index = VectorIndex(embedder_device="cpu", embedder_model_name="sergeyzh/rubert-tiny-turbo") assert index.embedder_device == "cpu" diff --git a/tests/context/test_vector_index.py b/tests/context/test_vector_index.py index 50e17e565..a0805e7a9 100644 --- a/tests/context/test_vector_index.py +++ b/tests/context/test_vector_index.py @@ -1,7 +1,6 @@ import pytest from autointent import VectorIndex -from tests.conftest import setup_environment @pytest.fixture @@ -14,7 +13,6 @@ class MockDataHandler: def test_create_collection(data_handler): - dump_dir, logs_dir = setup_environment() vector_index = VectorIndex(embedder_model_name="bert-base-uncased", embedder_device="cpu") vector_index.add( data_handler.utterances_train, diff --git a/tests/modules/decision/conftest.py b/tests/modules/decision/conftest.py index 7bd415663..5ce3d6932 100644 --- a/tests/modules/decision/conftest.py +++ b/tests/modules/decision/conftest.py @@ -3,13 +3,10 @@ from autointent.context.data_handler import DataHandler from autointent.modules import KNNScorer -from tests.conftest import setup_environment @pytest.fixture def multiclass_fit_data(dataset): - dump_dir, logs_dir = setup_environment() - data_handler = DataHandler(dataset) knn_params = { @@ -27,8 +24,6 @@ def multiclass_fit_data(dataset): @pytest.fixture def multilabel_fit_data(dataset): - dump_dir, logs_dir = setup_environment() - data_handler = DataHandler(dataset.to_multilabel()) knn_params = { diff --git a/tests/modules/retrieval/test_vectordb.py b/tests/modules/retrieval/test_vectordb.py index 60d6d534e..778c0f6a8 100644 --- a/tests/modules/retrieval/test_vectordb.py +++ b/tests/modules/retrieval/test_vectordb.py @@ -1,9 +1,7 @@ from autointent.modules.embedding import RetrievalEmbedding -from tests.conftest import setup_environment def test_get_assets_returns_correct_artifact(): - dump_dir, logs_dir = setup_environment() module = RetrievalEmbedding(k=5, embedder_name="sergeyzh/rubert-tiny-turbo") artifact = module.get_assets() assert artifact.embedder_name == "sergeyzh/rubert-tiny-turbo" diff --git a/tests/modules/scoring/test_description.py b/tests/modules/scoring/test_description.py index f3901a7cf..7a4b9236f 100644 --- a/tests/modules/scoring/test_description.py +++ b/tests/modules/scoring/test_description.py @@ -3,7 +3,6 @@ from autointent.context.data_handler import DataHandler from autointent.modules import DescriptionScorer -from tests.conftest import setup_environment @pytest.mark.parametrize( @@ -14,7 +13,6 @@ ], ) def test_description_scorer(dataset, expected_prediction, multilabel): - dump_dir, logs_dir = setup_environment() if multilabel: dataset = dataset.to_multilabel() data_handler = DataHandler(dataset) diff --git a/tests/modules/scoring/test_dnnc.py b/tests/modules/scoring/test_dnnc.py index 8e539bb21..1fb4313f8 100644 --- a/tests/modules/scoring/test_dnnc.py +++ b/tests/modules/scoring/test_dnnc.py @@ -3,13 +3,10 @@ from autointent.context.data_handler import DataHandler from autointent.modules import DNNCScorer -from tests.conftest import setup_environment @pytest.mark.parametrize(("train_head", "pred_score"), [(True, 1)]) def test_base_dnnc(dataset, train_head, pred_score): - dump_dir, logs_dir = setup_environment() - data_handler = DataHandler(dataset) scorer = DNNCScorer( diff --git a/tests/modules/scoring/test_knn.py b/tests/modules/scoring/test_knn.py index e628e459e..31e257896 100644 --- a/tests/modules/scoring/test_knn.py +++ b/tests/modules/scoring/test_knn.py @@ -2,12 +2,9 @@ from autointent.context.data_handler import DataHandler from autointent.modules import KNNScorer -from tests.conftest import setup_environment def test_base_knn(dataset): - dump_dir, logs_dir = setup_environment() - data_handler = DataHandler(dataset) scorer = KNNScorer(k=3, weights="distance", embedder_name="sergeyzh/rubert-tiny-turbo", embedder_device="cpu") diff --git a/tests/modules/scoring/test_linear.py b/tests/modules/scoring/test_linear.py index 54882dd1a..43d903511 100644 --- a/tests/modules/scoring/test_linear.py +++ b/tests/modules/scoring/test_linear.py @@ -2,11 +2,9 @@ from autointent.context.data_handler import DataHandler from autointent.modules import LinearScorer -from tests.conftest import setup_environment def test_base_linear(dataset): - dump_dir, logs_dir = setup_environment() data_handler = DataHandler(dataset) diff --git a/tests/modules/scoring/test_mlknn.py b/tests/modules/scoring/test_mlknn.py index 33582235c..bee0d8aaa 100644 --- a/tests/modules/scoring/test_mlknn.py +++ b/tests/modules/scoring/test_mlknn.py @@ -4,12 +4,9 @@ from autointent.context.data_handler import DataHandler from autointent.custom_types import Split from autointent.modules.scoring import MLKnnScorer -from tests.conftest import setup_environment def test_base_mlknn(dataset): - dump_dir, logs_dir = setup_environment() - dataset[Split.TEST] = HFDataset.from_list( [ { diff --git a/tests/modules/scoring/test_rerank_scorer.py b/tests/modules/scoring/test_rerank_scorer.py index e2f59854c..ea958d31c 100644 --- a/tests/modules/scoring/test_rerank_scorer.py +++ b/tests/modules/scoring/test_rerank_scorer.py @@ -2,12 +2,9 @@ from autointent.context.data_handler import DataHandler from autointent.modules import RerankScorer -from tests.conftest import setup_environment def test_base_rerank_scorer(dataset): - dump_dir, logs_dir = setup_environment() - data_handler = DataHandler(dataset) scorer = RerankScorer( From a7ee3053c6308942df0437e3c4f4e8f7dcd9510b Mon Sep 17 00:00:00 2001 From: voorhs Date: Mon, 20 Jan 2025 12:35:33 +0300 Subject: [PATCH 19/22] fix codestyle --- tests/callback/test_callback.py | 4 +--- tests/modules/scoring/test_linear.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/callback/test_callback.py b/tests/callback/test_callback.py index 715640e5f..02f4d0583 100644 --- a/tests/callback/test_callback.py +++ b/tests/callback/test_callback.py @@ -74,9 +74,7 @@ def test_pipeline_callbacks(): pipeline_optimizer = Pipeline.from_search_space(search_space) context = Context() context.configure_vector_index(VectorIndexConfig(save_db=True)) - context.configure_logging( - LoggingConfig(run_name="dummy_run_name", project_dir=project_dir, dump_modules=False) - ) + context.configure_logging(LoggingConfig(run_name="dummy_run_name", project_dir=project_dir, dump_modules=False)) context.callback_handler = CallbackHandler([DummyCallback]) context.set_dataset(dataset) diff --git a/tests/modules/scoring/test_linear.py b/tests/modules/scoring/test_linear.py index 43d903511..06188e843 100644 --- a/tests/modules/scoring/test_linear.py +++ b/tests/modules/scoring/test_linear.py @@ -5,7 +5,6 @@ def test_base_linear(dataset): - data_handler = DataHandler(dataset) scorer = LinearScorer(embedder_name="sergeyzh/rubert-tiny-turbo", embedder_device="cpu") From 41e1d663a8d3fe6d815299a720c48ef8a2726604 Mon Sep 17 00:00:00 2001 From: voorhs Date: Mon, 20 Jan 2025 12:51:17 +0300 Subject: [PATCH 20/22] fix docs building --- user_guides/basic_usage/03_automl.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/user_guides/basic_usage/03_automl.py b/user_guides/basic_usage/03_automl.py index c2611509e..40a2bea90 100644 --- a/user_guides/basic_usage/03_automl.py +++ b/user_guides/basic_usage/03_automl.py @@ -117,13 +117,14 @@ """ ## Logging Settings -The important thing is what assets you want to save during the pipeline auto-configuration process. You can control it with %mddoclink(class,configs,LoggingConfig). Default settings are the following: +The important thing is what assets you want to save during the pipeline auto-configuration process. You can control it with %mddoclink(class,configs,LoggingConfig): """ # %% +from pathlib import Path from autointent.configs import LoggingConfig -logging_config = LoggingConfig(run_name=None, dirpath=None, dump_dir=None, dump_modules=False, clear_ram=False) +logging_config = LoggingConfig(project_dir=Path.cwd() / "runs", dump_modules=False, clear_ram=False) custom_pipeline.set_config(logging_config) # %% [markdown] From d5d1af0f8376bf0c86f9bb01e6d5586399d3a122 Mon Sep 17 00:00:00 2001 From: voorhs Date: Mon, 20 Jan 2025 13:11:52 +0300 Subject: [PATCH 21/22] fix docs building --- user_guides/basic_usage/04_inference.py | 41 ++++++++++++++++--------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/user_guides/basic_usage/04_inference.py b/user_guides/basic_usage/04_inference.py index 1d0349d50..ddfe4a2de 100644 --- a/user_guides/basic_usage/04_inference.py +++ b/user_guides/basic_usage/04_inference.py @@ -15,8 +15,31 @@ # %% from autointent import Dataset, Pipeline +search_space = [ + { + "node_type": "scoring", + "metric": "scoring_roc_auc", + "search_space": [ + { + "module_name": "knn", + "k": [1], + "weights": ["uniform"], + "embedder_name": ["avsolatorio/GIST-small-Embedding-v0"] + }, + ], + }, + { + "node_type": "decision", + "metric": "decision_accuracy", + "search_space": [ + {"module_name": "threshold", "thresh": [0.5]}, + {"module_name": "argmax"}, + ], + }, +] + dataset = Dataset.from_hub("AutoIntent/clinc150_subset") -pipeline = Pipeline.default_optimizer(multilabel=False) +pipeline = Pipeline.from_search_space(search_space) context = pipeline.fit(dataset) pipeline.predict(["hello, world!"]) @@ -54,15 +77,12 @@ """ # %% -from pathlib import Path - from autointent import Dataset, Pipeline from autointent.configs import LoggingConfig, VectorIndexConfig dataset = Dataset.from_hub("AutoIntent/clinc150_subset") -pipeline = Pipeline.default_optimizer(multilabel=False) -dump_dir = Path("my_dumps") -pipeline.set_config(LoggingConfig(dump_dir=dump_dir, dump_modules=True, clear_ram=True)) +pipeline = Pipeline.from_search_space(search_space) +pipeline.set_config(LoggingConfig(dump_modules=True, clear_ram=True)) pipeline.set_config(VectorIndexConfig(save_db=True)) # %% [markdown] @@ -96,12 +116,3 @@ """ ## That's all! """ - -# %% -# [you didn't see it] -import shutil - -shutil.rmtree(dump_dir) - -for file in Path.cwd().glob("vector_db*"): - shutil.rmtree(file) From 71d1180d9e962ab3ae0c3b8e9be046e625e1ed7e Mon Sep 17 00:00:00 2001 From: voorhs Date: Mon, 20 Jan 2025 13:12:09 +0300 Subject: [PATCH 22/22] fix codestyle --- user_guides/basic_usage/04_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/user_guides/basic_usage/04_inference.py b/user_guides/basic_usage/04_inference.py index ddfe4a2de..b87623068 100644 --- a/user_guides/basic_usage/04_inference.py +++ b/user_guides/basic_usage/04_inference.py @@ -24,7 +24,7 @@ "module_name": "knn", "k": [1], "weights": ["uniform"], - "embedder_name": ["avsolatorio/GIST-small-Embedding-v0"] + "embedder_name": ["avsolatorio/GIST-small-Embedding-v0"], }, ], },