diff --git a/.github/workflows/test-presets.yaml b/.github/workflows/test-presets.yaml new file mode 100644 index 000000000..ab4a6723d --- /dev/null +++ b/.github/workflows/test-presets.yaml @@ -0,0 +1,38 @@ +name: test presets + +on: + push: + branches: + - dev + pull_request: + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.10", "3.11", "3.12" ] + include: + - os: windows-latest + python-version: "3.10" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install dependencies + run: | + pip install . + pip install pytest pytest-asyncio + + - name: Run tests + run: | + pytest tests/pipeline/test_presets.py diff --git a/README.md b/README.md index 133d79218..5780011c1 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Example of building an intent classifier in a couple of lines of code: from autointent import Pipeline, Dataset dataset = Dataset.from_json(path_to_json) -pipeline = Pipeline.default_optimizer(multilabel=False) +pipeline = Pipeline.from_preset("light") pipeline.fit(dataset) -pipeline.predict(["show me my latest recent transactions"]) +pipeline.predict(["show me my latest transactions"]) ``` diff --git a/autointent/__init__.py b/autointent/__init__.py index 8478f709b..81e33cdba 100644 --- a/autointent/__init__.py +++ b/autointent/__init__.py @@ -7,6 +7,7 @@ from ._dataset import Dataset from ._hash import Hasher from .context import Context, load_dataset +from ._optimization_config import OptimizationConfig from ._pipeline import Pipeline @@ -15,6 +16,7 @@ "Dataset", "Embedder", "Hasher", + "OptimizationConfig", "Pipeline", "Ranker", "VectorIndex", diff --git a/autointent/_datafiles/default-multiclass-config.yaml b/autointent/_datafiles/default-multiclass-config.yaml deleted file mode 100644 index a0fef0cd4..000000000 --- a/autointent/_datafiles/default-multiclass-config.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# TODO: make up a better and more versatile config -- node_type: embedding - target_metric: retrieval_hit_rate - search_space: - - module_name: retrieval - k: [10] - embedder_config: - - avsolatorio/GIST-small-Embedding-v0 - - sergeyzh/rubert-tiny-turbo -- node_type: scoring - target_metric: scoring_roc_auc - search_space: - - module_name: knn - k: [1, 3, 5, 10] - weights: ["uniform", "distance", "closest"] - - module_name: linear - - module_name: dnnc - cross_encoder_config: - - cross-encoder/ms-marco-MiniLM-L-6-v2 - k: [1, 3, 5, 10] -- node_type: decision - target_metric: decision_accuracy - search_space: - - module_name: threshold - thresh: [0.5] - - module_name: argmax diff --git a/autointent/_datafiles/default-multilabel-config.yaml b/autointent/_datafiles/default-multilabel-config.yaml deleted file mode 100644 index 2d2942dcb..000000000 --- a/autointent/_datafiles/default-multilabel-config.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# TODO: make up a better and more versatile config -- node_type: embedding - target_metric: retrieval_hit_rate_intersecting - search_space: - - module_name: retrieval - k: [10] - embedder_config: - - deepvk/USER-bge-m3 -- node_type: scoring - target_metric: scoring_roc_auc - search_space: - - module_name: knn - k: [3] - weights: ["uniform", "distance", "closest"] - - module_name: linear -- node_type: decision - target_metric: decision_accuracy - search_space: - - module_name: 
threshold - thresh: [0.5] - - module_name: adaptive diff --git a/autointent/_datafiles/inference-config-example.yaml b/autointent/_datafiles/inference-config-example.yaml deleted file mode 100644 index 5df54876a..000000000 --- a/autointent/_datafiles/inference-config-example.yaml +++ /dev/null @@ -1,17 +0,0 @@ -- node_type: embedding - module_name: retrieval - module_config: - k: 10 - model_config: sergeyzh/rubert-tiny-turbo - load_path: . -- node_type: scoring - module_name: knn - module_config: - k: 10 - weights: uniform - load_path: . -- node_type: decision - module_name: threshold - module_config: - thresh: 0.5 - load_path: . \ No newline at end of file diff --git a/autointent/_dataset/_dataset.py b/autointent/_dataset/_dataset.py index 760601aa1..e6b93e19e 100644 --- a/autointent/_dataset/_dataset.py +++ b/autointent/_dataset/_dataset.py @@ -1,6 +1,7 @@ """File with Dataset definition.""" import json +import logging from collections import defaultdict from functools import cached_property from pathlib import Path @@ -12,6 +13,8 @@ from autointent.custom_types import LabelWithOOS, Split from autointent.schemas import Intent, Tag +logger = logging.getLogger(__name__) + class Sample(TypedDict): """ @@ -36,6 +39,7 @@ class Dataset(dict[str, HFDataset]): label_feature = "label" utterance_feature = "utterance" + has_descriptions: bool def __init__(self, *args: Any, intents: list[Intent], **kwargs: Any) -> None: # noqa: ANN401 """ @@ -49,6 +53,8 @@ def __init__(self, *args: Any, intents: list[Intent], **kwargs: Any) -> None: # self.intents = intents + self.has_descriptions = self.validate_descriptions() + @property def multilabel(self) -> bool: """ @@ -197,3 +203,18 @@ def _to_multilabel(self, sample: Sample) -> Sample: ohe_vector[sample["label"]] = 1 sample["label"] = ohe_vector return sample + + def validate_descriptions(self) -> bool: + """ + Check whether the dataset contains text descriptions for each intent. + + :return: True if all intents have a description field + """ + has_any = any(intent.description is not None for intent in self.intents) + has_all = all(intent.description is not None for intent in self.intents) + + if has_any and not has_all: + msg = "Some intents have text descriptions, but others do not." 
+ logger.warning(msg) + + return has_all diff --git a/autointent/_dump_tools.py b/autointent/_dump_tools.py index 70b334d6a..7e52c5ba2 100644 --- a/autointent/_dump_tools.py +++ b/autointent/_dump_tools.py @@ -12,7 +12,8 @@ from sklearn.base import BaseEstimator from autointent import Embedder, Ranker, VectorIndex -from autointent.schemas import CrossEncoderConfig, EmbedderConfig, TagsList +from autointent.configs import CrossEncoderConfig, EmbedderConfig +from autointent.schemas import TagsList ModuleSimpleAttributes = None | str | int | float | bool | list # type: ignore[type-arg] diff --git a/autointent/_embedder.py b/autointent/_embedder.py index fd80fca8d..ccb53275b 100644 --- a/autointent/_embedder.py +++ b/autointent/_embedder.py @@ -17,7 +17,7 @@ from sentence_transformers import SentenceTransformer from ._hash import Hasher -from .schemas import EmbedderConfig, TaskTypeEnum +from .configs import EmbedderConfig, TaskTypeEnum def get_embeddings_path(filename: str) -> Path: diff --git a/autointent/_optimization_config.py b/autointent/_optimization_config.py new file mode 100644 index 000000000..f8b647a0c --- /dev/null +++ b/autointent/_optimization_config.py @@ -0,0 +1,17 @@ +from pydantic import BaseModel, PositiveInt + +from .configs import CrossEncoderConfig, DataConfig, EmbedderConfig, LoggingConfig +from .custom_types import SamplerType +from .nodes.schemes import OptimizationSearchSpaceConfig + + +class OptimizationConfig(BaseModel): + """Configuration for the optimization process.""" + + data_config: DataConfig = DataConfig() + search_space: OptimizationSearchSpaceConfig + logging_config: LoggingConfig = LoggingConfig() + embedder_config: EmbedderConfig = EmbedderConfig() + cross_encoder_config: CrossEncoderConfig = CrossEncoderConfig() + sampler: SamplerType = "brute" + seed: PositiveInt = 42 diff --git a/autointent/_pipeline/_pipeline.py b/autointent/_pipeline/_pipeline.py index f3644f9f1..d433616e7 100644 --- a/autointent/_pipeline/_pipeline.py +++ b/autointent/_pipeline/_pipeline.py @@ -9,13 +9,24 @@ import yaml from typing_extensions import assert_never -from autointent import Context, Dataset -from autointent.configs import DataConfig, InferenceNodeConfig, LoggingConfig, VectorIndexConfig -from autointent.custom_types import ListOfGenericLabels, NodeType, SamplerType +from autointent import Context, Dataset, OptimizationConfig +from autointent.configs import ( + CrossEncoderConfig, + DataConfig, + EmbedderConfig, + InferenceNodeConfig, + LoggingConfig, +) +from autointent.custom_types import ( + ListOfGenericLabels, + NodeType, + SamplerType, + SearchSpacePresets, + SearchSpaceValidationMode, +) from autointent.metrics import DECISION_METRICS from autointent.nodes import InferenceNode, NodeOptimizer -from autointent.nodes.schemes import OptimizationConfig, OptimizationSearchSpaceConfig -from autointent.utils import load_default_search_space, load_search_space +from autointent.utils import load_preset, load_search_space from ._schemas import InferencePipelineOutput, InferencePipelineUtteranceOutput @@ -50,12 +61,13 @@ def __init__( if isinstance(nodes[0], NodeOptimizer): self.logging_config = LoggingConfig(dump_dir=None) - self.vector_index_config = VectorIndexConfig() + self.embedder_config = EmbedderConfig() + self.cross_encoder_config = CrossEncoderConfig() self.data_config = DataConfig() elif not isinstance(nodes[0], InferenceNode): assert_never(nodes) - def set_config(self, config: LoggingConfig | VectorIndexConfig | DataConfig) -> None: + def set_config(self, 
config: LoggingConfig | EmbedderConfig | CrossEncoderConfig | DataConfig) -> None: """ Set configuration for the optimizer. @@ -63,8 +75,10 @@ def set_config(self, config: LoggingConfig | VectorIndexConfig | DataConfig) -> """ if isinstance(config, LoggingConfig): self.logging_config = config - elif isinstance(config, VectorIndexConfig): - self.vector_index_config = config + elif isinstance(config, EmbedderConfig): + self.embedder_config = config + elif isinstance(config, CrossEncoderConfig): + self.cross_encoder_config = config elif isinstance(config, DataConfig): self.data_config = config else: @@ -78,48 +92,46 @@ def from_search_space(cls, search_space: list[dict[str, Any]] | Path | str, seed :param search_space: Dictionary config :param seed: random seed """ - if isinstance(search_space, Path | str): + if not isinstance(search_space, list): search_space = load_search_space(search_space) - validated_search_space = OptimizationSearchSpaceConfig(search_space).model_dump() # type: ignore[arg-type] - nodes = [NodeOptimizer(**node) for node in validated_search_space] + nodes = [NodeOptimizer(**node) for node in search_space] return cls(nodes=nodes, seed=seed) @classmethod - def from_optimization_config(cls, config: dict[str, Any] | Path | str) -> "Pipeline": + def from_preset(cls, name: SearchSpacePresets, seed: int = 42) -> "Pipeline": + optimization_config = load_preset(name) + config = OptimizationConfig(seed=seed, **optimization_config) + return cls.from_optimization_config(config=config) + + @classmethod + def from_optimization_config(cls, config: dict[str, Any] | Path | str | OptimizationConfig) -> "Pipeline": """ Create pipeline optimizer from optimization config. :param config: Optimization config :return: """ - if isinstance(config, Path | str): - with Path(config).open() as file: - loaded_config = yaml.safe_load(file) + if isinstance(config, OptimizationConfig): + optimization_config = config else: - loaded_config = config - optimization_config = OptimizationConfig(**loaded_config) + if isinstance(config, dict): + dict_params = config + else: + with Path(config).open() as file: + dict_params = yaml.safe_load(file) + optimization_config = OptimizationConfig(**dict_params) + pipeline = cls( - [NodeOptimizer(**node.model_dump()) for node in optimization_config.task_config.search_space], - optimization_config.task_config.sampler, + [NodeOptimizer(**node.model_dump()) for node in optimization_config.search_space], + optimization_config.sampler, optimization_config.seed, ) pipeline.set_config(optimization_config.logging_config) - pipeline.set_config(optimization_config.vector_index_config) pipeline.set_config(optimization_config.data_config) + pipeline.set_config(optimization_config.embedder_config) + pipeline.set_config(optimization_config.cross_encoder_config) return pipeline - @classmethod - def default_optimizer(cls, multilabel: bool, seed: int = 42) -> "Pipeline": - """ - Create pipeline optimizer with default search space for given classification task. - - :param multilabel: Whether the task multi-label, or single-label. - :param seed: random seed - - :return: Pipeline - """ - return cls.from_search_space(search_space=load_default_search_space(multilabel), seed=seed) - def _fit(self, context: Context, sampler: SamplerType) -> None: """ Optimize the pipeline. 
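For orientation, the constructors above replace `Pipeline.default_optimizer`. A minimal usage sketch, assuming a dataset JSON on disk (the path and the embedder override are illustrative, not part of this change):

```python
from autointent import Dataset, Pipeline
from autointent.configs import EmbedderConfig

dataset = Dataset.from_json("intents.json")  # illustrative path

# Build an optimizer from one of the bundled presets instead of the old
# default_optimizer(multilabel=...) entry point.
pipeline = Pipeline.from_preset("light")

# Default transformers are now ordinary configs set on the pipeline.
pipeline.set_config(EmbedderConfig(model_name="sergeyzh/rubert-tiny-turbo", use_cache=True))

context = pipeline.fit(dataset)
pipeline.predict(["show me my latest transactions"])
```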
@@ -136,9 +148,6 @@ def _fit(self, context: Context, sampler: SamplerType) -> None: node_optimizer = self.nodes.get(node_type, None) if node_optimizer is not None: node_optimizer.fit(context, sampler) # type: ignore[union-attr] - if not context.vector_index_config.save_db: - self._logger.info("removing vector database from file system...") - # TODO clear cache from appdirs self.context.callback_handler.end_run() def _is_inference(self) -> bool: @@ -154,6 +163,7 @@ def fit( dataset: Dataset, refit_after: bool = False, sampler: SamplerType | None = None, + incompatible_search_space: SearchSpaceValidationMode = "filter", ) -> Context: """ Optimize the pipeline from dataset. @@ -168,9 +178,10 @@ def fit( context = Context() context.set_dataset(dataset, self.data_config) context.configure_logging(self.logging_config) - context.configure_vector_index(self.vector_index_config) + context.configure_transformer(self.embedder_config) + context.configure_transformer(self.cross_encoder_config) - self.validate_modules(dataset) + self.validate_modules(dataset, mode=incompatible_search_space) test_utterances = context.data_handler.test_utterances() if test_utterances is None: @@ -207,7 +218,7 @@ def fit( return context - def validate_modules(self, dataset: Dataset) -> None: + def validate_modules(self, dataset: Dataset, mode: SearchSpaceValidationMode) -> None: """ Validate modules with dataset. @@ -215,7 +226,7 @@ def validate_modules(self, dataset: Dataset) -> None: """ for node in self.nodes.values(): if isinstance(node, NodeOptimizer): - node.validate_nodes_with_dataset(dataset) + node.validate_nodes_with_dataset(dataset, mode) @classmethod def from_dict_config(cls, nodes_configs: list[dict[str, Any]]) -> "Pipeline": diff --git a/autointent/_datafiles/__init__.py b/autointent/_presets/__init__.py similarity index 100% rename from autointent/_datafiles/__init__.py rename to autointent/_presets/__init__.py diff --git a/autointent/_presets/heavy.yaml b/autointent/_presets/heavy.yaml new file mode 100644 index 000000000..9c1e605c9 --- /dev/null +++ b/autointent/_presets/heavy.yaml @@ -0,0 +1,46 @@ +# TODO add sklearn RandomForestClassifier +search_space: + - node_type: scoring + target_metric: scoring_roc_auc + search_space: + - module_name: knn + k: + low: 1 + high: 20 + weights: [uniform, distance, closest] + n_trials: 10 + - module_name: linear + - module_name: mlknn + k: + low: 1 + high: 20 + step: 1 + n_trials: 10 + - module_name: description + temperature: + low: 0.01 + high: 10 + log: true + n_trials: 10 + - module_name: rerank + k: + low: 10 + high: 40 + m: + low: 1 + high: 10 + weights: [uniform, distance, closest] + n_trials: 15 + - node_type: decision + target_metric: decision_accuracy + search_space: + - module_name: threshold + thresh: + low: 0.1 + high: 0.9 + n_trials: 10 + - module_name: argmax + - module_name: jinoos + - module_name: tunable + - module_name: adaptive +sampler: tpe \ No newline at end of file diff --git a/autointent/_presets/heavy_extra.yaml b/autointent/_presets/heavy_extra.yaml new file mode 100644 index 000000000..0187822a3 --- /dev/null +++ b/autointent/_presets/heavy_extra.yaml @@ -0,0 +1,41 @@ +search_space: + - node_type: scoring + target_metric: scoring_roc_auc + search_space: + - module_name: knn + k: + low: 1 + high: 20 + step: 1 + weights: [uniform, distance, closest] + - module_name: linear + - module_name: mlknn + k: + low: 1 + high: 20 + step: 1 + - module_name: rerank + k: + low: 10 + high: 40 + step: 5 + m: + low: 1 + high: 9 + step: 2 + weights: [uniform, 
distance, closest] + - module_name: description + temperature: [0.01, 0.01637894, 0.02682696, 0.04393971, 0.07196857, 0.11787686, 0.19306977, 0.31622777, 0.51794747, 0.8483429 , 1.38949549, 2.27584593, 3.72759372, 6.1054023 , 10.] + - node_type: decision + target_metric: decision_accuracy + search_space: + - module_name: threshold + thresh: + low: 0.1 + high: 0.9 + step: 0.1 + - module_name: argmax + - module_name: jinoos + - module_name: tunable + - module_name: adaptive +sampler: brute \ No newline at end of file diff --git a/autointent/_presets/heavy_moderate.yaml b/autointent/_presets/heavy_moderate.yaml new file mode 100644 index 000000000..fca7e480e --- /dev/null +++ b/autointent/_presets/heavy_moderate.yaml @@ -0,0 +1,45 @@ +search_space: + - node_type: scoring + target_metric: scoring_roc_auc + search_space: + - module_name: knn + k: + low: 1 + high: 20 + weights: [uniform, distance, closest] + n_trials: 10 + - module_name: linear + - module_name: mlknn + k: + low: 1 + high: 20 + step: 1 + n_trials: 10 + - module_name: description + temperature: + low: 0.01 + high: 10 + log: true + n_trials: 10 + - module_name: rerank + k: + low: 10 + high: 40 + m: + low: 1 + high: 10 + weights: [uniform, distance, closest] + n_trials: 15 + - node_type: decision + target_metric: decision_accuracy + search_space: + - module_name: threshold + thresh: + low: 0.1 + high: 0.9 + n_trials: 10 + - module_name: argmax + - module_name: jinoos + - module_name: tunable + - module_name: adaptive +sampler: random \ No newline at end of file diff --git a/autointent/_presets/light.yaml b/autointent/_presets/light.yaml new file mode 100644 index 000000000..a2d4a9a5b --- /dev/null +++ b/autointent/_presets/light.yaml @@ -0,0 +1,27 @@ +search_space: + - node_type: scoring + target_metric: scoring_roc_auc + search_space: + - module_name: knn + k: + low: 1 + high: 20 + n_trials: 15 + weights: [uniform, distance, closest] + - module_name: linear + - module_name: mlknn + k: + low: 1 + high: 20 + step: 1 + n_trials: 10 + - node_type: decision + target_metric: decision_accuracy + search_space: + - module_name: threshold + thresh: + low: 0.1 + high: 0.9 + step: 0.1 + - module_name: argmax +sampler: tpe \ No newline at end of file diff --git a/autointent/_presets/light_extra.yaml b/autointent/_presets/light_extra.yaml new file mode 100644 index 000000000..4d5bb51ff --- /dev/null +++ b/autointent/_presets/light_extra.yaml @@ -0,0 +1,26 @@ +search_space: + - node_type: scoring + target_metric: scoring_roc_auc + search_space: + - module_name: knn + k: + low: 1 + high: 20 + n_trials: 10 + weights: [uniform, distance, closest] + - module_name: linear + - module_name: mlknn + k: + low: 1 + high: 20 + n_trials: 10 + - node_type: decision + target_metric: decision_accuracy + search_space: + - module_name: threshold + thresh: + low: 0.1 + high: 0.9 + n_trials: 10 + - module_name: argmax +sampler: random \ No newline at end of file diff --git a/autointent/_presets/light_moderate.yaml b/autointent/_presets/light_moderate.yaml new file mode 100644 index 000000000..ac7ffefbc --- /dev/null +++ b/autointent/_presets/light_moderate.yaml @@ -0,0 +1,26 @@ +search_space: + - node_type: scoring + target_metric: scoring_roc_auc + search_space: + - module_name: knn + k: + low: 1 + high: 20 + step: 1 + weights: [uniform, distance, closest] + - module_name: linear + - module_name: mlknn + k: + low: 1 + high: 20 + step: 1 + - node_type: decision + target_metric: decision_accuracy + search_space: + - module_name: threshold + thresh: + low: 0.1 + 
high: 0.9 + step: 0.1 + - module_name: argmax +sampler: brute \ No newline at end of file diff --git a/autointent/_ranker.py b/autointent/_ranker.py index 354f68b0d..7569db4c5 100644 --- a/autointent/_ranker.py +++ b/autointent/_ranker.py @@ -19,8 +19,8 @@ from sklearn.linear_model import LogisticRegressionCV from torch import nn +from autointent.configs import CrossEncoderConfig from autointent.custom_types import ListOfLabels -from autointent.schemas import CrossEncoderConfig logger = logging.getLogger(__name__) diff --git a/autointent/_vector_index.py b/autointent/_vector_index.py index b34bb7082..0d3983fe0 100644 --- a/autointent/_vector_index.py +++ b/autointent/_vector_index.py @@ -15,8 +15,8 @@ import numpy.typing as npt from autointent import Embedder +from autointent.configs import EmbedderConfig, TaskTypeEnum from autointent.custom_types import ListOfLabels -from autointent.schemas import EmbedderConfig, TaskTypeEnum class VectorIndexMetadata(TypedDict): diff --git a/autointent/configs/__init__.py b/autointent/configs/__init__.py index d267b03f3..b939a5395 100644 --- a/autointent/configs/__init__.py +++ b/autointent/configs/__init__.py @@ -1,16 +1,15 @@ """Dataclasses for the configuration of the :class:`autointent.Embedder` and other objects.""" from ._inference_node import InferenceNodeConfig -from ._optimization import ( - DataConfig, - LoggingConfig, - VectorIndexConfig, -) +from ._optimization import DataConfig, LoggingConfig +from ._transformers import CrossEncoderConfig, EmbedderConfig, TaskTypeEnum __all__ = [ + "CrossEncoderConfig", "DataConfig", + "EmbedderConfig", "InferenceNodeConfig", "InferenceNodeConfig", "LoggingConfig", - "VectorIndexConfig", + "TaskTypeEnum", ] diff --git a/autointent/configs/_optimization.py b/autointent/configs/_optimization.py index 910af2513..146813e4f 100644 --- a/autointent/configs/_optimization.py +++ b/autointent/configs/_optimization.py @@ -68,10 +68,3 @@ def get_run_name(self) -> str: if self.run_name is None: self.run_name = get_run_name() return self.run_name - - -class VectorIndexConfig(BaseModel): - """Configuration for the vector index.""" - - save_db: bool = Field(False, description="Whether to save the vector index database or not") - """Whether to save the vector index database or not""" diff --git a/autointent/configs/_transformers.py b/autointent/configs/_transformers.py new file mode 100644 index 000000000..b64343691 --- /dev/null +++ b/autointent/configs/_transformers.py @@ -0,0 +1,108 @@ +from enum import Enum +from typing import Any + +from pydantic import ( + BaseModel, + Field, + PositiveInt, +) +from typing_extensions import Self, assert_never + + +class ModelConfig(BaseModel): + batch_size: PositiveInt = Field(32, description="Batch size for model inference.") + max_length: PositiveInt | None = Field(None, description="Maximum length of input sequences.") + + +class STModelConfig(ModelConfig): + model_name: str + device: str | None = Field(None, description="Torch notation for CPU or CUDA.") + + @classmethod + def from_search_config(cls, values: dict[str, Any] | str | BaseModel | None) -> Self: + """Validate the model configuration. + + :param values: Model configuration values. If a string is provided, it is converted to a dictionary. 
+ """ + if values is None: + return cls() # type: ignore[call-arg] + if isinstance(values, BaseModel): + return values # type: ignore[return-value] + if isinstance(values, str): + return cls(model_name=values) + return cls(**values) + + +class TaskTypeEnum(Enum): + """Enum for different types of prompts.""" + + default = "default" + classification = "classification" + cluster = "cluster" + query = "query" + passage = "passage" + sts = "sts" + + +class EmbedderConfig(STModelConfig): + model_name: str = Field("sentence-transformers/all-MiniLM-L6-v2", description="Name of the hugging face model.") + default_prompt: str | None = Field( + None, description="Default prompt for the model. This is used when no task-specific prompt is provided." + ) + classifier_prompt: str | None = Field(None, description="Prompt for classifier.") + cluster_prompt: str | None = Field(None, description="Prompt for clustering.") + sts_prompt: str | None = Field(None, description="Prompt for finding most similar sentences.") + query_prompt: str | None = Field(None, description="Prompt for query.") + passage_prompt: str | None = Field(None, description="Prompt for passage.") + + def get_prompt_config(self) -> dict[str, str] | None: + """Collect the configured prompts for each task type. + + :return: Mapping of task type to prompt, or None if no prompts are configured. + """ + prompts = {} + if self.default_prompt: + prompts[TaskTypeEnum.default.value] = self.default_prompt + if self.classifier_prompt: + prompts[TaskTypeEnum.classification.value] = self.classifier_prompt + if self.cluster_prompt: + prompts[TaskTypeEnum.cluster.value] = self.cluster_prompt + if self.query_prompt: + prompts[TaskTypeEnum.query.value] = self.query_prompt + if self.passage_prompt: + prompts[TaskTypeEnum.passage.value] = self.passage_prompt + if self.sts_prompt: + prompts[TaskTypeEnum.sts.value] = self.sts_prompt + return prompts if len(prompts) > 0 else None + + def get_prompt_type(self, prompt_type: TaskTypeEnum | None) -> str | None: # noqa: PLR0911 + """Get the prompt for the given task type. + + :param prompt_type: Task type for which to get the prompt. + + :return: The prompt for the given task type. + """ + if prompt_type is None: + return self.default_prompt + if prompt_type == TaskTypeEnum.classification: + return self.classifier_prompt + if prompt_type == TaskTypeEnum.cluster: + return self.cluster_prompt + if prompt_type == TaskTypeEnum.query: + return self.query_prompt + if prompt_type == TaskTypeEnum.passage: + return self.passage_prompt + if prompt_type == TaskTypeEnum.sts: + return self.sts_prompt + if prompt_type == TaskTypeEnum.default: + return self.default_prompt + assert_never(prompt_type) + + use_cache: bool = Field(False, description="Whether to use embeddings caching.") + + +class CrossEncoderConfig(STModelConfig): + model_name: str = Field("cross-encoder/ms-marco-MiniLM-L-6-v2", description="Name of the hugging face model.") + train_head: bool = Field( + False, description="Whether to train the head of the model. If False, LogReg will be trained." 
+ ) diff --git a/autointent/context/_context.py b/autointent/context/_context.py index 79f73029c..a62d7e1e9 100644 --- a/autointent/context/_context.py +++ b/autointent/context/_context.py @@ -9,11 +9,7 @@ from autointent import Dataset from autointent._callbacks import CallbackHandler, get_callbacks -from autointent.configs import ( - DataConfig, - LoggingConfig, - VectorIndexConfig, -) +from autointent.configs import CrossEncoderConfig, DataConfig, EmbedderConfig, LoggingConfig from ._utils import NumpyEncoder from .data_handler import DataHandler @@ -51,13 +47,16 @@ def configure_logging(self, config: LoggingConfig) -> None: self.callback_handler = get_callbacks(config.report_to) self.optimization_info = OptimizationInfo() - def configure_vector_index(self, config: VectorIndexConfig) -> None: + def configure_transformer(self, config: EmbedderConfig | CrossEncoderConfig) -> None: """ Configure the vector index client and embedder. :param config: Configuration for the vector index. """ - self.vector_index_config = config + if isinstance(config, EmbedderConfig): + self.embedder_config = config + elif isinstance(config, CrossEncoderConfig): + self.cross_encoder_config = config def set_dataset(self, dataset: Dataset, config: DataConfig) -> None: """ @@ -154,3 +153,21 @@ def has_saved_modules(self) -> bool: """ node_types = ["regex", "embedding", "scoring", "decision"] return any(len(self.optimization_info.modules.get(nt)) > 0 for nt in node_types) + + def resolve_embedder(self) -> EmbedderConfig: + try: + return self.optimization_info.get_best_embedder() + except ValueError as e: + if hasattr(self, "embedder_config"): + return self.embedder_config + msg = ( + "Embedder couldn't be resolved. Either include an embedding node into the " + "search space or set a default config with Context.configure_transformer." + ) + raise RuntimeError(msg) from e + + def resolve_ranker(self) -> CrossEncoderConfig: + if hasattr(self, "cross_encoder_config"): + return self.cross_encoder_config + msg = "Cross-encoder couldn't be resolved. Set a default config with Context.configure_transformer." 
+ raise RuntimeError(msg) diff --git a/autointent/context/optimization_info/_data_models.py b/autointent/context/optimization_info/_data_models.py index fbe4615e9..4d5795109 100644 --- a/autointent/context/optimization_info/_data_models.py +++ b/autointent/context/optimization_info/_data_models.py @@ -10,8 +10,8 @@ from numpy.typing import NDArray from pydantic import BaseModel, ConfigDict, Field +from autointent.configs import EmbedderConfig from autointent.custom_types import ListOfLabelsWithOOS, NodeType -from autointent.schemas import EmbedderConfig class Artifact(BaseModel): diff --git a/autointent/context/optimization_info/_optimization_info.py b/autointent/context/optimization_info/_optimization_info.py index 2adc467b8..fadf91cc2 100644 --- a/autointent/context/optimization_info/_optimization_info.py +++ b/autointent/context/optimization_info/_optimization_info.py @@ -11,9 +11,8 @@ import numpy as np from numpy.typing import NDArray -from autointent.configs import InferenceNodeConfig +from autointent.configs import EmbedderConfig, InferenceNodeConfig from autointent.custom_types import NodeType -from autointent.schemas import EmbedderConfig from ._data_models import Artifact, Artifacts, EmbeddingArtifact, ScorerArtifact, Trial, Trials, TrialsIds diff --git a/autointent/custom_types.py b/autointent/custom_types.py index ba4cef163..d2b360717 100644 --- a/autointent/custom_types.py +++ b/autointent/custom_types.py @@ -79,3 +79,7 @@ class Split: FloatFromZeroToOne = Annotated[float, Interval(ge=0, le=1)] """Float value between 0 and 1, inclusive.""" + +SearchSpaceValidationMode = Literal["raise", "warning", "filter"] + +SearchSpacePresets = Literal["light", "light_moderate", "light_extra", "heavy", "heavy_moderate", "heavy_extra"] diff --git a/autointent/modules/__init__.py b/autointent/modules/__init__.py index 038a54d7e..be46c06a1 100644 --- a/autointent/modules/__init__.py +++ b/autointent/modules/__init__.py @@ -23,13 +23,11 @@ def _create_modules_dict(modules: list[type[T]]) -> dict[str, type[T]]: REGEX_MODULES: dict[str, type[BaseRegex]] = _create_modules_dict([Regex]) -EMBEDDING_MODULES_MULTICLASS: dict[str, type[BaseEmbedding]] = _create_modules_dict( +EMBEDDING_MODULES: dict[str, type[BaseEmbedding]] = _create_modules_dict( [RetrievalAimedEmbedding, LogregAimedEmbedding] ) -EMBEDDING_MODULES_MULTILABEL: dict[str, type[BaseEmbedding]] = EMBEDDING_MODULES_MULTICLASS - -SCORING_MODULES_MULTICLASS: dict[str, type[BaseScorer]] = _create_modules_dict( +SCORING_MODULES: dict[str, type[BaseScorer]] = _create_modules_dict( [ DNNCScorer, KNNScorer, @@ -37,24 +35,13 @@ def _create_modules_dict(modules: list[type[T]]) -> dict[str, type[T]]: DescriptionScorer, RerankScorer, SklearnScorer, - ] -) - -SCORING_MODULES_MULTILABEL: dict[str, type[BaseScorer]] = _create_modules_dict( - [ MLKnnScorer, - LinearScorer, - DescriptionScorer, - SklearnScorer, - ], + ] ) -DECISION_MODULES_MULTICLASS: dict[str, type[BaseDecision]] = _create_modules_dict( - [ArgmaxDecision, JinoosDecision, ThresholdDecision, TunableDecision], +DECISION_MODULES: dict[str, type[BaseDecision]] = _create_modules_dict( + [ArgmaxDecision, JinoosDecision, ThresholdDecision, TunableDecision, AdaptiveDecision], ) -DECISION_MODULES_MULTILABEL: dict[str, type[BaseDecision]] = _create_modules_dict( - [AdaptiveDecision, ThresholdDecision, TunableDecision], -) __all__ = [] # type: ignore[var-annotated] diff --git a/autointent/modules/decision/_threshold.py b/autointent/modules/decision/_threshold.py index 95e9d319f..f825525e4 100644 --- 
a/autointent/modules/decision/_threshold.py +++ b/autointent/modules/decision/_threshold.py @@ -75,7 +75,7 @@ class ThresholdDecision(BaseDecision): def __init__( self, - thresh: FloatFromZeroToOne | list[FloatFromZeroToOne], + thresh: FloatFromZeroToOne | list[FloatFromZeroToOne] = 0.5, ) -> None: """ Initialize threshold predictor. diff --git a/autointent/modules/embedding/_logreg.py b/autointent/modules/embedding/_logreg.py index c342d3ea9..0b4aa6283 100644 --- a/autointent/modules/embedding/_logreg.py +++ b/autointent/modules/embedding/_logreg.py @@ -10,11 +10,11 @@ from sklearn.preprocessing import LabelEncoder from autointent import Context, Embedder +from autointent.configs import EmbedderConfig, TaskTypeEnum from autointent.context.optimization_info import EmbeddingArtifact from autointent.custom_types import ListOfLabels from autointent.metrics import SCORING_METRICS_MULTICLASS, SCORING_METRICS_MULTILABEL from autointent.modules.abc import BaseEmbedding -from autointent.schemas import EmbedderConfig, TaskTypeEnum class LogregAimedEmbedding(BaseEmbedding): diff --git a/autointent/modules/embedding/_retrieval.py b/autointent/modules/embedding/_retrieval.py index 72d7a1d9c..0b41bc605 100644 --- a/autointent/modules/embedding/_retrieval.py +++ b/autointent/modules/embedding/_retrieval.py @@ -5,11 +5,11 @@ from pydantic import PositiveInt from autointent import Context, VectorIndex +from autointent.configs import EmbedderConfig from autointent.context.optimization_info import EmbeddingArtifact from autointent.custom_types import ListOfLabels from autointent.metrics import RETRIEVAL_METRICS_MULTICLASS, RETRIEVAL_METRICS_MULTILABEL from autointent.modules.abc import BaseEmbedding -from autointent.schemas import EmbedderConfig class RetrievalAimedEmbedding(BaseEmbedding): diff --git a/autointent/modules/scoring/_description/description.py b/autointent/modules/scoring/_description/description.py index 4fec600fe..e45c8c051 100644 --- a/autointent/modules/scoring/_description/description.py +++ b/autointent/modules/scoring/_description/description.py @@ -9,11 +9,11 @@ from sklearn.metrics.pairwise import cosine_similarity from autointent import Context, Embedder +from autointent.configs import EmbedderConfig, TaskTypeEnum from autointent.context.optimization_info import ScorerArtifact from autointent.custom_types import ListOfLabels from autointent.metrics import SCORING_METRICS_MULTICLASS, SCORING_METRICS_MULTILABEL from autointent.modules.abc import BaseScorer -from autointent.schemas import EmbedderConfig, TaskTypeEnum class DescriptionScorer(BaseScorer): @@ -38,7 +38,7 @@ class DescriptionScorer(BaseScorer): def __init__( self, - embedder_config: EmbedderConfig | str | dict[str, Any], + embedder_config: EmbedderConfig | str | dict[str, Any] | None = None, temperature: PositiveFloat = 1.0, ) -> None: """ @@ -66,7 +66,7 @@ def from_context( :return: Initialized DescriptionScorer instance. 
""" if embedder_config is None: - embedder_config = context.optimization_info.get_best_embedder() + embedder_config = context.resolve_embedder() return cls( temperature=temperature, diff --git a/autointent/modules/scoring/_dnnc/dnnc.py b/autointent/modules/scoring/_dnnc/dnnc.py index 86018b3d8..8ea04c7da 100644 --- a/autointent/modules/scoring/_dnnc/dnnc.py +++ b/autointent/modules/scoring/_dnnc/dnnc.py @@ -9,9 +9,9 @@ from pydantic import PositiveInt from autointent import Context, Ranker, VectorIndex +from autointent.configs import CrossEncoderConfig, EmbedderConfig from autointent.custom_types import ListOfLabels from autointent.modules.abc import BaseScorer -from autointent.schemas import CrossEncoderConfig, EmbedderConfig logger = logging.getLogger(__name__) @@ -78,8 +78,8 @@ class DNNCScorer(BaseScorer): def __init__( self, k: PositiveInt, - cross_encoder_config: CrossEncoderConfig | str | dict[str, Any], - embedder_config: EmbedderConfig | str | dict[str, Any], + cross_encoder_config: CrossEncoderConfig | str | dict[str, Any] | None = None, + embedder_config: EmbedderConfig | str | dict[str, Any] | None = None, ) -> None: """ Initialize the DNNCScorer. @@ -96,8 +96,8 @@ def __init__( def from_context( cls, context: Context, - cross_encoder_config: CrossEncoderConfig | str, k: PositiveInt, + cross_encoder_config: CrossEncoderConfig | str | None = None, embedder_config: EmbedderConfig | str | None = None, ) -> "DNNCScorer": """ @@ -110,7 +110,10 @@ def from_context( :return: Initialized DNNCScorer instance. """ if embedder_config is None: - embedder_config = context.optimization_info.get_best_embedder() + embedder_config = context.resolve_embedder() + + if cross_encoder_config is None: + cross_encoder_config = context.resolve_ranker() return cls( k=k, diff --git a/autointent/modules/scoring/_knn/knn.py b/autointent/modules/scoring/_knn/knn.py index 4efe8ee8d..91ae16efe 100644 --- a/autointent/modules/scoring/_knn/knn.py +++ b/autointent/modules/scoring/_knn/knn.py @@ -7,9 +7,9 @@ from pydantic import PositiveInt from autointent import Context, VectorIndex +from autointent.configs import EmbedderConfig from autointent.custom_types import WEIGHT_TYPES, ListOfLabels from autointent.modules.abc import BaseScorer -from autointent.schemas import EmbedderConfig from .weighting import apply_weights @@ -58,8 +58,8 @@ class KNNScorer(BaseScorer): def __init__( self, - embedder_config: EmbedderConfig | str | dict[str, Any], k: PositiveInt, + embedder_config: EmbedderConfig | str | dict[str, Any] | None = None, weights: WEIGHT_TYPES = "distance", ) -> None: """ @@ -81,7 +81,7 @@ def from_context( cls, context: Context, k: PositiveInt, - weights: WEIGHT_TYPES, + weights: WEIGHT_TYPES = "distance", embedder_config: EmbedderConfig | str | None = None, ) -> "KNNScorer": """ @@ -94,7 +94,7 @@ def from_context( :return: Initialized KNNScorer instance. 
""" if embedder_config is None: - embedder_config = context.optimization_info.get_best_embedder() + embedder_config = context.resolve_embedder() return cls( embedder_config=embedder_config, diff --git a/autointent/modules/scoring/_knn/rerank_scorer.py b/autointent/modules/scoring/_knn/rerank_scorer.py index edb68bd05..b18295de5 100644 --- a/autointent/modules/scoring/_knn/rerank_scorer.py +++ b/autointent/modules/scoring/_knn/rerank_scorer.py @@ -4,10 +4,11 @@ import numpy as np import numpy.typing as npt +from pydantic import PositiveInt from autointent import Context, Ranker +from autointent.configs import CrossEncoderConfig, EmbedderConfig from autointent.custom_types import WEIGHT_TYPES, ListOfLabels -from autointent.schemas import CrossEncoderConfig, EmbedderConfig from .knn import KNNScorer @@ -27,12 +28,12 @@ class RerankScorer(KNNScorer): def __init__( self, - cross_encoder_config: CrossEncoderConfig | str | dict[str, Any], - embedder_config: EmbedderConfig | str | dict[str, Any], k: int, weights: WEIGHT_TYPES, m: int | None = None, rank_threshold_cutoff: int | None = None, + cross_encoder_config: CrossEncoderConfig | str | dict[str, Any] | None = None, + embedder_config: EmbedderConfig | str | dict[str, Any] | None = None, ) -> None: """ Initialize the RerankScorer. @@ -63,10 +64,10 @@ def from_context( cls, context: Context, k: int, - weights: WEIGHT_TYPES, - cross_encoder_config: CrossEncoderConfig | str, + weights: WEIGHT_TYPES = "distance", + m: PositiveInt | None = None, + cross_encoder_config: CrossEncoderConfig | str | None = None, embedder_config: EmbedderConfig | str | None = None, - m: int | None = None, rank_threshold_cutoff: int | None = None, ) -> "RerankScorer": """ @@ -83,7 +84,10 @@ def from_context( :return: An instance of RerankScorer. """ if embedder_config is None: - embedder_config = context.optimization_info.get_best_embedder() + embedder_config = context.resolve_embedder() + + if cross_encoder_config is None: + cross_encoder_config = context.resolve_ranker() return cls( k=k, diff --git a/autointent/modules/scoring/_linear.py b/autointent/modules/scoring/_linear.py index 31221516f..949d263a0 100644 --- a/autointent/modules/scoring/_linear.py +++ b/autointent/modules/scoring/_linear.py @@ -8,9 +8,9 @@ from sklearn.multioutput import MultiOutputClassifier from autointent import Context, Embedder +from autointent.configs import EmbedderConfig, TaskTypeEnum from autointent.custom_types import ListOfLabels from autointent.modules.abc import BaseScorer -from autointent.schemas import EmbedderConfig, TaskTypeEnum class LinearScorer(BaseScorer): @@ -53,7 +53,7 @@ class LinearScorer(BaseScorer): def __init__( self, - embedder_config: EmbedderConfig | str | dict[str, Any], + embedder_config: EmbedderConfig | str | dict[str, Any] | None = None, cv: int = 3, n_jobs: int | None = None, seed: int = 0, @@ -85,7 +85,7 @@ def from_context( :return: Initialized LinearScorer instance. 
""" if embedder_config is None: - embedder_config = context.optimization_info.get_best_embedder() + embedder_config = context.resolve_embedder() return cls( embedder_config=embedder_config, diff --git a/autointent/modules/scoring/_mlknn/mlknn.py b/autointent/modules/scoring/_mlknn/mlknn.py index d1dd45fa8..c7decf195 100644 --- a/autointent/modules/scoring/_mlknn/mlknn.py +++ b/autointent/modules/scoring/_mlknn/mlknn.py @@ -7,9 +7,9 @@ from pydantic import NonNegativeInt, PositiveFloat, PositiveInt from autointent import Context, VectorIndex +from autointent.configs import EmbedderConfig from autointent.custom_types import ListOfLabels from autointent.modules.abc import BaseScorer -from autointent.schemas import EmbedderConfig class MLKnnScorer(BaseScorer): @@ -60,7 +60,7 @@ class MLKnnScorer(BaseScorer): def __init__( self, k: PositiveInt, - embedder_config: EmbedderConfig | str | dict[str, Any], + embedder_config: EmbedderConfig | str | dict[str, Any] | None = None, s: float = 1.0, ignore_first_neighbours: int = 0, ) -> None: @@ -97,7 +97,7 @@ def from_context( :return: Initialized MLKnnScorer instance. """ if embedder_config is None: - embedder_config = context.optimization_info.get_best_embedder() + embedder_config = context.resolve_embedder() return cls( k=k, diff --git a/autointent/modules/scoring/_sklearn/sklearn_scorer.py b/autointent/modules/scoring/_sklearn/sklearn_scorer.py index 8de70f09a..7b8b6620e 100644 --- a/autointent/modules/scoring/_sklearn/sklearn_scorer.py +++ b/autointent/modules/scoring/_sklearn/sklearn_scorer.py @@ -3,15 +3,14 @@ import numpy as np import numpy.typing as npt -from sklearn.linear_model import LogisticRegression from sklearn.multioutput import MultiOutputClassifier from sklearn.utils import all_estimators from typing_extensions import Self from autointent import Context, Embedder +from autointent.configs import EmbedderConfig, TaskTypeEnum from autointent.custom_types import ListOfLabels from autointent.modules.abc import BaseScorer -from autointent.schemas import EmbedderConfig, TaskTypeEnum logger = logging.getLogger(__name__) AVAILABLE_CLASSIFIERS = { @@ -44,28 +43,34 @@ class SklearnScorer(BaseScorer): def __init__( self, - embedder_config: EmbedderConfig | str | dict[str, Any], clf_name: str, - clf_args: dict[str, Any] | None = None, + embedder_config: EmbedderConfig | str | dict[str, Any] | None = None, + **clf_args: Any, # noqa: ANN401 ) -> None: """ Initialize the SklearnScorer. :param embedder_config: Config of the embedder model. :param clf_name: Name of the sklearn classifier to use. - :param clf_args: dictionary with the chosen sklearn classifier arguments, defaults to {}. + :param clf_args: dictionary with the chosen sklearn classifier arguments. """ self.embedder_config = EmbedderConfig.from_search_config(embedder_config) self.clf_name = clf_name - self.clf_args = clf_args or {} + + if AVAILABLE_CLASSIFIERS.get(self.clf_name): + self._base_clf = AVAILABLE_CLASSIFIERS[self.clf_name](**clf_args) + else: + msg = f"Class {self.clf_name} does not exist in sklearn or does not have predict_proba method" + logger.error(msg) + raise ValueError(msg) @classmethod def from_context( cls, context: Context, - clf_name: str = LogisticRegression.__name__, - clf_args: dict[str, Any] | None = None, + clf_name: str, embedder_config: EmbedderConfig | str | None = None, + **clf_args: float | str | bool, ) -> Self: """ Create a SklearnScorer instance using a Context object. @@ -77,12 +82,12 @@ def from_context( :return: Initialized SklearnScorer instance. 
""" if embedder_config is None: - embedder_config = context.optimization_info.get_best_embedder() + embedder_config = context.resolve_embedder() return cls( embedder_config=embedder_config, clf_name=clf_name, - clf_args=clf_args, + **clf_args, ) def fit( @@ -112,14 +117,8 @@ def fit( ) ) features = embedder.embed(utterances, TaskTypeEnum.classification) - if AVAILABLE_CLASSIFIERS.get(self.clf_name): - base_clf = AVAILABLE_CLASSIFIERS[self.clf_name](**self.clf_args) - else: - msg = f"Class {self.clf_name} does not exist in sklearn or does not have predict_proba method" - logger.error(msg) - raise ValueError(msg) - clf = MultiOutputClassifier(base_clf) if self._multilabel else base_clf + clf = MultiOutputClassifier(self._base_clf) if self._multilabel else self._base_clf clf.fit(features, labels) diff --git a/autointent/nodes/_optimization/_node_optimizer.py b/autointent/nodes/_optimization/_node_optimizer.py index 33fedf050..60adfbd1d 100644 --- a/autointent/nodes/_optimization/_node_optimizer.py +++ b/autointent/nodes/_optimization/_node_optimizer.py @@ -15,7 +15,7 @@ from autointent import Dataset from autointent.context import Context -from autointent.custom_types import NodeType, SamplerType +from autointent.custom_types import NodeType, SamplerType, SearchSpaceValidationMode from autointent.nodes.info import NODES_INFO @@ -185,7 +185,7 @@ def get_module_dump_dir(self, dump_dir: Path, module_name: str, j_combination: i dump_dir_.mkdir(parents=True, exist_ok=True) return str(dump_dir_) - def validate_nodes_with_dataset(self, dataset: Dataset) -> None: + def validate_nodes_with_dataset(self, dataset: Dataset, mode: SearchSpaceValidationMode) -> None: """ Validate nodes with dataset. @@ -193,16 +193,32 @@ def validate_nodes_with_dataset(self, dataset: Dataset) -> None: """ is_multilabel = dataset.multilabel + filtered_search_space = [] + for search_space in deepcopy(self.modules_search_spaces): - module_name = search_space.pop("module_name") + module_name = search_space["module_name"] module = self.node_info.modules_available[module_name] # todo add check for oos + messages = [] + + if module_name == "description" and not dataset.has_descriptions: + messages.append("DescriptionScorer cannot be used without intents descriptions.") + if is_multilabel and not module.supports_multilabel: - msg = f"Module '{module_name}' does not support multilabel datasets." - self._logger.error(msg) - raise ValueError(msg) + messages.append(f"Module '{module_name}' does not support multilabel datasets.") + if not is_multilabel and not module.supports_multiclass: - msg = f"Module '{module_name}' does not support multiclass datasets." 
- self._logger.error(msg) - raise ValueError(msg) + messages.append(f"Module '{module_name}' does not support multiclass datasets.") + + if len(messages) > 0: + msg = "\n".join(messages) + if mode == "raise": + self._logger.error(msg) + raise ValueError(msg) + if mode == "warning": + self._logger.warning(msg) + else: + filtered_search_space.append(search_space) + + self.modules_search_spaces = filtered_search_space diff --git a/autointent/nodes/info/_decision.py b/autointent/nodes/info/_decision.py index 305493679..b2a62667f 100644 --- a/autointent/nodes/info/_decision.py +++ b/autointent/nodes/info/_decision.py @@ -5,7 +5,7 @@ from autointent.custom_types import NodeType from autointent.metrics import DECISION_METRICS, DecisionMetricFn -from autointent.modules import DECISION_MODULES_MULTICLASS, DECISION_MODULES_MULTILABEL +from autointent.modules import DECISION_MODULES from autointent.modules.abc import BaseDecision from ._base import NodeInfo @@ -16,8 +16,6 @@ class DecisionNodeInfo(NodeInfo): metrics_available: ClassVar[Mapping[str, DecisionMetricFn]] = DECISION_METRICS - modules_available: ClassVar[dict[str, type[BaseDecision]]] = ( - DECISION_MODULES_MULTICLASS | DECISION_MODULES_MULTILABEL - ) + modules_available: ClassVar[dict[str, type[BaseDecision]]] = DECISION_MODULES node_type = NodeType.decision diff --git a/autointent/nodes/info/_embedding.py b/autointent/nodes/info/_embedding.py index ca2ffebc0..0c82e0e6f 100644 --- a/autointent/nodes/info/_embedding.py +++ b/autointent/nodes/info/_embedding.py @@ -12,7 +12,7 @@ RetrievalMetricFn, ScoringMetricFn, ) -from autointent.modules import EMBEDDING_MODULES_MULTICLASS, EMBEDDING_MODULES_MULTILABEL +from autointent.modules import EMBEDDING_MODULES from autointent.modules.abc import BaseEmbedding from ._base import NodeInfo @@ -28,8 +28,6 @@ class EmbeddingNodeInfo(NodeInfo): | SCORING_METRICS_MULTICLASS ) - modules_available: ClassVar[Mapping[str, type[BaseEmbedding]]] = ( - EMBEDDING_MODULES_MULTICLASS | EMBEDDING_MODULES_MULTILABEL - ) + modules_available: ClassVar[Mapping[str, type[BaseEmbedding]]] = EMBEDDING_MODULES node_type = NodeType.embedding diff --git a/autointent/nodes/info/_scoring.py b/autointent/nodes/info/_scoring.py index a799c4dd0..23a794719 100644 --- a/autointent/nodes/info/_scoring.py +++ b/autointent/nodes/info/_scoring.py @@ -5,7 +5,7 @@ from autointent.custom_types import NodeType from autointent.metrics import SCORING_METRICS_MULTICLASS, SCORING_METRICS_MULTILABEL, ScoringMetricFn -from autointent.modules import SCORING_MODULES_MULTICLASS, SCORING_MODULES_MULTILABEL +from autointent.modules import SCORING_MODULES from autointent.modules.abc import BaseScorer from ._base import NodeInfo @@ -16,8 +16,6 @@ class ScoringNodeInfo(NodeInfo): metrics_available: ClassVar[Mapping[str, ScoringMetricFn]] = SCORING_METRICS_MULTICLASS | SCORING_METRICS_MULTILABEL - modules_available: ClassVar[Mapping[str, type[BaseScorer]]] = ( - SCORING_MODULES_MULTICLASS | SCORING_MODULES_MULTILABEL - ) + modules_available: ClassVar[Mapping[str, type[BaseScorer]]] = SCORING_MODULES node_type = NodeType.scoring diff --git a/autointent/nodes/schemes.py b/autointent/nodes/schemes.py index 4f91572f5..4e33e1314 100644 --- a/autointent/nodes/schemes.py +++ b/autointent/nodes/schemes.py @@ -6,8 +6,7 @@ from pydantic import BaseModel, Field, PositiveInt, RootModel -from autointent.configs import DataConfig, LoggingConfig, VectorIndexConfig -from autointent.custom_types import NodeType, SamplerType +from autointent.custom_types import NodeType 
from autointent.modules.abc import BaseModule from autointent.nodes._optimization._node_optimizer import ParamSpaceFloat, ParamSpaceInt from autointent.nodes.info import DecisionNodeInfo, EmbeddingNodeInfo, RegexNodeInfo, ScoringNodeInfo @@ -179,21 +178,3 @@ def __getitem__(self, item: int) -> SearchSpaceTypes: :return: Item """ return self.root[item] - - -class TaskConfig(BaseModel): - """Configuration for the task to optimize.""" - - search_space: OptimizationSearchSpaceConfig - """Path to the search space configuration file. If None, the default search space will be used""" - sampler: SamplerType = "brute" - - -class OptimizationConfig(BaseModel): - """Configuration for the optimization process.""" - - data_config: DataConfig = DataConfig() - task_config: TaskConfig - logging_config: LoggingConfig = LoggingConfig() - vector_index_config: VectorIndexConfig = VectorIndexConfig() - seed: PositiveInt = 42 diff --git a/autointent/schemas/__init__.py b/autointent/schemas/__init__.py index f9fb927f4..98b99dd0c 100644 --- a/autointent/schemas/__init__.py +++ b/autointent/schemas/__init__.py @@ -1,25 +1,15 @@ """Data models related to :class:`autointent.Dataset`.""" from ._schemas import ( - CrossEncoderConfig, - EmbedderConfig, Intent, - LLMConfig, Sample, - STModelConfig, Tag, TagsList, - TaskTypeEnum, ) __all__ = [ - "CrossEncoderConfig", - "EmbedderConfig", "Intent", - "LLMConfig", - "STModelConfig", "Sample", "Tag", "TagsList", - "TaskTypeEnum", ] diff --git a/autointent/schemas/_schemas.py b/autointent/schemas/_schemas.py index 9c048954e..8005a91d2 100644 --- a/autointent/schemas/_schemas.py +++ b/autointent/schemas/_schemas.py @@ -4,19 +4,13 @@ """ import json -from enum import Enum from pathlib import Path from typing import Any from pydantic import ( - AnyHttpUrl, BaseModel, - Field, - NonNegativeFloat, - PositiveInt, model_validator, ) -from typing_extensions import Self from autointent.custom_types import LabelWithOOS @@ -125,105 +119,3 @@ class Intent(BaseModel): regex_full_match: list[str] = [] regex_partial_match: list[str] = [] description: str | None = None - - -class ModelConfig(BaseModel): - batch_size: PositiveInt = Field(32, description="Batch size for model inference.") - max_length: PositiveInt | None = Field(None, description="Maximum length of input sequences.") - - -class LLMConfig(ModelConfig): - temperature: NonNegativeFloat | None = Field(None, description="Temperature for sampling from the model.") - base_url: AnyHttpUrl | None = Field(..., description="Base URL for the model API.") - token: str | None = Field(..., description="API token for the model.") - extra_body: dict[str, Any] | None = Field(None, description="Extra body for the model API.") - - -class STModelConfig(ModelConfig): - model_name: str = Field(..., description="Name of the hugging face model.") - device: str | None = Field(None, description="Torch notation for CPU or CUDA.") - - @classmethod - def from_search_config(cls, values: dict[str, Any] | str | BaseModel) -> Self: - """Validate the model configuration. - - :param values: Model configuration values. If a string is provided, it is converted to a dictionary. 
- """ - if isinstance(values, BaseModel): - return values # type: ignore[return-value] - if isinstance(values, str): - return cls(model_name=values) - return cls(**values) - - -class TaskTypeEnum(Enum): - """Enum for different types of prompts.""" - - default = "default" - classification = "classification" - cluster = "cluster" - query = "query" - passage = "passage" - sts = "sts" - - -class EmbedderConfig(STModelConfig): - default_prompt: str | None = Field( - None, description="Default prompt for the model. This is used when no task specific prompt is not provided." - ) - classifier_prompt: str | None = Field(None, description="Prompt for classifier.") - cluster_prompt: str | None = Field(None, description="Prompt for clustering.") - sts_prompt: str | None = Field(None, description="Prompt for finding most similar sentences.") - query_prompt: str | None = Field(None, description="Prompt for query.") - passage_prompt: str | None = Field(None, description="Prompt for passage.") - - def get_prompt_config(self) -> dict[str, str] | None: - """Get the prompt config for the given prompt type. - - :return: The prompt config for the given prompt type. - """ - prompts = {} - if self.default_prompt: - prompts[TaskTypeEnum.default.value] = self.default_prompt - if self.classifier_prompt: - prompts[TaskTypeEnum.classification.value] = self.classifier_prompt - if self.cluster_prompt: - prompts[TaskTypeEnum.cluster.value] = self.cluster_prompt - if self.query_prompt: - prompts[TaskTypeEnum.query.value] = self.query_prompt - if self.passage_prompt: - prompts[TaskTypeEnum.passage.value] = self.passage_prompt - if self.sts_prompt: - prompts[TaskTypeEnum.sts.value] = self.sts_prompt - return prompts if len(prompts) > 0 else None - - def get_prompt_type(self, prompt_type: TaskTypeEnum | str | None) -> str | None: # noqa: PLR0911 - """Get the prompt type for the given task type. - - :param prompt_type: Task type for which to get the prompt. - - :return: The prompt for the given task type. - """ - if prompt_type is None: - return self.default_prompt - if prompt_type == TaskTypeEnum.classification: - return self.classifier_prompt - if prompt_type == TaskTypeEnum.cluster: - return self.cluster_prompt - if prompt_type == TaskTypeEnum.query: - return self.query_prompt - if prompt_type == TaskTypeEnum.passage: - return self.passage_prompt - if prompt_type == TaskTypeEnum.sts: - return self.sts_prompt - if prompt_type == TaskTypeEnum.default: - return self.default_prompt - return None - - use_cache: bool = Field(False, description="Whether to use embeddings caching.") - - -class CrossEncoderConfig(STModelConfig): - train_head: bool = Field( - False, description="Whether to train the head of the model. If False, LogReg will be trained." - ) diff --git a/autointent/utils.py b/autointent/utils.py index 7947fa5bc..f56f408d4 100644 --- a/autointent/utils.py +++ b/autointent/utils.py @@ -6,17 +6,7 @@ import yaml - -def load_default_search_space(multilabel: bool) -> list[dict[str, Any]]: - """ - Load configuration from the given path or load default configuration. 
- - :param multilabel: Whether to use multilabel or not - :return: - """ - config_name = "default-multilabel-config.yaml" if multilabel else "default-multiclass-config.yaml" - path = ires.files("autointent._datafiles").joinpath(config_name) - return load_search_space(path) # type: ignore[arg-type] +from autointent.custom_types import SearchSpacePresets def load_search_space(path: Path | str) -> list[dict[str, Any]]: @@ -28,3 +18,14 @@ def load_search_space(path: Path | str) -> list[dict[str, Any]]: """ with Path(path).open() as file: return yaml.safe_load(file) # type: ignore[no-any-return] + + +def load_preset(name: SearchSpacePresets) -> dict[str, Any]: + """ + Load one of preset search spaces. + + :param name: name of a presets. + """ + path = ires.files("autointent._presets").joinpath(name + ".yaml") + with path.open() as file: + return yaml.safe_load(file) # type: ignore[no-any-return] diff --git a/docs/optimizer_config.schema.json b/docs/optimizer_config.schema.json index f0eb70c38..e0cba086f 100644 --- a/docs/optimizer_config.schema.json +++ b/docs/optimizer_config.schema.json @@ -102,6 +102,7 @@ "title": "Max Length" }, "model_name": { + "default": "cross-encoder/ms-marco-MiniLM-L-6-v2", "description": "Name of the hugging face model.", "title": "Model Name", "type": "string" @@ -126,9 +127,6 @@ "type": "boolean" } }, - "required": [ - "model_name" - ], "title": "CrossEncoderConfig", "type": "object" }, @@ -153,20 +151,6 @@ "description": "Number of trials", "title": "N Trials" }, - "cross_encoder_config": { - "items": { - "anyOf": [ - { - "$ref": "#/$defs/CrossEncoderConfig" - }, - { - "type": "string" - } - ] - }, - "title": "Cross Encoder Config", - "type": "array" - }, "k": { "anyOf": [ { @@ -182,6 +166,26 @@ ], "title": "K" }, + "cross_encoder_config": { + "default": [ + null + ], + "items": { + "anyOf": [ + { + "$ref": "#/$defs/CrossEncoderConfig" + }, + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "title": "Cross Encoder Config", + "type": "array" + }, "embedder_config": { "default": [ null @@ -205,7 +209,6 @@ }, "required": [ "module_name", - "cross_encoder_config", "k" ], "title": "DNNCScorerInitModel", @@ -417,6 +420,7 @@ "title": "Max Length" }, "model_name": { + "default": "sentence-transformers/all-MiniLM-L6-v2", "description": "Name of the hugging face model.", "title": "Model Name", "type": "string" @@ -519,9 +523,6 @@ "type": "boolean" } }, - "required": [ - "model_name" - ], "title": "EmbedderConfig", "type": "object" }, @@ -714,6 +715,9 @@ "title": "K" }, "weights": { + "default": [ + "distance" + ], "items": { "enum": [ "uniform", @@ -748,8 +752,7 @@ }, "required": [ "module_name", - "k", - "weights" + "k" ], "title": "KNNScorerInitModel", "type": "object" @@ -1248,6 +1251,9 @@ "title": "K" }, "weights": { + "default": [ + "distance" + ], "items": { "enum": [ "uniform", @@ -1259,7 +1265,35 @@ "title": "Weights", "type": "array" }, + "m": { + "anyOf": [ + { + "items": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceInt" + } + ], + "default": [ + null + ], + "title": "M" + }, "cross_encoder_config": { + "default": [ + null + ], "items": { "anyOf": [ { @@ -1267,6 +1301,9 @@ }, { "type": "string" + }, + { + "type": "null" } ] }, @@ -1293,30 +1330,6 @@ "title": "Embedder Config", "type": "array" }, - "m": { - "anyOf": [ - { - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - 
"$ref": "#/$defs/ParamSpaceInt" - } - ], - "default": [ - null - ], - "title": "M" - }, "rank_threshold_cutoff": { "anyOf": [ { @@ -1344,9 +1357,7 @@ }, "required": [ "module_name", - "k", - "weights", - "cross_encoder_config" + "k" ], "title": "RerankScorerInitModel", "type": "object" @@ -1519,82 +1530,58 @@ "title": "N Trials" }, "clf_name": { - "default": [ - "LogisticRegression" - ], "items": { "type": "string" }, "title": "Clf Name", "type": "array" }, - "clf_args": { + "embedder_config": { "default": [ null ], "items": { "anyOf": [ { - "type": "object" + "$ref": "#/$defs/EmbedderConfig" + }, + { + "type": "string" }, { "type": "null" } ] }, - "title": "Clf Args", + "title": "Embedder Config", "type": "array" }, - "embedder_config": { - "default": [ - null - ], + "clf_args": { "items": { "anyOf": [ { - "$ref": "#/$defs/EmbedderConfig" + "type": "number" }, { "type": "string" }, { - "type": "null" + "type": "boolean" } ] }, - "title": "Embedder Config", + "title": "Clf Args", "type": "array" } }, "required": [ - "module_name" + "module_name", + "clf_name", + "clf_args" ], "title": "SklearnScorerInitModel", "type": "object" }, - "TaskConfig": { - "description": "Configuration for the task to optimize.", - "properties": { - "search_space": { - "$ref": "#/$defs/OptimizationSearchSpaceConfig" - }, - "sampler": { - "default": "brute", - "enum": [ - "brute", - "tpe", - "random" - ], - "title": "Sampler", - "type": "string" - } - }, - "required": [ - "search_space" - ], - "title": "TaskConfig", - "type": "object" - }, "ThresholdDecisionInitModel": { "properties": { "module_name": { @@ -1716,19 +1703,6 @@ ], "title": "TunableDecisionInitModel", "type": "object" - }, - "VectorIndexConfig": { - "description": "Configuration for the vector index.", - "properties": { - "save_db": { - "default": false, - "description": "Whether to save the vector index database or not", - "title": "Save Db", - "type": "boolean" - } - }, - "title": "VectorIndexConfig", - "type": "object" } }, "description": "Configuration for the optimization process.", @@ -1742,8 +1716,8 @@ "separation_ratio": 0.5 } }, - "task_config": { - "$ref": "#/$defs/TaskConfig" + "search_space": { + "$ref": "#/$defs/OptimizationSearchSpaceConfig" }, "logging_config": { "$ref": "#/$defs/LoggingConfig", @@ -1755,12 +1729,42 @@ "report_to": null } }, - "vector_index_config": { - "$ref": "#/$defs/VectorIndexConfig", + "embedder_config": { + "$ref": "#/$defs/EmbedderConfig", + "default": { + "batch_size": 32, + "max_length": null, + "model_name": "sentence-transformers/all-MiniLM-L6-v2", + "device": null, + "default_prompt": null, + "classifier_prompt": null, + "cluster_prompt": null, + "sts_prompt": null, + "query_prompt": null, + "passage_prompt": null, + "use_cache": false + } + }, + "cross_encoder_config": { + "$ref": "#/$defs/CrossEncoderConfig", "default": { - "save_db": false + "batch_size": 32, + "max_length": null, + "model_name": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "device": null, + "train_head": false } }, + "sampler": { + "default": "brute", + "enum": [ + "brute", + "tpe", + "random" + ], + "title": "Sampler", + "type": "string" + }, "seed": { "default": 42, "exclusiveMinimum": 0, @@ -1769,7 +1773,7 @@ } }, "required": [ - "task_config" + "search_space" ], "title": "OptimizationConfig", "type": "object" diff --git a/docs/optimizer_search_space_config.schema.json b/docs/optimizer_search_space_config.schema.json index c0409c1fc..67932b98b 100644 --- a/docs/optimizer_search_space_config.schema.json +++ 
b/docs/optimizer_search_space_config.schema.json @@ -101,11 +101,6 @@ "description": "Maximum length of input sequences.", "title": "Max Length" }, - "model_name": { - "description": "Name of the hugging face model.", - "title": "Model Name", - "type": "string" - }, "device": { "anyOf": [ { @@ -119,6 +114,12 @@ "description": "Torch notation for CPU or CUDA.", "title": "Device" }, + "model_name": { + "default": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "description": "Name of the hugging face model.", + "title": "Model Name", + "type": "string" + }, "train_head": { "default": false, "description": "Whether to train the head of the model. If False, LogReg will be trained.", @@ -126,9 +127,6 @@ "type": "boolean" } }, - "required": [ - "model_name" - ], "title": "CrossEncoderConfig", "type": "object" }, @@ -153,20 +151,6 @@ "description": "Number of trials", "title": "N Trials" }, - "cross_encoder_config": { - "items": { - "anyOf": [ - { - "$ref": "#/$defs/CrossEncoderConfig" - }, - { - "type": "string" - } - ] - }, - "title": "Cross Encoder Config", - "type": "array" - }, "k": { "anyOf": [ { @@ -182,6 +166,26 @@ ], "title": "K" }, + "cross_encoder_config": { + "default": [ + null + ], + "items": { + "anyOf": [ + { + "$ref": "#/$defs/CrossEncoderConfig" + }, + { + "type": "string" + }, + { + "type": "null" + } + ] + }, + "title": "Cross Encoder Config", + "type": "array" + }, "embedder_config": { "default": [ null @@ -205,7 +209,6 @@ }, "required": [ "module_name", - "cross_encoder_config", "k" ], "title": "DNNCScorerInitModel", @@ -369,11 +372,6 @@ "description": "Maximum length of input sequences.", "title": "Max Length" }, - "model_name": { - "description": "Name of the hugging face model.", - "title": "Model Name", - "type": "string" - }, "device": { "anyOf": [ { @@ -387,6 +385,12 @@ "description": "Torch notation for CPU or CUDA.", "title": "Device" }, + "model_name": { + "default": "sentence-transformers/all-MiniLM-L6-v2", + "description": "Name of the hugging face model.", + "title": "Model Name", + "type": "string" + }, "default_prompt": { "anyOf": [ { @@ -472,9 +476,6 @@ "type": "boolean" } }, - "required": [ - "model_name" - ], "title": "EmbedderConfig", "type": "object" }, @@ -667,6 +668,9 @@ "title": "K" }, "weights": { + "default": [ + "distance" + ], "items": { "enum": [ "uniform", @@ -701,8 +705,7 @@ }, "required": [ "module_name", - "k", - "weights" + "k" ], "title": "KNNScorerInitModel", "type": "object" @@ -1111,6 +1114,9 @@ "title": "K" }, "weights": { + "default": [ + "distance" + ], "items": { "enum": [ "uniform", @@ -1122,7 +1128,35 @@ "title": "Weights", "type": "array" }, + "m": { + "anyOf": [ + { + "items": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceInt" + } + ], + "default": [ + null + ], + "title": "M" + }, "cross_encoder_config": { + "default": [ + null + ], "items": { "anyOf": [ { @@ -1130,6 +1164,9 @@ }, { "type": "string" + }, + { + "type": "null" } ] }, @@ -1156,60 +1193,27 @@ "title": "Embedder Config", "type": "array" }, - "m": { - "anyOf": [ - { - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceInt" - } - ], - "default": [ - null - ], - "title": "M" - }, "rank_threshold_cutoff": { - "anyOf": [ - { - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceInt" - } - 
], "default": [ null ], - "title": "Rank Threshold Cutoff" + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "title": "Rank Threshold Cutoff", + "type": "array" } }, "required": [ "module_name", - "k", - "weights", - "cross_encoder_config" + "k" ], "title": "RerankScorerInitModel", "type": "object" @@ -1382,32 +1386,12 @@ "title": "N Trials" }, "clf_name": { - "default": [ - "LogisticRegression" - ], "items": { "type": "string" }, "title": "Clf Name", "type": "array" }, - "clf_args": { - "default": [ - null - ], - "items": { - "anyOf": [ - { - "type": "object" - }, - { - "type": "null" - } - ] - }, - "title": "Clf Args", - "type": "array" - }, "embedder_config": { "default": [ null @@ -1427,10 +1411,17 @@ }, "title": "Embedder Config", "type": "array" + }, + "clf_args": { + "items": {}, + "title": "Clf Args", + "type": "array" } }, "required": [ - "module_name" + "module_name", + "clf_name", + "clf_args" ], "title": "SklearnScorerInitModel", "type": "object" diff --git a/docs/source/index.rst b/docs/source/index.rst index 72c82976f..b02a6e6e7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -25,7 +25,7 @@ Example of building an intent classifier in a couple of lines of code: from autointent import Pipeline, Dataset dataset = Dataset.from_json(path_to_json) - pipeline = Pipeline.default_optimizer(multilabel=False) + pipeline = Pipeline.from_preset("light_extra") pipeline.fit(dataset) pipeline.predict(["show me my latest recent transactions"]) diff --git a/scripts/generate_json_schema_config.py b/scripts/generate_json_schema_config.py index 16e5035d6..a5e2615ea 100644 --- a/scripts/generate_json_schema_config.py +++ b/scripts/generate_json_schema_config.py @@ -1,7 +1,8 @@ import json from pathlib import Path -from autointent.nodes.schemes import OptimizationConfig, OptimizationSearchSpaceConfig +from autointent.nodes.schemes import OptimizationSearchSpaceConfig +from autointent import OptimizationConfig def generate_json_schema_search_space_config() -> None: diff --git a/tests/assets/configs/full_training.yaml b/tests/assets/configs/full_training.yaml index ba15a9196..73bfd6ed9 100644 --- a/tests/assets/configs/full_training.yaml +++ b/tests/assets/configs/full_training.yaml @@ -1,21 +1,20 @@ -task_config: - search_space: - - node_type: embedding - target_metric: retrieval_hit_rate - search_space: - - module_name: retrieval - k: [10] - embedder_config: - - model_name: sentence-transformers/all-MiniLM-L6-v2 - - node_type: scoring - target_metric: scoring_roc_auc - search_space: - - module_name: linear - - node_type: decision - target_metric: decision_accuracy - search_space: - - module_name: argmax - sampler: brute +search_space: + - node_type: embedding + target_metric: retrieval_hit_rate + search_space: + - module_name: retrieval + k: [10] + embedder_config: + - model_name: sentence-transformers/all-MiniLM-L6-v2 + - node_type: scoring + target_metric: scoring_roc_auc + search_space: + - module_name: linear + - node_type: decision + target_metric: decision_accuracy + search_space: + - module_name: argmax +sampler: brute data_config: scheme: ho n_folds: 3 @@ -25,4 +24,10 @@ logging_config: run_name: full_training vector_index_config: save_db: false -seed: 42 \ No newline at end of file +seed: 42 +embedder_config: + model_name: sentence-transformers/all-MiniLM-L6-v2 + use_cache: true +cross_encoder_config: + batch_size: 32 + model_name: cross-encoder/ms-marco-MiniLM-L-6-v2 \ No newline at end of file diff --git 
a/tests/assets/configs/multiclass.yaml b/tests/assets/configs/multiclass.yaml index e33e8e559..eedaf5df5 100644 --- a/tests/assets/configs/multiclass.yaml +++ b/tests/assets/configs/multiclass.yaml @@ -19,12 +19,6 @@ train_head: true - avsolatorio/GIST-small-Embedding-v0 k: [1, 3] - - module_name: sklearn - embedder_config: - - sergeyzh/rubert-tiny-turbo - clf_name: - - LogisticRegression - - RandomForestClassifier - module_name: rerank k: [ 5, 10 ] weights: [uniform, distance, closest] diff --git a/tests/assets/configs/multilabel.yaml b/tests/assets/configs/multilabel.yaml index 159501a53..8b6cefc3a 100644 --- a/tests/assets/configs/multilabel.yaml +++ b/tests/assets/configs/multilabel.yaml @@ -15,12 +15,6 @@ - module_name: linear - module_name: mlknn k: [5] - - module_name: sklearn - embedder_config: - - model_name: sergeyzh/rubert-tiny-turbo - clf_name: - - LogisticRegression - - RandomForestClassifier - module_name: rerank k: [ 5, 10 ] weights: [ uniform, distance, closest ] diff --git a/tests/assets/configs/optuna.yaml b/tests/assets/configs/optuna.yaml index b775ab3f6..56ef453a2 100644 --- a/tests/assets/configs/optuna.yaml +++ b/tests/assets/configs/optuna.yaml @@ -17,6 +17,12 @@ step: 1 weights: [uniform, distance, closest] - module_name: linear + - module_name: sklearn + clf_name: + - RandomForestClassifier + n_estimators: + low: 5 + high: 10 - node_type: decision target_metric: decision_accuracy search_space: diff --git a/tests/callback/test_callback.py b/tests/callback/test_callback.py index 4bf9a8b95..4c4d38a72 100644 --- a/tests/callback/test_callback.py +++ b/tests/callback/test_callback.py @@ -5,7 +5,7 @@ from autointent import Context, Pipeline from autointent._callbacks import CallbackHandler, OptimizerCallback -from autointent.configs import DataConfig, LoggingConfig, VectorIndexConfig +from autointent.configs import DataConfig, LoggingConfig from tests.conftest import setup_environment @@ -83,7 +83,6 @@ def test_pipeline_callbacks(dataset): ] pipeline_optimizer = Pipeline.from_search_space(search_space) context = Context() - context.configure_vector_index(VectorIndexConfig(save_db=True)) context.configure_logging(LoggingConfig(run_name="dummy_run_name", project_dir=project_dir, dump_modules=False)) context.callback_handler = CallbackHandler([DummyCallback]) context.set_dataset(dataset, DataConfig(scheme="ho", separate_nodes=True)) diff --git a/tests/configs/test_full_config.py b/tests/configs/test_full_config.py index 1398aa3de..b07323712 100644 --- a/tests/configs/test_full_config.py +++ b/tests/configs/test_full_config.py @@ -1,7 +1,7 @@ import pytest from pydantic import ValidationError -from autointent.nodes.schemes import OptimizationConfig +from autointent import OptimizationConfig from tests.conftest import get_search_space diff --git a/tests/configs/test_scoring.py b/tests/configs/test_scoring.py index 9877d3add..e95d32be7 100644 --- a/tests/configs/test_scoring.py +++ b/tests/configs/test_scoring.py @@ -46,12 +46,12 @@ def valid_scoring_config(): "weights": ["distance"], "rank_threshold_cutoff": [None, 3], }, - { - "module_name": "sklearn", - "embedder_config": ["sentence-transformers/all-MiniLM-L6-v2"], - "clf_name": ["LogisticRegression"], - "clf_args": [{"C": 1.0}, {"C": 0.5}], - }, + # { + # "module_name": "sklearn", + # "embedder_config": ["sentence-transformers/all-MiniLM-L6-v2"], + # "clf_name": ["LogisticRegression"], + # "clf_args": [{"C": 1.0}, {"C": 0.5}], + # }, ], } ] diff --git a/tests/conftest.py b/tests/conftest.py index 729123f1e..fb1ed3a4a 
100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,7 +33,7 @@ def dataset_no_oos(): return Dataset.from_json(path) -TaskType = Literal["multiclass", "multilabel", "description", "optuna", "light", "full_training"] +TaskType = Literal["multiclass", "multilabel", "description", "optuna", "light"] def get_search_space_path(task_type: TaskType): diff --git a/tests/context/test_vector_index.py b/tests/context/test_vector_index.py index 2a6b6b789..ee04eeb71 100644 --- a/tests/context/test_vector_index.py +++ b/tests/context/test_vector_index.py @@ -1,7 +1,7 @@ import pytest from autointent import VectorIndex -from autointent.schemas import EmbedderConfig +from autointent.configs import EmbedderConfig @pytest.fixture diff --git a/tests/modules/scoring/test_sklearn.py b/tests/modules/scoring/test_sklearn.py index a31f380af..b2c041f1f 100644 --- a/tests/modules/scoring/test_sklearn.py +++ b/tests/modules/scoring/test_sklearn.py @@ -7,7 +7,13 @@ def test_base_sklearn(dataset): data_handler = DataHandler(dataset) - scorer = SklearnScorer(embedder_config="sergeyzh/rubert-tiny-turbo", clf_name="LogisticRegression") + scorer = SklearnScorer( + embedder_config="sergeyzh/rubert-tiny-turbo", + clf_name="LogisticRegression", + penalty="elasticnet", + solver="saga", + l1_ratio=0.5, + ) scorer.fit(data_handler.train_utterances(0), data_handler.train_labels(0)) test_data = [ @@ -22,12 +28,12 @@ def test_base_sklearn(dataset): np.testing.assert_almost_equal( np.array( [ - [0.1853835, 0.37123936, 0.19039844, 0.2529787], - [0.18256407, 0.34468965, 0.20265235, 0.27009393], - [0.20398149, 0.32502411, 0.20803067, 0.26296374], - [0.19607862, 0.31059503, 0.20248233, 0.29084402], - [0.18350756, 0.40184831, 0.17685331, 0.23779081], - ], + [0.222, 0.287, 0.219, 0.271], + [0.222, 0.287, 0.219, 0.271], + [0.222, 0.287, 0.219, 0.271], + [0.222, 0.287, 0.219, 0.271], + [0.222, 0.287, 0.219, 0.271], + ] ), predictions, decimal=2, diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index c1d572a4c..7a844b68a 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -1,7 +1,7 @@ import pytest from autointent import Context, Dataset -from autointent.configs import DataConfig, LoggingConfig, VectorIndexConfig +from autointent.configs import DataConfig, LoggingConfig from autointent.nodes import NodeOptimizer from tests.conftest import get_dataset_path, setup_environment @@ -78,5 +78,4 @@ def get_context(multilabel): dataset = dataset.to_multilabel() res.set_dataset(dataset, DataConfig(scheme="ho", separate_nodes=True)) res.configure_logging(LoggingConfig(project_dir=project_dir, dump_modules=True)) - res.configure_vector_index(VectorIndexConfig()) return res diff --git a/tests/pipeline/test_inference.py b/tests/pipeline/test_inference.py index 71860b6b0..856ba81ba 100644 --- a/tests/pipeline/test_inference.py +++ b/tests/pipeline/test_inference.py @@ -1,7 +1,7 @@ import pytest from autointent import Pipeline -from autointent.configs import LoggingConfig, VectorIndexConfig +from autointent.configs import LoggingConfig from tests.conftest import get_search_space, setup_environment @@ -16,7 +16,6 @@ def test_inference_config(dataset, task_type): pipeline_optimizer = Pipeline.from_search_space(search_space) pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True)) - pipeline_optimizer.set_config(VectorIndexConfig(save_db=True)) if task_type == "multilabel": dataset = dataset.to_multilabel() @@ -46,7 +45,6 @@ def test_inference_context(dataset, 
task_type): pipeline = Pipeline.from_search_space(search_space) pipeline.set_config(LoggingConfig(project_dir=project_dir, dump_modules=False, clear_ram=False)) - pipeline.set_config(VectorIndexConfig(save_db=True)) if task_type == "multilabel": dataset = dataset.to_multilabel() diff --git a/tests/pipeline/test_optimization.py b/tests/pipeline/test_optimization.py index 4ae6d7e3b..b2bebc7f6 100644 --- a/tests/pipeline/test_optimization.py +++ b/tests/pipeline/test_optimization.py @@ -1,9 +1,10 @@ +import importlib.resources as ires import os import pytest from autointent import Pipeline -from autointent.configs import DataConfig, LoggingConfig, VectorIndexConfig +from autointent.configs import DataConfig, LoggingConfig from tests.conftest import get_search_space, setup_environment @@ -14,15 +15,14 @@ def test_no_node_separation(dataset_no_oos): pipeline_optimizer = Pipeline.from_search_space(search_space) pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True)) - pipeline_optimizer.set_config(VectorIndexConfig()) pipeline_optimizer.set_config(DataConfig(scheme="ho", separate_nodes=False)) pipeline_optimizer.fit(dataset_no_oos, refit_after=False) def test_full_config(dataset_no_oos): - search_space = get_search_space("full_training") - pipeline_optimizer = Pipeline.from_optimization_config(search_space) + config_path = ires.files("tests.assets.configs").joinpath("full_training.yaml") + pipeline_optimizer = Pipeline.from_optimization_config(config_path) pipeline_optimizer.fit(dataset_no_oos, refit_after=False) @@ -37,7 +37,6 @@ def test_bayes(dataset, sampler): pipeline_optimizer = Pipeline.from_search_space(search_space) pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True)) - pipeline_optimizer.set_config(VectorIndexConfig()) pipeline_optimizer.set_config(DataConfig(scheme="ho", separate_nodes=True)) pipeline_optimizer.fit(dataset, refit_after=False, sampler=sampler) @@ -54,7 +53,6 @@ def test_cv(dataset, task_type): pipeline_optimizer = Pipeline.from_search_space(search_space) pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True)) - pipeline_optimizer.set_config(VectorIndexConfig()) pipeline_optimizer.set_config(DataConfig(scheme="cv", separate_nodes=True)) if task_type == "multilabel": @@ -77,7 +75,6 @@ def test_no_context_optimization(dataset, task_type): pipeline_optimizer = Pipeline.from_search_space(search_space) pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=False, clear_ram=False)) - pipeline_optimizer.set_config(VectorIndexConfig(save_db=True)) pipeline_optimizer.set_config(DataConfig(scheme="ho", separate_nodes=True)) if task_type == "multilabel": @@ -98,7 +95,6 @@ def test_dump_modules(dataset, task_type): pipeline_optimizer = Pipeline.from_search_space(search_space) pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True)) - pipeline_optimizer.set_config(VectorIndexConfig()) if task_type == "multilabel": dataset = dataset.to_multilabel() @@ -107,32 +103,3 @@ def test_dump_modules(dataset, task_type): context.dump() assert os.listdir(pipeline_optimizer.logging_config.dump_dir) - - -def test_validate_search_space_multiclass(dataset): - search_space = [ - { - "node_type": "decision", - "target_metric": "decision_accuracy", - "search_space": [{"module_name": "threshold", "thresh": [0.5]}, {"module_name": "adaptive"}], - }, - ] - - pipeline_optimizer = 
Pipeline.from_search_space(search_space) - with pytest.raises(ValueError, match="Module 'adaptive' does not support multiclass datasets."): - pipeline_optimizer.validate_modules(dataset) - - -def test_validate_search_space_multilabel(dataset): - dataset = dataset.to_multilabel() - - search_space = [ - { - "node_type": "decision", - "target_metric": "decision_accuracy", - "search_space": [{"module_name": "threshold", "thresh": [0.5]}, {"module_name": "argmax"}], - }, - ] - pipeline_optimizer = Pipeline.from_search_space(search_space) - with pytest.raises(ValueError, match="Module 'argmax' does not support multilabel datasets."): - pipeline_optimizer.validate_modules(dataset) diff --git a/tests/pipeline/test_presets.py b/tests/pipeline/test_presets.py new file mode 100644 index 000000000..e987ff62d --- /dev/null +++ b/tests/pipeline/test_presets.py @@ -0,0 +1,23 @@ +from typing import get_args + +import pytest + +from autointent import Pipeline +from autointent.configs import DataConfig, LoggingConfig +from autointent.custom_types import SearchSpacePresets +from tests.conftest import setup_environment + + +@pytest.mark.parametrize("preset", get_args(SearchSpacePresets)) +def test_presets(dataset, preset): + project_dir = setup_environment() + + pipeline_optimizer = Pipeline.from_preset(preset) + + if preset in ["heavy_extra", "light", "heavy"]: + return + + pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True)) + pipeline_optimizer.set_config(DataConfig(scheme="ho", separate_nodes=True)) + + pipeline_optimizer.fit(dataset, refit_after=False) diff --git a/tests/pipeline/test_validation.py b/tests/pipeline/test_validation.py new file mode 100644 index 000000000..31a94bb7d --- /dev/null +++ b/tests/pipeline/test_validation.py @@ -0,0 +1,43 @@ +from typing import get_args + +import pytest + +from autointent import Pipeline +from autointent.nodes.schemes import OptimizationSearchSpaceConfig +from tests.conftest import TaskType, get_search_space + + +def test_validate_search_space_multiclass(dataset): + search_space = [ + { + "node_type": "decision", + "target_metric": "decision_accuracy", + "search_space": [{"module_name": "threshold", "thresh": [0.5]}, {"module_name": "adaptive"}], + }, + ] + + pipeline_optimizer = Pipeline.from_search_space(search_space) + with pytest.raises(ValueError, match="Module 'adaptive' does not support multiclass datasets."): + pipeline_optimizer.validate_modules(dataset, mode="raise") + + +def test_validate_search_space_multilabel(dataset): + dataset = dataset.to_multilabel() + + search_space = [ + { + "node_type": "decision", + "target_metric": "decision_accuracy", + "search_space": [{"module_name": "threshold", "thresh": [0.5]}, {"module_name": "argmax"}], + }, + ] + pipeline_optimizer = Pipeline.from_search_space(search_space) + with pytest.raises(ValueError, match="Module 'argmax' does not support multilabel datasets."): + pipeline_optimizer.validate_modules(dataset, mode="raise") + + +# for now validation for sklearn scorer doesn't work +@pytest.mark.xfail +@pytest.mark.parametrize("search_space", get_args(TaskType)) +def test_search_space(search_space): + OptimizationSearchSpaceConfig(get_search_space(search_space)) diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index 08f526f21..000000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,10 +0,0 @@ -import pytest - -from autointent.nodes import OptimizationSearchSpaceConfig -from autointent.utils import load_default_search_space - - 
-@pytest.mark.parametrize("multilabel", [True, False]) -def test_load_default_configs(multilabel): - search_space = load_default_search_space(multilabel=multilabel) - OptimizationSearchSpaceConfig(search_space).model_dump() diff --git a/user_guides/basic_usage/03_automl.py b/user_guides/basic_usage/03_automl.py index c70c61e86..2512dcf91 100644 --- a/user_guides/basic_usage/03_automl.py +++ b/user_guides/basic_usage/03_automl.py @@ -27,12 +27,11 @@ """ ## Search Space -AutoIntent provides default search spaces for multi-label and single-label classification problems. One can utilize them by constructing %mddoclink(class,,Pipeline) with factory %mddoclink(method,Pipeline,default_optimizer): +AutoIntent provides default search spaces. One can utilize them by constructing %mddoclink(class,,Pipeline) with factory %mddoclink(method,Pipeline,from_preset): """ # %% -multiclass_pipeline = Pipeline.default_optimizer(multilabel=False) -multilabel_pipeline = Pipeline.default_optimizer(multilabel=True) +pipeline = Pipeline.from_preset("light_extra") # %% [markdown] """ @@ -42,10 +41,10 @@ # %% from pprint import pprint -from autointent.utils import load_default_search_space +from autointent.utils import load_preset -search_space = load_default_search_space(multilabel=True) -pprint(search_space) +preset = load_preset("light_extra") +pprint(preset) # %% [markdown] """ @@ -53,36 +52,13 @@ """ # %% -search_space[1]["search_space"][0]["k"] = [1, 3] -custom_pipeline = Pipeline.from_search_space(search_space) +preset["search_space"][1]["search_space"][0]["k"] = [1, 3] +custom_pipeline = Pipeline.from_optimization_config(preset) # %% [markdown] """ See tutorial %mddoclink(notebook,advanced.02_search_space_configuration) on how the search space is structured. """ -# %% [markdown] -""" -## Vector Index Settings - -%mddoclink(class,,VectorIndex) is one of the key utilities of AutoIntent. During the auto-configuration process, lots of retrieval is used. By modifying %mddoclink(class,configs,VectorIndexConfig) you can select whether to save built vector index into file system and where to save it. 
-
-Default options are the following:
-"""
-
-# %%
-from autointent.configs import VectorIndexConfig
-
-vector_index_config = VectorIndexConfig(save_db=False)
-
-# %% [markdown]
-"""
-- `save_db=False` tells AutoIntent to clear all the files after auto configuration is finished
-
-These settings can be applied in a familiar way:
-"""
-
-# %%
-custom_pipeline.set_config(vector_index_config)
 
 # %% [markdown]
 """
@@ -105,23 +81,21 @@
 
 # %%
 from autointent import Dataset, Pipeline
-from autointent.configs import LoggingConfig, VectorIndexConfig
-from autointent.utils import load_default_search_space
+from autointent.configs import LoggingConfig
+from autointent.utils import load_preset
 
 # load data
 dataset = Dataset.from_hub("AutoIntent/clinc150_subset")
 
 # customize search space
-search_space = load_default_search_space(multilabel=False)
+preset = load_preset("light_extra")
 
 # make pipeline
-custom_pipeline = Pipeline.from_search_space(search_space)
+custom_pipeline = Pipeline.from_optimization_config(preset)
 
 # custom settings
-vector_index_config = VectorIndexConfig()
 logging_config = LoggingConfig()
 
-custom_pipeline.set_config(vector_index_config)
 custom_pipeline.set_config(logging_config)
 
 # start auto-configuration
diff --git a/user_guides/basic_usage/04_inference.py b/user_guides/basic_usage/04_inference.py
index 00c4630a0..02ad3362a 100644
--- a/user_guides/basic_usage/04_inference.py
+++ b/user_guides/basic_usage/04_inference.py
@@ -47,19 +47,7 @@
 """
 There are several caveats.
 
-1. **Save vector databse.**
-
-When customizing configuration of pipeline optimization, you need to ensure that the option `save_db` of %mddoclink(class,configs,VectorIndexConfig) is set to `True`:
-"""
-# %%
-from autointent.configs import VectorIndexConfig
-
-# isn't compatible with "right-after-optimization" inference
-vector_index_config = VectorIndexConfig(save_db=False)
-
-# %% [markdown]
-"""
-2. **RAM usage.**
+**RAM usage.**
 
 You can optimize RAM usage by saving all modules to file system. Just set the following options:
 """
@@ -78,12 +66,11 @@
 
 # %%
 from autointent import Dataset, Pipeline
-from autointent.configs import LoggingConfig, VectorIndexConfig
+from autointent.configs import LoggingConfig
 
 dataset = Dataset.from_hub("AutoIntent/clinc150_subset")
 
 pipeline = Pipeline.from_search_space(search_space)
 pipeline.set_config(LoggingConfig(dump_modules=True, clear_ram=True))
-pipeline.set_config(VectorIndexConfig(save_db=True))
 
 # %% [markdown]
 """
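A minimal end-to-end sketch of the preset workflow this patch introduces, assembled from the user-guide and test changes above; it is not part of the diff itself. The "light_extra" preset name, the tweak of `k`, and the "AutoIntent/clinc150_subset" dataset are illustrative choices taken from the changed files, not the only supported values.

from autointent import Dataset, Pipeline
from autointent.configs import LoggingConfig
from autointent.utils import load_preset

# load one of the bundled preset search spaces by name;
# equivalently, Pipeline.from_preset("light_extra") skips the manual tweaking below
preset = load_preset("light_extra")

# optionally narrow the scoring node's search space before optimization,
# mirroring the updated 03_automl.py user guide
preset["search_space"][1]["search_space"][0]["k"] = [1, 3]

# build the pipeline from the full optimization config and keep RAM usage low
pipeline = Pipeline.from_optimization_config(preset)
pipeline.set_config(LoggingConfig(dump_modules=True, clear_ram=True))

dataset = Dataset.from_hub("AutoIntent/clinc150_subset")
pipeline.fit(dataset)
pipeline.predict(["show me my latest transactions"])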
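A second sketch shows the search-space validation exercised by the new tests/pipeline/test_validation.py. Only mode="raise" appears in this patch, so any other mode is an assumption; the dataset here is assumed to be single-label (multiclass).

from autointent import Dataset, Pipeline

dataset = Dataset.from_hub("AutoIntent/clinc150_subset")  # assumed single-label data

search_space = [
    {
        "node_type": "decision",
        "target_metric": "decision_accuracy",
        "search_space": [{"module_name": "threshold", "thresh": [0.5]}, {"module_name": "adaptive"}],
    },
]

pipeline_optimizer = Pipeline.from_search_space(search_space)

# 'adaptive' supports only multilabel data, so with mode="raise" this fails fast with
# "Module 'adaptive' does not support multiclass datasets." before any optimization runs
pipeline_optimizer.validate_modules(dataset, mode="raise")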