diff --git a/autointent/_optimization_config.py b/autointent/_optimization_config.py index f8b647a0c..572484675 100644 --- a/autointent/_optimization_config.py +++ b/autointent/_optimization_config.py @@ -1,15 +1,16 @@ +from typing import Any + from pydantic import BaseModel, PositiveInt from .configs import CrossEncoderConfig, DataConfig, EmbedderConfig, LoggingConfig from .custom_types import SamplerType -from .nodes.schemes import OptimizationSearchSpaceConfig class OptimizationConfig(BaseModel): """Configuration for the optimization process.""" data_config: DataConfig = DataConfig() - search_space: OptimizationSearchSpaceConfig + search_space: list[dict[str, Any]] logging_config: LoggingConfig = LoggingConfig() embedder_config: EmbedderConfig = EmbedderConfig() cross_encoder_config: CrossEncoderConfig = CrossEncoderConfig() diff --git a/autointent/_pipeline/_pipeline.py b/autointent/_pipeline/_pipeline.py index d433616e7..48c29d00a 100644 --- a/autointent/_pipeline/_pipeline.py +++ b/autointent/_pipeline/_pipeline.py @@ -122,7 +122,7 @@ def from_optimization_config(cls, config: dict[str, Any] | Path | str | Optimiza optimization_config = OptimizationConfig(**dict_params) pipeline = cls( - [NodeOptimizer(**node.model_dump()) for node in optimization_config.search_space], + [NodeOptimizer(**node) for node in optimization_config.search_space], optimization_config.sampler, optimization_config.seed, ) diff --git a/autointent/_presets/heavy.yaml b/autointent/_presets/heavy.yaml index 9c1e605c9..024f13e7f 100644 --- a/autointent/_presets/heavy.yaml +++ b/autointent/_presets/heavy.yaml @@ -14,7 +14,6 @@ search_space: k: low: 1 high: 20 - step: 1 n_trials: 10 - module_name: description temperature: diff --git a/autointent/_presets/heavy_moderate.yaml b/autointent/_presets/heavy_moderate.yaml index fca7e480e..922dbbc16 100644 --- a/autointent/_presets/heavy_moderate.yaml +++ b/autointent/_presets/heavy_moderate.yaml @@ -13,7 +13,6 @@ search_space: k: low: 1 high: 20 - 
step: 1 n_trials: 10 - module_name: description temperature: diff --git a/autointent/_presets/light.yaml b/autointent/_presets/light.yaml index a2d4a9a5b..0b08f2e63 100644 --- a/autointent/_presets/light.yaml +++ b/autointent/_presets/light.yaml @@ -13,7 +13,6 @@ search_space: k: low: 1 high: 20 - step: 1 n_trials: 10 - node_type: decision target_metric: decision_accuracy @@ -22,6 +21,6 @@ search_space: thresh: low: 0.1 high: 0.9 - step: 0.1 + n_trials: 10 - module_name: argmax sampler: tpe \ No newline at end of file diff --git a/autointent/_presets/light_extra.yaml b/autointent/_presets/light_extra.yaml index 4d5bb51ff..cb8396391 100644 --- a/autointent/_presets/light_extra.yaml +++ b/autointent/_presets/light_extra.yaml @@ -21,6 +21,6 @@ search_space: thresh: low: 0.1 high: 0.9 - n_trials: 10 + n_trials: 10 - module_name: argmax sampler: random \ No newline at end of file diff --git a/autointent/modules/decision/_adaptive.py b/autointent/modules/decision/_adaptive.py index c2325721a..ef16cb6d2 100644 --- a/autointent/modules/decision/_adaptive.py +++ b/autointent/modules/decision/_adaptive.py @@ -67,6 +67,10 @@ def __init__(self, search_space: list[FloatFromZeroToOne] | None = None) -> None """ self.search_space = search_space if search_space is not None else default_search_space + if any(val < 0 or val > 1 for val in self.search_space): + msg = "Unsupported items in `search_space` arg of `AdaptiveDecision` module" + raise ValueError(msg) + @classmethod def from_context(cls, context: Context, search_space: list[FloatFromZeroToOne] | None = None) -> "AdaptiveDecision": """ diff --git a/autointent/modules/decision/_jinoos.py b/autointent/modules/decision/_jinoos.py index 5e7e8f639..08d965a06 100644 --- a/autointent/modules/decision/_jinoos.py +++ b/autointent/modules/decision/_jinoos.py @@ -64,6 +64,10 @@ def __init__( """ self.search_space = np.array(search_space) if search_space is not None else default_search_space + if any(val < 0 or val > 1 for val in 
self.search_space): + msg = "Items of `search_space` of `JinoosDecision` module must be floats from zero to one" + raise ValueError(msg) + @classmethod def from_context(cls, context: Context, search_space: list[FloatFromZeroToOne] | None = None) -> "JinoosDecision": """ diff --git a/autointent/modules/decision/_threshold.py b/autointent/modules/decision/_threshold.py index f825525e4..ece61bb7f 100644 --- a/autointent/modules/decision/_threshold.py +++ b/autointent/modules/decision/_threshold.py @@ -82,7 +82,16 @@ def __init__( :param thresh: Threshold for the scores, shape (n_classes,) or float """ + val_error = False self.thresh = thresh if isinstance(thresh, float) else np.array(thresh) + if isinstance(thresh, float): + val_error = val_error or thresh < 0 or thresh > 1 + else: + val_error = val_error or any(val < 0 or val > 1 for val in thresh) + + if val_error: + msg = "`thresh` arg of `ThresholdDecision` must contain a float from zero to one (or list of floats)." + raise ValueError(msg) @classmethod def from_context( diff --git a/autointent/modules/decision/_tunable.py b/autointent/modules/decision/_tunable.py index 0f3ce7838..a2e43d03b 100644 --- a/autointent/modules/decision/_tunable.py +++ b/autointent/modules/decision/_tunable.py @@ -1,6 +1,6 @@ """Tunable predictor module.""" -from typing import Any, Literal +from typing import Any, Literal, get_args import numpy as np import numpy.typing as npt @@ -96,6 +96,14 @@ def __init__( self.seed = seed self.tags = tags + if self.n_optuna_trials < 0 or not isinstance(self.n_optuna_trials, int): + msg = "Unsupported value for `n_optuna_trials` of `TunableDecision` module" + raise ValueError(msg) + + if self.target_metric not in get_args(MetricType): + msg = "Unsupported value for `target_metric` of `TunableDecision` module" + raise TypeError(msg) + @classmethod def from_context( cls, context: Context, target_metric: MetricType = "decision_accuracy", n_optuna_trials: PositiveInt = 320 ) -> "TunableDecision": """ diff --git 
a/autointent/modules/embedding/_logreg.py b/autointent/modules/embedding/_logreg.py index 0b4aa6283..5be87fff2 100644 --- a/autointent/modules/embedding/_logreg.py +++ b/autointent/modules/embedding/_logreg.py @@ -63,6 +63,10 @@ def __init__( self.embedder_config = EmbedderConfig.from_search_config(embedder_config) self.cv = cv + if self.cv < 0 or not isinstance(self.cv, int): + msg = "`cv` argument of `LogregAimedEmbedding` must be a positive int" + raise ValueError(msg) + @classmethod def from_context( cls, diff --git a/autointent/modules/embedding/_retrieval.py b/autointent/modules/embedding/_retrieval.py index 0b41bc605..c71d85309 100644 --- a/autointent/modules/embedding/_retrieval.py +++ b/autointent/modules/embedding/_retrieval.py @@ -46,8 +46,8 @@ class RetrievalAimedEmbedding(BaseEmbedding): def __init__( self, - k: PositiveInt, embedder_config: EmbedderConfig | str | dict[str, Any], + k: PositiveInt = 10, ) -> None: """ Initialize the RetrievalAimedEmbedding. @@ -56,18 +56,19 @@ def __init__( :param embedder_config: Config of the embedder used for creating embeddings. """ self.k = k - if isinstance(embedder_config, dict): - embedder_config = EmbedderConfig(**embedder_config) - if isinstance(embedder_config, str): - embedder_config = EmbedderConfig(model_name=embedder_config) + embedder_config = EmbedderConfig.from_search_config(embedder_config) self.embedder_config = embedder_config + if self.k < 0 or not isinstance(self.k, int): + msg = "`k` argument of `RetrievalAimedEmbedding` must be a positive int" + raise ValueError(msg) + @classmethod def from_context( cls, context: Context, - k: PositiveInt, embedder_config: EmbedderConfig | str, + k: PositiveInt = 10, ) -> "RetrievalAimedEmbedding": """ Create an instance using a Context object. 
diff --git a/autointent/modules/scoring/_description/description.py b/autointent/modules/scoring/_description/description.py index e45c8c051..3ccc52f04 100644 --- a/autointent/modules/scoring/_description/description.py +++ b/autointent/modules/scoring/_description/description.py @@ -50,6 +50,10 @@ def __init__( self.temperature = temperature self.embedder_config = EmbedderConfig.from_search_config(embedder_config) + if self.temperature < 0 or not isinstance(self.temperature, float | int): + msg = "`temperature` argument of `DescriptionScorer` must be a positive float" + raise ValueError(msg) + @classmethod def from_context( cls, diff --git a/autointent/modules/scoring/_dnnc/dnnc.py b/autointent/modules/scoring/_dnnc/dnnc.py index 8ea04c7da..fa759ea89 100644 --- a/autointent/modules/scoring/_dnnc/dnnc.py +++ b/autointent/modules/scoring/_dnnc/dnnc.py @@ -92,6 +92,10 @@ def __init__( self.embedder_config = EmbedderConfig.from_search_config(embedder_config) self.k = k + if self.k < 0 or not isinstance(self.k, int): + msg = "`k` argument of `DNNCScorer` must be a positive int" + raise ValueError(msg) + @classmethod def from_context( cls, diff --git a/autointent/modules/scoring/_knn/knn.py b/autointent/modules/scoring/_knn/knn.py index 91ae16efe..adf107212 100644 --- a/autointent/modules/scoring/_knn/knn.py +++ b/autointent/modules/scoring/_knn/knn.py @@ -1,6 +1,6 @@ """KNNScorer class for k-nearest neighbors scoring.""" -from typing import Any +from typing import Any, get_args import numpy as np import numpy.typing as npt @@ -76,6 +76,14 @@ def __init__( self.k = k self.weights = weights + if self.k < 0 or not isinstance(self.k, int): + msg = "`k` argument of `KNNScorer` must be a positive int" + raise ValueError(msg) + + if weights not in get_args(WEIGHT_TYPES): + msg = f"`weights` argument of `KNNScorer` must be a literal from a list: {get_args(WEIGHT_TYPES)}" + raise TypeError(msg) + @classmethod def from_context( cls, diff --git 
a/autointent/modules/scoring/_knn/rerank_scorer.py b/autointent/modules/scoring/_knn/rerank_scorer.py index b18295de5..0ee9d3bf5 100644 --- a/autointent/modules/scoring/_knn/rerank_scorer.py +++ b/autointent/modules/scoring/_knn/rerank_scorer.py @@ -59,6 +59,16 @@ def __init__( self.m = k if m is None else m self.rank_threshold_cutoff = rank_threshold_cutoff + if self.m < 0 or not isinstance(self.m, int): + msg = "`m` argument of `RerankScorer` must be a positive int" + raise ValueError(msg) + + if self.rank_threshold_cutoff is not None and ( + self.rank_threshold_cutoff < 0 or not isinstance(self.rank_threshold_cutoff, int) + ): + msg = "`rank_threshold_cutoff` argument of `RerankScorer` must be a positive int or None" + raise ValueError(msg) + @classmethod def from_context( cls, diff --git a/autointent/modules/scoring/_linear.py b/autointent/modules/scoring/_linear.py index 949d263a0..4a43851c1 100644 --- a/autointent/modules/scoring/_linear.py +++ b/autointent/modules/scoring/_linear.py @@ -55,7 +55,6 @@ def __init__( self, embedder_config: EmbedderConfig | str | dict[str, Any] | None = None, cv: int = 3, - n_jobs: int | None = None, seed: int = 0, ) -> None: """ @@ -67,10 +66,13 @@ def __init__( :param seed: Random seed for reproducibility, defaults to 0. 
""" self.cv = cv - self.n_jobs = n_jobs self.seed = seed self.embedder_config = EmbedderConfig.from_search_config(embedder_config) + if self.cv < 0 or not isinstance(self.cv, int): + msg = "`cv` argument of `LinearScorer` must be a positive int" + raise ValueError(msg) + @classmethod def from_context( cls, @@ -125,7 +127,7 @@ def fit( base_clf = LogisticRegression() clf = MultiOutputClassifier(base_clf) else: - clf = LogisticRegressionCV(cv=self.cv, n_jobs=self.n_jobs, random_state=self.seed) + clf = LogisticRegressionCV(cv=self.cv, random_state=self.seed) clf.fit(features, labels) diff --git a/autointent/modules/scoring/_mlknn/mlknn.py b/autointent/modules/scoring/_mlknn/mlknn.py index c7decf195..fe42efcda 100644 --- a/autointent/modules/scoring/_mlknn/mlknn.py +++ b/autointent/modules/scoring/_mlknn/mlknn.py @@ -5,6 +5,7 @@ import numpy as np from numpy.typing import NDArray from pydantic import NonNegativeInt, PositiveFloat, PositiveInt +from typing_extensions import assert_never from autointent import Context, VectorIndex from autointent.configs import EmbedderConfig @@ -77,6 +78,13 @@ def __init__( self.s = s self.ignore_first_neighbours = ignore_first_neighbours + if self.k < 0 or not isinstance(self.k, int): + msg = "`k` argument of `MLKnnScorer` must be a positive int" + raise ValueError(msg) + + if not isinstance(self.s, float | int): + assert_never(self.s) + @classmethod def from_context( cls, diff --git a/autointent/modules/scoring/_sklearn/sklearn_scorer.py b/autointent/modules/scoring/_sklearn/sklearn_scorer.py index 7b8b6620e..64119b924 100644 --- a/autointent/modules/scoring/_sklearn/sklearn_scorer.py +++ b/autointent/modules/scoring/_sklearn/sklearn_scorer.py @@ -57,8 +57,9 @@ def __init__( self.embedder_config = EmbedderConfig.from_search_config(embedder_config) self.clf_name = clf_name - if AVAILABLE_CLASSIFIERS.get(self.clf_name): - self._base_clf = AVAILABLE_CLASSIFIERS[self.clf_name](**clf_args) + clf_type = 
AVAILABLE_CLASSIFIERS.get(self.clf_name, None) + if clf_type: + self._base_clf = clf_type(**clf_args) else: msg = f"Class {self.clf_name} does not exist in sklearn or does not have predict_proba method" logger.error(msg) diff --git a/autointent/nodes/__init__.py b/autointent/nodes/__init__.py index 1529ee331..c15db8d73 100644 --- a/autointent/nodes/__init__.py +++ b/autointent/nodes/__init__.py @@ -2,10 +2,8 @@ from ._inference_node import InferenceNode from ._optimization import NodeOptimizer -from .schemes import OptimizationSearchSpaceConfig __all__ = [ "InferenceNode", "NodeOptimizer", - "OptimizationSearchSpaceConfig", ] diff --git a/autointent/nodes/_optimization/_node_optimizer.py b/autointent/nodes/_optimization/_node_optimizer.py index 60adfbd1d..8771e4a81 100644 --- a/autointent/nodes/_optimization/_node_optimizer.py +++ b/autointent/nodes/_optimization/_node_optimizer.py @@ -1,6 +1,7 @@ """Node optimizer.""" import gc +import itertools as it import logging from copy import deepcopy from functools import partial @@ -33,6 +34,9 @@ class ParamSpaceFloat(BaseModel): log: bool = Field(False, description="Whether to use a logarithmic scale.") +logger = logging.getLogger(__name__) + + class NodeOptimizer: """Node optimizer class.""" @@ -50,6 +54,7 @@ def __init__( :param search_space: Search space for the optimization :param metrics: Metrics to optimize. 
""" + self._logger = logger self.node_type = node_type self.node_info = NODES_INFO[node_type] self.target_metric = target_metric @@ -58,8 +63,8 @@ def __init__( if self.target_metric not in self.metrics: self.metrics.append(self.target_metric) + self.validate_search_space(search_space) self.modules_search_spaces = search_space - self._logger = logging.getLogger(__name__) # TODO solve duplicate logging messages problem def fit(self, context: Context, sampler: SamplerType = "brute") -> None: """ @@ -151,27 +156,27 @@ def objective( def suggest(self, trial: Trial, search_space: dict[str, Any | list[Any]]) -> dict[str, Any]: res: dict[str, Any] = {} - def is_valid_param_space( - param_space: dict[str, Any], space_type: type[ParamSpaceInt | ParamSpaceFloat] - ) -> bool: - try: - space_type(**param_space) - return True # noqa: TRY300 - except ValueError: - return False - for param_name, param_space in search_space.items(): if isinstance(param_space, list): res[param_name] = trial.suggest_categorical(param_name, choices=param_space) - elif is_valid_param_space(param_space, ParamSpaceInt): + elif self._is_valid_param_space(param_space, ParamSpaceInt): res[param_name] = trial.suggest_int(param_name, **param_space) - elif is_valid_param_space(param_space, ParamSpaceFloat): + elif self._is_valid_param_space(param_space, ParamSpaceFloat): res[param_name] = trial.suggest_float(param_name, **param_space) else: msg = f"Unsupported type of param search space: {param_space}" raise TypeError(msg) return res + def _is_valid_param_space( + self, param_space: dict[str, Any], space_type: type[ParamSpaceInt | ParamSpaceFloat] + ) -> bool: + try: + space_type(**param_space) + return True # noqa: TRY300 + except ValueError: + return False + def get_module_dump_dir(self, dump_dir: Path, module_name: str, j_combination: int) -> str: """ Get module dump directory. 
@@ -222,3 +227,38 @@ def validate_nodes_with_dataset(self, dataset: Dataset, mode: SearchSpaceValidat filtered_search_space.append(search_space) self.modules_search_spaces = filtered_search_space + + def validate_search_space(self, search_space: list[dict[str, Any]]) -> None: + """Check if search space is configured correctly.""" + for module_search_space in search_space: + module_search_space_no_optuna, module_name = self._reformat_search_space(deepcopy(module_search_space)) + + for params_combination in it.product(*module_search_space_no_optuna.values()): + module_kwargs = dict(zip(module_search_space_no_optuna.keys(), params_combination, strict=False)) + + self._logger.debug("validating %s module...", module_name, extra=module_kwargs) + module = self.node_info.modules_available[module_name](**module_kwargs) + self._logger.debug("%s is ok", module_name) + + del module + gc.collect() + + def _reformat_search_space(self, module_search_space: dict[str, Any]) -> tuple[dict[str, Any], str]: + """Remove optuna notation from search space.""" + res = {} + module_name = module_search_space.pop("module_name") + + for param_name, param_space in module_search_space.items(): + if param_name == "n_trials": + continue + if isinstance(param_space, list): + res[param_name] = param_space + elif self._is_valid_param_space(param_space, ParamSpaceInt) or self._is_valid_param_space( + param_space, ParamSpaceFloat + ): + res[param_name] = [param_space["low"], param_space["high"]] + else: + msg = f"Unsupported type of param search space: {param_space}" + raise TypeError(msg) + + return res, module_name diff --git a/autointent/nodes/schemes.py b/autointent/nodes/schemes.py deleted file mode 100644 index 4e33e1314..000000000 --- a/autointent/nodes/schemes.py +++ /dev/null @@ -1,180 +0,0 @@ -"""Schemes.""" - -import inspect -from collections.abc import Iterator -from typing import Annotated, Any, Literal, TypeAlias, Union, get_args, get_origin, get_type_hints - -from pydantic import 
BaseModel, Field, PositiveInt, RootModel - -from autointent.custom_types import NodeType -from autointent.modules.abc import BaseModule -from autointent.nodes._optimization._node_optimizer import ParamSpaceFloat, ParamSpaceInt -from autointent.nodes.info import DecisionNodeInfo, EmbeddingNodeInfo, RegexNodeInfo, ScoringNodeInfo - - -def unwrap_annotated(tp: type) -> type: - """ - Unwrap the Annotated type to get the actual type. - - :param tp: Type to unwrap - :return: Unwrapped type - """ - return get_args(tp)[0] if get_origin(tp) is Annotated else tp - - -def type_matches(target: type, tp: type) -> bool: - """ - Recursively check if the target type is present in the given type. - - This function handles union types by unwrapping Annotated types where necessary. - - :param target: Target type - :param tp: Given type - :return: If the target type is present in the given type - """ - origin = get_origin(tp) - - if origin is Union: # float | list[float] - return any(type_matches(target, arg) for arg in get_args(tp)) - return unwrap_annotated(tp) is target - - -def get_optuna_class(param_type: type) -> type[ParamSpaceInt | ParamSpaceFloat] | None: - """ - Get the Optuna class for the given parameter type. - - If the (possibly annotated or union) type includes int or float, this function - returns the corresponding search space class. - - :param param_type: Parameter type (could be a union, annotated type, or container) - :return: ParamSpaceInt if the type matches int, ParamSpaceFloat if it matches float, else None. 
- """ - if type_matches(int, param_type): - return ParamSpaceInt - if type_matches(float, param_type): - return ParamSpaceFloat - return None - - -def generate_models_and_union_type_for_classes( - classes: list[type[BaseModule]], -) -> type[BaseModel]: - """Dynamically generates Pydantic models for class constructors and creates a union type.""" - models: dict[str, type[BaseModel]] = {} - - for cls in classes: - init_signature = inspect.signature(cls.from_context) - globalns = getattr(cls.from_context, "__globals__", {}) - type_hints = get_type_hints(cls.from_context, globalns, None, include_extras=True) # Resolve forward refs - - fields = { - "module_name": (Literal[cls.name], Field(...)), - "n_trials": (PositiveInt | None, Field(None, description="Number of trials")), - } - - for param_name, param in init_signature.parameters.items(): - if param_name in ("self", "cls", "context"): - continue - - param_type: TypeAlias = type_hints.get(param_name, Any) # type: ignore[valid-type] # noqa: PYI042 - field = Field(default=[param.default]) if param.default is not inspect.Parameter.empty else Field(...) 
- search_type = get_optuna_class(param_type) - if search_type is None: - fields[param_name] = (list[param_type], field) - else: - fields[param_name] = (list[param_type] | search_type, field) - - model_name = f"{cls.__name__}InitModel" - models[cls.__name__] = type( - model_name, - (BaseModel,), - { - "__annotations__": {k: v[0] for k, v in fields.items()}, - **{k: v[1] for k, v in fields.items()}, - }, - ) - - return Union[tuple(models.values())] # type: ignore[return-value] # noqa: UP007 - - -DecisionSearchSpaceType: TypeAlias = generate_models_and_union_type_for_classes( # type: ignore[valid-type] - list(DecisionNodeInfo.modules_available.values()) -) -DecisionMetrics: TypeAlias = Literal[tuple(DecisionNodeInfo.metrics_available.keys())] # type: ignore[valid-type] - - -class DecisionNodeValidator(BaseModel): - """Search space configuration for the Decision node.""" - - node_type: NodeType = NodeType.decision - target_metric: DecisionMetrics - metrics: list[DecisionMetrics] | None = None - search_space: list[DecisionSearchSpaceType] - - -EmbeddingSearchSpaceType: TypeAlias = generate_models_and_union_type_for_classes( # type: ignore[valid-type] - list(EmbeddingNodeInfo.modules_available.values()) -) -EmbeddingMetrics: TypeAlias = Literal[tuple(EmbeddingNodeInfo.metrics_available.keys())] # type: ignore[valid-type] - - -class EmbeddingNodeValidator(BaseModel): - """Search space configuration for the Embedding node.""" - - node_type: NodeType = NodeType.embedding - target_metric: EmbeddingMetrics - metrics: list[EmbeddingMetrics] | None = None - search_space: list[EmbeddingSearchSpaceType] - - -ScoringSearchSpaceType: TypeAlias = generate_models_and_union_type_for_classes( # type: ignore[valid-type] - list(ScoringNodeInfo.modules_available.values()) -) -ScoringMetrics: TypeAlias = Literal[tuple(ScoringNodeInfo.metrics_available.keys())] # type: ignore[valid-type] - - -class ScoringNodeValidator(BaseModel): - """Search space configuration for the Scoring node.""" - - 
node_type: NodeType = NodeType.scoring - target_metric: ScoringMetrics - metrics: list[ScoringMetrics] | None = None - search_space: list[ScoringSearchSpaceType] - - -RegexpSearchSpaceType: TypeAlias = generate_models_and_union_type_for_classes( # type: ignore[valid-type] - list(RegexNodeInfo.modules_available.values()) -) -RegexpMetrics: TypeAlias = Literal[tuple(RegexNodeInfo.metrics_available.keys())] # type: ignore[valid-type] - - -class RegexNodeValidator(BaseModel): - """Search space configuration for the Regexp node.""" - - node_type: NodeType = NodeType.regex - target_metric: RegexpMetrics - metrics: list[RegexpMetrics] | None = None - search_space: list[RegexpSearchSpaceType] - - -SearchSpaceTypes: TypeAlias = EmbeddingNodeValidator | ScoringNodeValidator | DecisionNodeValidator | RegexNodeValidator - - -class OptimizationSearchSpaceConfig(RootModel[list[SearchSpaceTypes]]): - """Optimizer configuration.""" - - def __iter__( - self, - ) -> Iterator[SearchSpaceTypes]: - """Iterate over the root.""" - return iter(self.root) - - def __getitem__(self, item: int) -> SearchSpaceTypes: - """ - To get item directly from the root. 
- - :param item: Index - - :return: Item - """ - return self.root[item] diff --git a/docs/optimizer_config.schema.json b/docs/optimizer_config.schema.json index e0cba086f..6018c52a4 100644 --- a/docs/optimizer_config.schema.json +++ b/docs/optimizer_config.schema.json @@ -1,83 +1,5 @@ { "$defs": { - "AdaptiveDecisionInitModel": { - "properties": { - "module_name": { - "const": "adaptive", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Number of trials", - "title": "N Trials" - }, - "search_space": { - "default": [ - null - ], - "items": { - "anyOf": [ - { - "items": { - "maximum": 1.0, - "minimum": 0.0, - "type": "number" - }, - "type": "array" - }, - { - "type": "null" - } - ] - }, - "title": "Search Space", - "type": "array" - } - }, - "required": [ - "module_name" - ], - "title": "AdaptiveDecisionInitModel", - "type": "object" - }, - "ArgmaxDecisionInitModel": { - "properties": { - "module_name": { - "const": "argmax", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Number of trials", - "title": "N Trials" - } - }, - "required": [ - "module_name" - ], - "title": "ArgmaxDecisionInitModel", - "type": "object" - }, "CrossEncoderConfig": { "properties": { "batch_size": { @@ -130,90 +52,6 @@ "title": "CrossEncoderConfig", "type": "object" }, - "DNNCScorerInitModel": { - "properties": { - "module_name": { - "const": "dnnc", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Number of trials", - "title": "N Trials" - }, - "k": { - "anyOf": [ - { - "items": { - "exclusiveMinimum": 0, - "type": "integer" - }, - "type": 
"array" - }, - { - "$ref": "#/$defs/ParamSpaceInt" - } - ], - "title": "K" - }, - "cross_encoder_config": { - "default": [ - null - ], - "items": { - "anyOf": [ - { - "$ref": "#/$defs/CrossEncoderConfig" - }, - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "title": "Cross Encoder Config", - "type": "array" - }, - "embedder_config": { - "default": [ - null - ], - "items": { - "anyOf": [ - { - "$ref": "#/$defs/EmbedderConfig" - }, - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "title": "Embedder Config", - "type": "array" - } - }, - "required": [ - "module_name", - "k" - ], - "title": "DNNCScorerInitModel", - "type": "object" - }, "DataConfig": { "description": "Configuration for the data used in the optimization process.", "properties": { @@ -261,141 +99,6 @@ "title": "DataConfig", "type": "object" }, - "DecisionNodeValidator": { - "description": "Search space configuration for the Decision node.", - "properties": { - "node_type": { - "$ref": "#/$defs/NodeType", - "default": "decision" - }, - "target_metric": { - "enum": [ - "decision_accuracy", - "decision_f1", - "decision_precision", - "decision_recall", - "decision_roc_auc" - ], - "title": "Target Metric", - "type": "string" - }, - "metrics": { - "anyOf": [ - { - "items": { - "enum": [ - "decision_accuracy", - "decision_f1", - "decision_precision", - "decision_recall", - "decision_roc_auc" - ], - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Metrics" - }, - "search_space": { - "items": { - "anyOf": [ - { - "$ref": "#/$defs/ArgmaxDecisionInitModel" - }, - { - "$ref": "#/$defs/JinoosDecisionInitModel" - }, - { - "$ref": "#/$defs/ThresholdDecisionInitModel" - }, - { - "$ref": "#/$defs/TunableDecisionInitModel" - }, - { - "$ref": "#/$defs/AdaptiveDecisionInitModel" - } - ] - }, - "title": "Search Space", - "type": "array" - } - }, - "required": [ - "target_metric", - "search_space" - ], - "title": "DecisionNodeValidator", - 
"type": "object" - }, - "DescriptionScorerInitModel": { - "properties": { - "module_name": { - "const": "description", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Number of trials", - "title": "N Trials" - }, - "temperature": { - "anyOf": [ - { - "items": { - "exclusiveMinimum": 0.0, - "type": "number" - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceFloat" - } - ], - "title": "Temperature" - }, - "embedder_config": { - "default": [ - null - ], - "items": { - "anyOf": [ - { - "$ref": "#/$defs/EmbedderConfig" - }, - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "title": "Embedder Config", - "type": "array" - } - }, - "required": [ - "module_name", - "temperature" - ], - "title": "DescriptionScorerInitModel", - "type": "object" - }, "EmbedderConfig": { "properties": { "batch_size": { @@ -526,285 +229,6 @@ "title": "EmbedderConfig", "type": "object" }, - "EmbeddingNodeValidator": { - "description": "Search space configuration for the Embedding node.", - "properties": { - "node_type": { - "$ref": "#/$defs/NodeType", - "default": "embedding" - }, - "target_metric": { - "enum": [ - "retrieval_hit_rate", - "retrieval_map", - "retrieval_mrr", - "retrieval_ndcg", - "retrieval_precision", - "retrieval_hit_rate_intersecting", - "retrieval_hit_rate_macro", - "retrieval_map_intersecting", - "retrieval_map_macro", - "retrieval_mrr_intersecting", - "retrieval_mrr_macro", - "retrieval_ndcg_intersecting", - "retrieval_ndcg_macro", - "retrieval_precision_intersecting", - "retrieval_precision_macro", - "scoring_accuracy", - "scoring_f1", - "scoring_log_likelihood", - "scoring_precision", - "scoring_recall", - "scoring_roc_auc", - "scoring_hit_rate", - "scoring_map", - "scoring_neg_coverage", - "scoring_neg_ranking_loss" - ], - "title": "Target Metric", - "type": "string" - }, - "metrics": { - "anyOf": [ 
- { - "items": { - "enum": [ - "retrieval_hit_rate", - "retrieval_map", - "retrieval_mrr", - "retrieval_ndcg", - "retrieval_precision", - "retrieval_hit_rate_intersecting", - "retrieval_hit_rate_macro", - "retrieval_map_intersecting", - "retrieval_map_macro", - "retrieval_mrr_intersecting", - "retrieval_mrr_macro", - "retrieval_ndcg_intersecting", - "retrieval_ndcg_macro", - "retrieval_precision_intersecting", - "retrieval_precision_macro", - "scoring_accuracy", - "scoring_f1", - "scoring_log_likelihood", - "scoring_precision", - "scoring_recall", - "scoring_roc_auc", - "scoring_hit_rate", - "scoring_map", - "scoring_neg_coverage", - "scoring_neg_ranking_loss" - ], - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Metrics" - }, - "search_space": { - "items": { - "anyOf": [ - { - "$ref": "#/$defs/RetrievalAimedEmbeddingInitModel" - }, - { - "$ref": "#/$defs/LogregAimedEmbeddingInitModel" - } - ] - }, - "title": "Search Space", - "type": "array" - } - }, - "required": [ - "target_metric", - "search_space" - ], - "title": "EmbeddingNodeValidator", - "type": "object" - }, - "JinoosDecisionInitModel": { - "properties": { - "module_name": { - "const": "jinoos", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Number of trials", - "title": "N Trials" - }, - "search_space": { - "default": [ - null - ], - "items": { - "anyOf": [ - { - "items": { - "maximum": 1.0, - "minimum": 0.0, - "type": "number" - }, - "type": "array" - }, - { - "type": "null" - } - ] - }, - "title": "Search Space", - "type": "array" - } - }, - "required": [ - "module_name" - ], - "title": "JinoosDecisionInitModel", - "type": "object" - }, - "KNNScorerInitModel": { - "properties": { - "module_name": { - "const": "knn", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - 
{ - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Number of trials", - "title": "N Trials" - }, - "k": { - "anyOf": [ - { - "items": { - "exclusiveMinimum": 0, - "type": "integer" - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceInt" - } - ], - "title": "K" - }, - "weights": { - "default": [ - "distance" - ], - "items": { - "enum": [ - "uniform", - "distance", - "closest" - ], - "type": "string" - }, - "title": "Weights", - "type": "array" - }, - "embedder_config": { - "default": [ - null - ], - "items": { - "anyOf": [ - { - "$ref": "#/$defs/EmbedderConfig" - }, - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "title": "Embedder Config", - "type": "array" - } - }, - "required": [ - "module_name", - "k" - ], - "title": "KNNScorerInitModel", - "type": "object" - }, - "LinearScorerInitModel": { - "properties": { - "module_name": { - "const": "linear", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Number of trials", - "title": "N Trials" - }, - "embedder_config": { - "default": [ - null - ], - "items": { - "anyOf": [ - { - "$ref": "#/$defs/EmbedderConfig" - }, - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "title": "Embedder Config", - "type": "array" - } - }, - "required": [ - "module_name" - ], - "title": "LinearScorerInitModel", - "type": "object" - }, "LoggingConfig": { "description": "Configuration for the logging.", "properties": { @@ -873,836 +297,6 @@ }, "title": "LoggingConfig", "type": "object" - }, - "LogregAimedEmbeddingInitModel": { - "properties": { - "module_name": { - "const": "logreg_embedding", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": 
"Number of trials", - "title": "N Trials" - }, - "embedder_config": { - "items": { - "anyOf": [ - { - "$ref": "#/$defs/EmbedderConfig" - }, - { - "type": "string" - } - ] - }, - "title": "Embedder Config", - "type": "array" - }, - "cv": { - "anyOf": [ - { - "items": { - "exclusiveMinimum": 0, - "type": "integer" - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceInt" - } - ], - "default": [ - 3 - ], - "title": "Cv" - } - }, - "required": [ - "module_name", - "embedder_config" - ], - "title": "LogregAimedEmbeddingInitModel", - "type": "object" - }, - "MLKnnScorerInitModel": { - "properties": { - "module_name": { - "const": "mlknn", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Number of trials", - "title": "N Trials" - }, - "k": { - "anyOf": [ - { - "items": { - "exclusiveMinimum": 0, - "type": "integer" - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceInt" - } - ], - "title": "K" - }, - "s": { - "anyOf": [ - { - "items": { - "exclusiveMinimum": 0.0, - "type": "number" - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceFloat" - } - ], - "default": [ - 1.0 - ], - "title": "S" - }, - "ignore_first_neighbours": { - "anyOf": [ - { - "items": { - "minimum": 0, - "type": "integer" - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceInt" - } - ], - "default": [ - 0 - ], - "title": "Ignore First Neighbours" - }, - "embedder_config": { - "default": [ - null - ], - "items": { - "anyOf": [ - { - "$ref": "#/$defs/EmbedderConfig" - }, - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "title": "Embedder Config", - "type": "array" - } - }, - "required": [ - "module_name", - "k" - ], - "title": "MLKnnScorerInitModel", - "type": "object" - }, - "NodeType": { - "description": "Enumeration of node types in the AutoIntent pipeline.", - "enum": [ - "regex", - "embedding", - 
"scoring", - "decision" - ], - "title": "NodeType", - "type": "string" - }, - "OptimizationSearchSpaceConfig": { - "description": "Optimizer configuration.", - "items": { - "anyOf": [ - { - "$ref": "#/$defs/EmbeddingNodeValidator" - }, - { - "$ref": "#/$defs/ScoringNodeValidator" - }, - { - "$ref": "#/$defs/DecisionNodeValidator" - }, - { - "$ref": "#/$defs/RegexNodeValidator" - } - ] - }, - "title": "OptimizationSearchSpaceConfig", - "type": "array" - }, - "ParamSpaceFloat": { - "properties": { - "low": { - "description": "Low boundary of the search space.", - "title": "Low", - "type": "number" - }, - "high": { - "description": "High boundary of the search space.", - "title": "High", - "type": "number" - }, - "step": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Step of the search space.", - "title": "Step" - }, - "log": { - "default": false, - "description": "Whether to use a logarithmic scale.", - "title": "Log", - "type": "boolean" - } - }, - "required": [ - "low", - "high" - ], - "title": "ParamSpaceFloat", - "type": "object" - }, - "ParamSpaceInt": { - "properties": { - "low": { - "description": "Low boundary of the search space.", - "title": "Low", - "type": "integer" - }, - "high": { - "description": "High boundary of the search space.", - "title": "High", - "type": "integer" - }, - "step": { - "default": 1, - "description": "Step of the search space.", - "title": "Step", - "type": "integer" - }, - "log": { - "default": false, - "description": "Whether to use a logarithmic scale.", - "title": "Log", - "type": "boolean" - } - }, - "required": [ - "low", - "high" - ], - "title": "ParamSpaceInt", - "type": "object" - }, - "RegexInitModel": { - "properties": { - "module_name": { - "const": "regex", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": 
"Number of trials", - "title": "N Trials" - } - }, - "required": [ - "module_name" - ], - "title": "RegexInitModel", - "type": "object" - }, - "RegexNodeValidator": { - "description": "Search space configuration for the Regexp node.", - "properties": { - "node_type": { - "$ref": "#/$defs/NodeType", - "default": "regex" - }, - "target_metric": { - "enum": [ - "regex_partial_accuracy", - "regex_partial_precision" - ], - "title": "Target Metric", - "type": "string" - }, - "metrics": { - "anyOf": [ - { - "items": { - "enum": [ - "regex_partial_accuracy", - "regex_partial_precision" - ], - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Metrics" - }, - "search_space": { - "items": { - "$ref": "#/$defs/RegexInitModel" - }, - "title": "Search Space", - "type": "array" - } - }, - "required": [ - "target_metric", - "search_space" - ], - "title": "RegexNodeValidator", - "type": "object" - }, - "RerankScorerInitModel": { - "properties": { - "module_name": { - "const": "rerank", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Number of trials", - "title": "N Trials" - }, - "k": { - "anyOf": [ - { - "items": { - "type": "integer" - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceInt" - } - ], - "title": "K" - }, - "weights": { - "default": [ - "distance" - ], - "items": { - "enum": [ - "uniform", - "distance", - "closest" - ], - "type": "string" - }, - "title": "Weights", - "type": "array" - }, - "m": { - "anyOf": [ - { - "items": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceInt" - } - ], - "default": [ - null - ], - "title": "M" - }, - "cross_encoder_config": { - "default": [ - null - ], - "items": { - "anyOf": [ - { - "$ref": 
"#/$defs/CrossEncoderConfig" - }, - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "title": "Cross Encoder Config", - "type": "array" - }, - "embedder_config": { - "default": [ - null - ], - "items": { - "anyOf": [ - { - "$ref": "#/$defs/EmbedderConfig" - }, - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "title": "Embedder Config", - "type": "array" - }, - "rank_threshold_cutoff": { - "anyOf": [ - { - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ] - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceInt" - } - ], - "default": [ - null - ], - "title": "Rank Threshold Cutoff" - } - }, - "required": [ - "module_name", - "k" - ], - "title": "RerankScorerInitModel", - "type": "object" - }, - "RetrievalAimedEmbeddingInitModel": { - "properties": { - "module_name": { - "const": "retrieval", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Number of trials", - "title": "N Trials" - }, - "k": { - "anyOf": [ - { - "items": { - "exclusiveMinimum": 0, - "type": "integer" - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceInt" - } - ], - "title": "K" - }, - "embedder_config": { - "items": { - "anyOf": [ - { - "$ref": "#/$defs/EmbedderConfig" - }, - { - "type": "string" - } - ] - }, - "title": "Embedder Config", - "type": "array" - } - }, - "required": [ - "module_name", - "k", - "embedder_config" - ], - "title": "RetrievalAimedEmbeddingInitModel", - "type": "object" - }, - "ScoringNodeValidator": { - "description": "Search space configuration for the Scoring node.", - "properties": { - "node_type": { - "$ref": "#/$defs/NodeType", - "default": "scoring" - }, - "target_metric": { - "enum": [ - "scoring_accuracy", - "scoring_f1", - "scoring_log_likelihood", - "scoring_precision", - "scoring_recall", - "scoring_roc_auc", - "scoring_hit_rate", - 
"scoring_map", - "scoring_neg_coverage", - "scoring_neg_ranking_loss" - ], - "title": "Target Metric", - "type": "string" - }, - "metrics": { - "anyOf": [ - { - "items": { - "enum": [ - "scoring_accuracy", - "scoring_f1", - "scoring_log_likelihood", - "scoring_precision", - "scoring_recall", - "scoring_roc_auc", - "scoring_hit_rate", - "scoring_map", - "scoring_neg_coverage", - "scoring_neg_ranking_loss" - ], - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Metrics" - }, - "search_space": { - "items": { - "anyOf": [ - { - "$ref": "#/$defs/DNNCScorerInitModel" - }, - { - "$ref": "#/$defs/KNNScorerInitModel" - }, - { - "$ref": "#/$defs/LinearScorerInitModel" - }, - { - "$ref": "#/$defs/DescriptionScorerInitModel" - }, - { - "$ref": "#/$defs/RerankScorerInitModel" - }, - { - "$ref": "#/$defs/SklearnScorerInitModel" - }, - { - "$ref": "#/$defs/MLKnnScorerInitModel" - } - ] - }, - "title": "Search Space", - "type": "array" - } - }, - "required": [ - "target_metric", - "search_space" - ], - "title": "ScoringNodeValidator", - "type": "object" - }, - "SklearnScorerInitModel": { - "properties": { - "module_name": { - "const": "sklearn", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Number of trials", - "title": "N Trials" - }, - "clf_name": { - "items": { - "type": "string" - }, - "title": "Clf Name", - "type": "array" - }, - "embedder_config": { - "default": [ - null - ], - "items": { - "anyOf": [ - { - "$ref": "#/$defs/EmbedderConfig" - }, - { - "type": "string" - }, - { - "type": "null" - } - ] - }, - "title": "Embedder Config", - "type": "array" - }, - "clf_args": { - "items": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "boolean" - } - ] - }, - "title": "Clf Args", - "type": "array" - } - }, - "required": [ - 
"module_name", - "clf_name", - "clf_args" - ], - "title": "SklearnScorerInitModel", - "type": "object" - }, - "ThresholdDecisionInitModel": { - "properties": { - "module_name": { - "const": "threshold", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Number of trials", - "title": "N Trials" - }, - "thresh": { - "anyOf": [ - { - "items": { - "anyOf": [ - { - "maximum": 1.0, - "minimum": 0.0, - "type": "number" - }, - { - "items": { - "maximum": 1.0, - "minimum": 0.0, - "type": "number" - }, - "type": "array" - } - ] - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceFloat" - } - ], - "default": [ - 0.5 - ], - "title": "Thresh" - } - }, - "required": [ - "module_name" - ], - "title": "ThresholdDecisionInitModel", - "type": "object" - }, - "TunableDecisionInitModel": { - "properties": { - "module_name": { - "const": "tunable", - "title": "Module Name", - "type": "string" - }, - "n_trials": { - "anyOf": [ - { - "exclusiveMinimum": 0, - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Number of trials", - "title": "N Trials" - }, - "target_metric": { - "default": [ - "decision_accuracy" - ], - "items": { - "enum": [ - "decision_accuracy", - "decision_f1", - "decision_roc_auc", - "decision_precision", - "decision_recall" - ], - "type": "string" - }, - "title": "Target Metric", - "type": "array" - }, - "n_optuna_trials": { - "anyOf": [ - { - "items": { - "exclusiveMinimum": 0, - "type": "integer" - }, - "type": "array" - }, - { - "$ref": "#/$defs/ParamSpaceInt" - } - ], - "default": [ - 320 - ], - "title": "N Optuna Trials" - } - }, - "required": [ - "module_name" - ], - "title": "TunableDecisionInitModel", - "type": "object" } }, "description": "Configuration for the optimization process.", @@ -1717,7 +311,11 @@ } }, "search_space": { - "$ref": 
"#/$defs/OptimizationSearchSpaceConfig" + "items": { + "type": "object" + }, + "title": "Search Space", + "type": "array" }, "logging_config": { "$ref": "#/$defs/LoggingConfig", diff --git a/scripts/generate_json_schema_config.py b/scripts/generate_json_schema_config.py index a5e2615ea..c6ddb0736 100644 --- a/scripts/generate_json_schema_config.py +++ b/scripts/generate_json_schema_config.py @@ -1,18 +1,9 @@ import json from pathlib import Path -from autointent.nodes.schemes import OptimizationSearchSpaceConfig from autointent import OptimizationConfig -def generate_json_schema_search_space_config() -> None: - """Generate the JSON schema for the optimizer config.""" - schema = OptimizationSearchSpaceConfig.model_json_schema() - path = Path(__file__).parent.parent / "docs" / "optimizer_search_space_config.schema.json" - with path.open("w") as f: - json.dump(schema, f, indent=4) - - def generate_json_schema_optimizer_config() -> None: """Generate the JSON schema for the optimizer config.""" schema = OptimizationConfig.model_json_schema() @@ -22,5 +13,4 @@ def generate_json_schema_optimizer_config() -> None: if __name__ == "__main__": - generate_json_schema_search_space_config() generate_json_schema_optimizer_config() diff --git a/tests/configs/test_combined_config.py b/tests/configs/test_combined_config.py index bef606f48..41dc5bc7a 100644 --- a/tests/configs/test_combined_config.py +++ b/tests/configs/test_combined_config.py @@ -1,10 +1,9 @@ +from typing import get_args + import pytest -from pydantic import ValidationError -from autointent.nodes.schemes import ( - OptimizationSearchSpaceConfig, -) -from tests.conftest import get_search_space +from autointent.nodes import NodeOptimizer +from tests.conftest import TaskType, get_search_space @pytest.fixture @@ -44,19 +43,17 @@ def valid_optimizer_config(): def test_valid_optimizer_config(valid_optimizer_config): """Test that a valid optimizer config passes validation.""" - config = 
OptimizationSearchSpaceConfig(valid_optimizer_config) - assert config[0].node_type == "scoring" - assert config[1].node_type == "embedding" + for node_dict_config in valid_optimizer_config: + NodeOptimizer(**node_dict_config) @pytest.mark.parametrize( "task_type", - ["multiclass", "multilabel", "description"], + get_args(TaskType), ) def test_optimizer_config(task_type): - search_space = get_search_space(task_type) - config = OptimizationSearchSpaceConfig(search_space) - assert config + for node_dict_config in get_search_space(task_type): + NodeOptimizer(**node_dict_config) def test_invalid_optimizer_config_missing_field(): @@ -71,8 +68,8 @@ def test_invalid_optimizer_config_missing_field(): } ] - with pytest.raises(ValidationError): - OptimizationSearchSpaceConfig(invalid_config) + with pytest.raises(TypeError): + NodeOptimizer(**invalid_config) def test_invalid_optimizer_config_wrong_type(): @@ -92,5 +89,5 @@ def test_invalid_optimizer_config_wrong_type(): } ] - with pytest.raises(ValidationError): - OptimizationSearchSpaceConfig(invalid_config) + with pytest.raises(TypeError): + NodeOptimizer(**invalid_config) diff --git a/tests/configs/test_decision.py b/tests/configs/test_decision.py index 79a60fb33..6174d2cef 100644 --- a/tests/configs/test_decision.py +++ b/tests/configs/test_decision.py @@ -1,71 +1,64 @@ import pytest -from pydantic import ValidationError -from autointent.nodes.schemes import OptimizationSearchSpaceConfig +from autointent.nodes import NodeOptimizer @pytest.fixture def valid_decision_config(): """Fixture for a valid DecisionNode configuration.""" - return [ - { - "node_type": "decision", - "target_metric": "decision_roc_auc", - "search_space": [ - {"module_name": "argmax"}, - {"module_name": "jinoos", "search_space": [[0.3, 0.5, 0.7]]}, - {"module_name": "threshold", "thresh": [[0.5, 0.6]]}, - { - "module_name": "tunable", - "n_optuna_trials": [100], - }, - {"module_name": "adaptive", "search_space": [[0.5]]}, - ], - } - ] + return { + 
"node_type": "decision", + "target_metric": "decision_roc_auc", + "search_space": [ + {"module_name": "argmax"}, + {"module_name": "jinoos", "search_space": [[0.3, 0.5, 0.7]]}, + {"module_name": "threshold", "thresh": [[0.5, 0.6]]}, + { + "module_name": "tunable", + "n_optuna_trials": [100], + }, + {"module_name": "adaptive", "search_space": [[0.5]]}, + ], + } def test_valid_decision_config(valid_decision_config): """Test that a valid decision config passes validation.""" - config = OptimizationSearchSpaceConfig(valid_decision_config) - assert config[0].node_type == "decision" - assert config[0].target_metric == "decision_roc_auc" - assert isinstance(config[0].search_space, list) - assert config[0].search_space[0].module_name == "argmax" + node = NodeOptimizer(**valid_decision_config) + assert node.node_type == "decision" + assert node.target_metric == "decision_roc_auc" + assert isinstance(node.modules_search_spaces, list) + assert node.modules_search_spaces[0]["module_name"] == "argmax" def test_invalid_decision_config_missing_field(): """Test that a missing required field raises ValidationError.""" - invalid_config = [ - { - "node_type": "decision", - # Missing "target_metric" - "search_space": [{"module_name": "tunable", "n_optuna_trials": [100]}], - } - ] + invalid_config = { + "node_type": "decision", + # Missing "target_metric" + "search_space": [{"module_name": "tunable", "n_optuna_trials": [100]}], + } - with pytest.raises(ValidationError): - OptimizationSearchSpaceConfig(invalid_config) + with pytest.raises(TypeError): + NodeOptimizer(**invalid_config) def test_invalid_decision_config_wrong_type(): """Test that an invalid field type raises ValidationError.""" - invalid_config = [ - { - "node_type": "decision", - "target_metric": "decision_roc_auc", - "search_space": [ - { - "module_name": "threshold", - "thresh": ["wrong_type"], # Should be a list of floats or a single float - }, - { - "module_name": "tunable", - "n_optuna_trials": ["not_an_int"], # 
Should be an integer - }, - ], - } - ] + invalid_config = { + "node_type": "decision", + "target_metric": "decision_roc_auc", + "search_space": [ + { + "module_name": "threshold", + "thresh": ["wrong_type"], # Should be a list of floats or a single float + }, + { + "module_name": "tunable", + "n_optuna_trials": ["not_an_int"], # Should be an integer + }, + ], + } - with pytest.raises(ValidationError): - OptimizationSearchSpaceConfig(invalid_config) + with pytest.raises(TypeError): + NodeOptimizer(**invalid_config) diff --git a/tests/configs/test_embedding.py b/tests/configs/test_embedding.py index 868ba0901..5a352ae6f 100644 --- a/tests/configs/test_embedding.py +++ b/tests/configs/test_embedding.py @@ -1,73 +1,66 @@ import pytest -from pydantic import ValidationError -from autointent.nodes import OptimizationSearchSpaceConfig +from autointent.nodes import NodeOptimizer @pytest.fixture def valid_embedding_config(): """Fixture for a valid EmbeddingNode configuration.""" - return [ - { - "node_type": "embedding", - "target_metric": "retrieval_mrr", - "search_space": [ - {"module_name": "logreg_embedding", "embedder_config": ["sergeyzh/rubert-tiny-turbo"], "cv": [3, 5]}, - { - "module_name": "retrieval", - "embedder_config": ["sentence-transformers/all-MiniLM-L6-v2"], - "k": [5, 10], - }, - ], - } - ] + return { + "node_type": "embedding", + "target_metric": "retrieval_mrr", + "search_space": [ + {"module_name": "logreg_embedding", "embedder_config": ["sergeyzh/rubert-tiny-turbo"], "cv": [3, 5]}, + { + "module_name": "retrieval", + "embedder_config": ["sentence-transformers/all-MiniLM-L6-v2"], + "k": [5, 10], + }, + ], + } def test_valid_embedding_config(valid_embedding_config): """Test that a valid embedding config passes validation.""" - config = OptimizationSearchSpaceConfig(valid_embedding_config) - assert config[0].node_type == "embedding" - assert config[0].target_metric == "retrieval_mrr" - assert isinstance(config[0].search_space, list) - assert 
config[0].search_space[0].module_name == "logreg_embedding" - assert "embedder_config" in config[0].search_space[0].model_dump() + node = NodeOptimizer(**valid_embedding_config) + assert node.node_type == "embedding" + assert node.target_metric == "retrieval_mrr" + assert isinstance(node.modules_search_spaces, list) + assert node.modules_search_spaces[0]["module_name"] == "logreg_embedding" + assert "embedder_config" in node.modules_search_spaces[0] def test_invalid_embedding_config_missing_field(): """Test that a missing required field raises ValidationError.""" - invalid_config = [ - { - "node_type": "embedding", - # Missing "target_metric" - "search_space": [ - { - "module_name": "retrieval", - "embedder_config": ["sentence-transformers/all-MiniLM-L6-v2"], - "k": [5, 10], - } - ], - } - ] + invalid_config = { + "node_type": "embedding", + # Missing "target_metric" + "search_space": [ + { + "module_name": "retrieval", + "embedder_config": ["sentence-transformers/all-MiniLM-L6-v2"], + "k": [5, 10], + } + ], + } - with pytest.raises(ValidationError): - OptimizationSearchSpaceConfig(invalid_config) + with pytest.raises(TypeError): + NodeOptimizer(**invalid_config) def test_invalid_embedding_config_wrong_type(): """Test that an invalid field type raises ValidationError.""" - invalid_config = [ - { - "node_type": "embedding", - "target_metric": "retrieval_mrr", - "search_space": [ - { - "module_name": "logreg_embedding", - "embedder_config": "not_a_list", # Should be a list of strings - "cv": ["wrong_type"], # Should be a list of integers - } - ], - } - ] + invalid_config = { + "node_type": "embedding", + "target_metric": "retrieval_mrr", + "search_space": [ + { + "module_name": "logreg_embedding", + "embedder_config": "not_a_list", # Should be a list of strings + "cv": ["wrong_type"], # Should be a list of integers + } + ], + } - with pytest.raises(ValidationError): - OptimizationSearchSpaceConfig(invalid_config) + with pytest.raises(TypeError): + 
NodeOptimizer(**invalid_config) diff --git a/tests/configs/test_scoring.py b/tests/configs/test_scoring.py index e95d32be7..dc696069d 100644 --- a/tests/configs/test_scoring.py +++ b/tests/configs/test_scoring.py @@ -1,69 +1,68 @@ import pytest -from pydantic import ValidationError -from autointent.nodes import OptimizationSearchSpaceConfig +from autointent.nodes import NodeOptimizer @pytest.fixture def valid_scoring_config(): """Fixture for a valid ScoringNode configuration.""" - return [ - { - "node_type": "scoring", - "target_metric": "scoring_roc_auc", - "search_space": [ - { - "module_name": "dnnc", - "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L-6-v2"], - "embedder_config": ["sergeyzh/rubert-tiny-turbo"], - "k": [5, 10], - "train_head": [False, True], - }, - { - "module_name": "knn", - "embedder_config": ["sentence-transformers/all-MiniLM-L6-v2"], - "k": [5, 10], - "weights": ["uniform", "distance"], - }, - {"module_name": "linear", "embedder_config": ["sergeyzh/rubert-tiny-turbo"], "cv": [3, 5]}, - { - "module_name": "mlknn", - "embedder_config": ["sergeyzh/rubert-tiny-turbo"], - "k": [5, 10], - "s": [1.0, 0.5], - "ignore_first_neighbours": [0, 1], - }, - { - "module_name": "description", - "embedder_config": ["sentence-transformers/all-MiniLM-L6-v2"], - "temperature": [0.5, 1.0], - }, - { - "module_name": "rerank", - "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L-6-v2"], - "embedder_config": ["sergeyzh/rubert-tiny-turbo"], - "k": [5], - "weights": ["distance"], - "rank_threshold_cutoff": [None, 3], - }, - # { - # "module_name": "sklearn", - # "embedder_config": ["sentence-transformers/all-MiniLM-L6-v2"], - # "clf_name": ["LogisticRegression"], - # "clf_args": [{"C": 1.0}, {"C": 0.5}], - # }, - ], - } - ] + return { + "node_type": "scoring", + "target_metric": "scoring_roc_auc", + "search_space": [ + { + "module_name": "dnnc", + "cross_encoder_config": [ + "cross-encoder/ms-marco-MiniLM-L-6-v2", + {"model_name": 
"cross-encoder/ms-marco-MiniLM-L-6-v2", "train_head": True}, + ], + "embedder_config": ["sergeyzh/rubert-tiny-turbo"], + "k": [5, 10], + }, + { + "module_name": "knn", + "embedder_config": ["sentence-transformers/all-MiniLM-L6-v2"], + "k": [5, 10], + "weights": ["uniform", "distance"], + }, + {"module_name": "linear", "embedder_config": ["sergeyzh/rubert-tiny-turbo"], "cv": [3, 5]}, + { + "module_name": "mlknn", + "embedder_config": ["sergeyzh/rubert-tiny-turbo"], + "k": [5, 10], + "s": [1.0, 0.5], + "ignore_first_neighbours": [0, 1], + }, + { + "module_name": "description", + "embedder_config": ["sentence-transformers/all-MiniLM-L6-v2"], + "temperature": [0.5, 1.0], + }, + { + "module_name": "rerank", + "cross_encoder_config": ["cross-encoder/ms-marco-MiniLM-L-6-v2"], + "embedder_config": ["sergeyzh/rubert-tiny-turbo"], + "k": [5], + "weights": ["distance"], + "rank_threshold_cutoff": [None, 3], + }, + { + "module_name": "sklearn", + "embedder_config": ["sentence-transformers/all-MiniLM-L6-v2"], + "clf_name": ["LogisticRegression"], + "C": [0.2, 0.3], + }, + ], + } def test_valid_scoring_config(valid_scoring_config): """Test that a valid scoring config passes validation.""" - config = OptimizationSearchSpaceConfig(valid_scoring_config) - assert config[0].node_type == "scoring" - assert config[0].target_metric == "scoring_roc_auc" - assert isinstance(config[0].search_space, list) - assert config[0].search_space[0].module_name == "dnnc" + node = NodeOptimizer(**valid_scoring_config) + assert node.node_type == "scoring" + assert node.target_metric == "scoring_roc_auc" + assert isinstance(node.modules_search_spaces, list) + assert node.modules_search_spaces[0]["module_name"] == "dnnc" def test_invalid_scoring_config_missing_field(): @@ -76,8 +75,8 @@ def test_invalid_scoring_config_missing_field(): ], } - with pytest.raises(ValidationError): - OptimizationSearchSpaceConfig(invalid_config) + with pytest.raises(TypeError): + NodeOptimizer(*invalid_config) def 
test_invalid_scoring_config_wrong_type(): @@ -95,5 +94,5 @@ def test_invalid_scoring_config_wrong_type(): ], } - with pytest.raises(ValidationError): - OptimizationSearchSpaceConfig(invalid_config) + with pytest.raises(TypeError): + NodeOptimizer(**invalid_config) diff --git a/tests/pipeline/test_validation.py b/tests/pipeline/test_validation.py index 31a94bb7d..1650cd4b4 100644 --- a/tests/pipeline/test_validation.py +++ b/tests/pipeline/test_validation.py @@ -1,10 +1,6 @@ -from typing import get_args - import pytest from autointent import Pipeline -from autointent.nodes.schemes import OptimizationSearchSpaceConfig -from tests.conftest import TaskType, get_search_space def test_validate_search_space_multiclass(dataset): @@ -34,10 +30,3 @@ def test_validate_search_space_multilabel(dataset): pipeline_optimizer = Pipeline.from_search_space(search_space) with pytest.raises(ValueError, match="Module 'argmax' does not support multilabel datasets."): pipeline_optimizer.validate_modules(dataset, mode="raise") - - -# for now validation for sklearn scorer doesn't work -@pytest.mark.xfail -@pytest.mark.parametrize("search_space", get_args(TaskType)) -def test_search_space(search_space): - OptimizationSearchSpaceConfig(get_search_space(search_space)) diff --git a/user_guides/basic_usage/03_automl.py b/user_guides/basic_usage/03_automl.py index 2512dcf91..18734d7ef 100644 --- a/user_guides/basic_usage/03_automl.py +++ b/user_guides/basic_usage/03_automl.py @@ -52,7 +52,7 @@ """ # %% -preset["search_space"][1]["search_space"][0]["k"] = [1, 3] +preset["search_space"][0]["search_space"][0]["k"] = [1, 3] custom_pipeline = Pipeline.from_optimization_config(preset) # %% [markdown]