diff --git a/autointent/_pipeline/_pipeline.py b/autointent/_pipeline/_pipeline.py index fc00981ac..0549ecb05 100644 --- a/autointent/_pipeline/_pipeline.py +++ b/autointent/_pipeline/_pipeline.py @@ -60,7 +60,7 @@ def __init__( self.sampler = sampler if isinstance(nodes[0], NodeOptimizer): - self.logging_config = LoggingConfig(dump_dir=None) + self.logging_config = LoggingConfig() self.embedder_config = EmbedderConfig() self.cross_encoder_config = CrossEncoderConfig() self.data_config = DataConfig() diff --git a/autointent/_ranker.py b/autointent/_ranker.py index 9bb96f627..b2f41fd4c 100644 --- a/autointent/_ranker.py +++ b/autointent/_ranker.py @@ -31,14 +31,14 @@ class CrossEncoderMetadata(TypedDict): Attributes: model_name: Name of the model - train_classifier: Whether to train a classifier + train_head: Whether to train a classifier device: Device to use for inference max_length: Maximum sequence length batch_size: Batch size for inference """ model_name: str - train_classifier: bool + train_head: bool device: str | None max_length: int | None batch_size: int @@ -119,11 +119,11 @@ def __init__( device=self.cross_encoder_config.device, max_length=self.cross_encoder_config.max_length, # type: ignore[arg-type] ) - self.train_classifier = False + self.train_head = False self._clf = classifier_head if classifier_head is not None or self.cross_encoder_config.train_head: - self.train_classifier = True + self.train_head = True self._activations_list: list[npt.NDArray[Any]] = [] self._hook_handler = self.cross_encoder.model.classifier.register_forward_hook(self._classifier_hook) @@ -147,7 +147,7 @@ def _get_features_or_predictions(self, pairs: list[tuple[str, str]]) -> npt.NDAr Returns: Array of extracted features or predictions """ - if not self.train_classifier: + if not self.train_head: return np.array( self.cross_encoder.predict( pairs, @@ -189,7 +189,7 @@ def fit(self, utterances: list[str], labels: ListOfLabels) -> None: utterances: List of utterances (texts) labels: Intent class labels corresponding to the utterances """ - if not self.train_classifier: + if not self.train_head: return pairs, labels_ = construct_samples(utterances, labels, balancing_factor=1) @@ -207,7 +207,7 @@ def predict(self, pairs: list[tuple[str, str]]) -> npt.NDArray[Any]: Raises: ValueError: If classifier is not trained yet """ - if self.train_classifier and self._clf is None: + if self.train_head and self._clf is None: msg = "Classifier is not trained yet" raise ValueError(msg) @@ -254,7 +254,7 @@ def save(self, path: str) -> None: metadata = CrossEncoderMetadata( model_name=self.cross_encoder_config.model_name, - train_classifier=self.train_classifier, + train_head=self.train_head, device=self.cross_encoder_config.device, max_length=self.cross_encoder_config.max_length, batch_size=self.cross_encoder_config.batch_size, diff --git a/autointent/configs/_optimization.py b/autointent/configs/_optimization.py index 146813e4f..b70fd8895 100644 --- a/autointent/configs/_optimization.py +++ b/autointent/configs/_optimization.py @@ -2,7 +2,7 @@ from pathlib import Path -from pydantic import BaseModel, Field, PositiveInt +from pydantic import BaseModel, ConfigDict, Field, PositiveInt from autointent._callbacks import REPORTERS_NAMES from autointent.custom_types import FloatFromZeroToOne, ValidationScheme @@ -13,6 +13,7 @@ class DataConfig(BaseModel): """Configuration for the data used in the optimization process.""" + model_config = ConfigDict(extra="forbid") scheme: ValidationScheme = Field("ho", description="Validation scheme to use.") """Hold-out or cross-validation.""" n_folds: PositiveInt = Field(3, description="Number of folds in cross-validation.") @@ -33,6 +34,8 @@ class DataConfig(BaseModel): class LoggingConfig(BaseModel): """Configuration for the logging.""" + model_config = ConfigDict(extra="forbid") + _dirpath: Path | None = None _dump_dir: Path | None = None diff --git a/autointent/configs/_transformers.py b/autointent/configs/_transformers.py index 81ea3d57f..2cbe98efb 100644 --- a/autointent/configs/_transformers.py +++ b/autointent/configs/_transformers.py @@ -1,15 +1,12 @@ from enum import Enum from typing import Any -from pydantic import ( - BaseModel, - Field, - PositiveInt, -) +from pydantic import BaseModel, ConfigDict, Field, PositiveInt from typing_extensions import Self, assert_never class ModelConfig(BaseModel): + model_config = ConfigDict(extra="forbid") batch_size: PositiveInt = Field(32, description="Batch size for model inference.") max_length: PositiveInt | None = Field(None, description="Maximum length of input sequences.") diff --git a/autointent/modules/regex/_simple.py b/autointent/modules/regex/_simple.py index 76d33ca11..0ce441ca4 100644 --- a/autointent/modules/regex/_simple.py +++ b/autointent/modules/regex/_simple.py @@ -1,12 +1,16 @@ """Module for regular expressions based intent detection.""" import re +from collections.abc import Iterable from typing import Any, TypedDict +import numpy as np +import numpy.typing as npt + from autointent import Context from autointent.context.data_handler._data_handler import RegexPatterns from autointent.context.optimization_info import Artifact -from autointent.custom_types import LabelType +from autointent.custom_types import LabelType, ListOfGenericLabels, ListOfLabels from autointent.metrics import REGEX_METRICS from autointent.modules.base import BaseRegex from autointent.schemas import Intent @@ -36,7 +40,10 @@ class Regex(BaseRegex): name: Name of the module, defaults to "regex" """ - name = "regex" + name = "simple" + supports_multiclass = True + supports_multilabel = True + supports_oos = False @classmethod def from_context(cls, context: Context) -> "Regex": @@ -158,7 +165,7 @@ def score_ho(self, context: Context, metrics: list[str]) -> dict[str, float]: return self.score_metrics_ho((val_labels, pred_labels), chosen_metrics) def score_cv(self, context: Context, metrics: list[str]) -> dict[str, float]: - """Score the model using cross-validation. + """Score the model in cross-validation mode. Args: context: Context containing validation data @@ -169,10 +176,42 @@ def score_cv(self, context: Context, metrics: list[str]) -> dict[str, float]: """ chosen_metrics = {name: fn for name, fn in REGEX_METRICS.items() if name in metrics} - metrics_calculated, _ = self.score_metrics_cv(chosen_metrics, context.data_handler.validation_iterator()) + metrics_calculated, _ = self.score_metrics_cv( + chosen_metrics, context.data_handler.validation_iterator(), intents=context.data_handler.dataset.intents + ) return metrics_calculated + def score_metrics_cv( + self, + metrics_dict: dict[str, Any], + cv_iterator: Iterable[tuple[list[str], ListOfLabels, list[str], ListOfLabels]], + intents: list[Intent], + ) -> tuple[dict[str, float], list[ListOfGenericLabels] | list[npt.NDArray[Any]]]: + """Score metrics using cross-validation. + + Args: + metrics_dict: Dictionary of metrics to compute + cv_iterator: Cross-validation iterator + intents: intents from the dataset + + Returns: + Tuple of metrics dictionary and predictions + """ + metrics_values: dict[str, list[float]] = {name: [] for name in metrics_dict} + all_val_preds = [] + + self.fit(intents) + + for _, _, val_utterances, val_labels in cv_iterator: + val_preds = self.predict(val_utterances) + for name, fn in metrics_dict.items(): + metrics_values[name].append(fn(val_labels, val_preds)) + all_val_preds.append(val_preds) + + metrics = {name: float(np.mean(values_list)) for name, values_list in metrics_values.items()} + return metrics, all_val_preds # type: ignore[return-value] + def clear_cache(self) -> None: """Clear cached regex patterns.""" del self.regex_patterns diff --git a/autointent/modules/scoring/_dnnc/dnnc.py b/autointent/modules/scoring/_dnnc/dnnc.py index a947da09a..7758528b2 100644 --- a/autointent/modules/scoring/_dnnc/dnnc.py +++ b/autointent/modules/scoring/_dnnc/dnnc.py @@ -52,13 +52,6 @@ class DNNCScorer(BaseScorer): test_utterances = ["Hello!", "What's up?"] scores = scorer.predict(test_utterances) - print(scores) # Outputs similarity scores for the utterances - - - .. testoutput:: - - [[0.00013581 0. ] - [0.00030066 0. ]] """ diff --git a/autointent/modules/scoring/_knn/knn.py b/autointent/modules/scoring/_knn/knn.py index b92a3eb44..27453fce9 100644 --- a/autointent/modules/scoring/_knn/knn.py +++ b/autointent/modules/scoring/_knn/knn.py @@ -42,12 +42,6 @@ class KNNScorer(BaseScorer): scorer.fit(utterances, labels) test_utterances = ["hi", "what's up?"] probabilities = scorer.predict(test_utterances) - print(probabilities) # Outputs predicted class probabilities for the utterances - - .. testoutput:: - - [[0.67297815 0.32702185] - [0.44031667 0.55968333]] """ diff --git a/autointent/nodes/info/_regex.py b/autointent/nodes/info/_regex.py index 0f03ef261..d01603100 100644 --- a/autointent/nodes/info/_regex.py +++ b/autointent/nodes/info/_regex.py @@ -6,8 +6,8 @@ from autointent.custom_types import NodeType from autointent.metrics import REGEX_METRICS from autointent.metrics.regex import RegexMetricFn +from autointent.modules import REGEX_MODULES from autointent.modules.base import BaseRegex -from autointent.modules.regex import Regex from ._base import NodeInfo @@ -17,6 +17,6 @@ class RegexNodeInfo(NodeInfo): metrics_available: ClassVar[Mapping[str, RegexMetricFn]] = REGEX_METRICS - modules_available: ClassVar[Mapping[str, type[BaseRegex]]] = {NodeType.regex: Regex} + modules_available: ClassVar[Mapping[str, type[BaseRegex]]] = REGEX_MODULES node_type = NodeType.regex diff --git a/docs/optimizer_config.schema.json b/docs/optimizer_config.schema.json index 6018c52a4..8227dbb9d 100644 --- a/docs/optimizer_config.schema.json +++ b/docs/optimizer_config.schema.json @@ -1,6 +1,7 @@ { "$defs": { "CrossEncoderConfig": { + "additionalProperties": false, "properties": { "batch_size": { "default": 32, @@ -53,6 +54,7 @@ "type": "object" }, "DataConfig": { + "additionalProperties": false, "description": "Configuration for the data used in the optimization process.", "properties": { "scheme": { @@ -100,6 +102,7 @@ "type": "object" }, "EmbedderConfig": { + "additionalProperties": false, "properties": { "batch_size": { "default": 32, @@ -230,6 +233,7 @@ "type": "object" }, "LoggingConfig": { + "additionalProperties": false, "description": "Configuration for the logging.", "properties": { "project_dir": { diff --git a/tests/assets/configs/regex.yaml b/tests/assets/configs/regex.yaml new file mode 100644 index 000000000..c88915795 --- /dev/null +++ b/tests/assets/configs/regex.yaml @@ -0,0 +1,14 @@ +- node_type: regex + target_metric: regex_partial_accuracy + search_space: + - module_name: simple +- node_type: scoring + target_metric: scoring_roc_auc + search_space: + - module_name: linear + embedder_config: + - model_name: sentence-transformers/all-MiniLM-L6-v2 +- node_type: decision + target_metric: decision_accuracy + search_space: + - module_name: argmax diff --git a/tests/callback/test_callback.py b/tests/callback/test_callback.py index 4c4d38a72..986b5d348 100644 --- a/tests/callback/test_callback.py +++ b/tests/callback/test_callback.py @@ -85,7 +85,7 @@ def test_pipeline_callbacks(dataset): context = Context() context.configure_logging(LoggingConfig(run_name="dummy_run_name", project_dir=project_dir, dump_modules=False)) context.callback_handler = CallbackHandler([DummyCallback]) - context.set_dataset(dataset, DataConfig(scheme="ho", separate_nodes=True)) + context.set_dataset(dataset, DataConfig(scheme="ho")) pipeline_optimizer._fit(context, "brute") diff --git a/tests/conftest.py b/tests/conftest.py index fb1ed3a4a..9f01bb353 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,7 +33,7 @@ def dataset_no_oos(): return Dataset.from_json(path) -TaskType = Literal["multiclass", "multilabel", "description", "optuna", "light"] +TaskType = Literal["multiclass", "multilabel", "description", "optuna", "light", "regex"] def get_search_space_path(task_type: TaskType): diff --git a/tests/nodes/conftest.py b/tests/nodes/conftest.py index 7a844b68a..a7326f249 100644 --- a/tests/nodes/conftest.py +++ b/tests/nodes/conftest.py @@ -76,6 +76,6 @@ def get_context(multilabel): dataset = Dataset.from_json(get_dataset_path()) if multilabel: dataset = dataset.to_multilabel() - res.set_dataset(dataset, DataConfig(scheme="ho", separate_nodes=True)) + res.set_dataset(dataset, DataConfig(scheme="ho")) res.configure_logging(LoggingConfig(project_dir=project_dir, dump_modules=True)) return res diff --git a/tests/pipeline/test_optimization.py b/tests/pipeline/test_optimization.py index b2bebc7f6..ca97b161d 100644 --- a/tests/pipeline/test_optimization.py +++ b/tests/pipeline/test_optimization.py @@ -8,6 +8,31 @@ from tests.conftest import get_search_space, setup_environment +@pytest.mark.parametrize( + ("data_config", "refit_after"), + [ + (DataConfig(scheme="ho", separation_ratio=None), False), + (DataConfig(scheme="ho", separation_ratio=0.5), False), + (DataConfig(scheme="cv", separation_ratio=None), False), + (DataConfig(scheme="cv", separation_ratio=0.5), False), + (DataConfig(scheme="ho", separation_ratio=None), True), + (DataConfig(scheme="ho", separation_ratio=0.5), True), + (DataConfig(scheme="cv", separation_ratio=None), True), + (DataConfig(scheme="cv", separation_ratio=0.5), True), + ], +) +def test_with_regex(dataset, data_config, refit_after): + project_dir = setup_environment() + search_space = get_search_space("regex") + + pipeline_optimizer = Pipeline.from_search_space(search_space) + + pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True)) + pipeline_optimizer.set_config(data_config) + + pipeline_optimizer.fit(dataset, refit_after=refit_after) + + def test_no_node_separation(dataset_no_oos): project_dir = setup_environment() search_space = get_search_space("light") @@ -15,7 +40,7 @@ def test_no_node_separation(dataset_no_oos): pipeline_optimizer = Pipeline.from_search_space(search_space) pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True)) - pipeline_optimizer.set_config(DataConfig(scheme="ho", separate_nodes=False)) + pipeline_optimizer.set_config(DataConfig(scheme="ho", separation_ratio=None)) pipeline_optimizer.fit(dataset_no_oos, refit_after=False) @@ -37,7 +62,7 @@ def test_bayes(dataset, sampler): pipeline_optimizer = Pipeline.from_search_space(search_space) pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True)) - pipeline_optimizer.set_config(DataConfig(scheme="ho", separate_nodes=True)) + pipeline_optimizer.set_config(DataConfig(scheme="ho", separation_ratio=0.5)) pipeline_optimizer.fit(dataset, refit_after=False, sampler=sampler) @@ -53,7 +78,7 @@ def test_cv(dataset, task_type): pipeline_optimizer = Pipeline.from_search_space(search_space) pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True)) - pipeline_optimizer.set_config(DataConfig(scheme="cv", separate_nodes=True)) + pipeline_optimizer.set_config(DataConfig(scheme="cv", separation_ratio=0.5)) if task_type == "multilabel": dataset = dataset.to_multilabel() @@ -75,7 +100,7 @@ def test_no_context_optimization(dataset, task_type): pipeline_optimizer = Pipeline.from_search_space(search_space) pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=False, clear_ram=False)) - pipeline_optimizer.set_config(DataConfig(scheme="ho", separate_nodes=True)) + pipeline_optimizer.set_config(DataConfig(scheme="ho", separation_ratio=0.5)) if task_type == "multilabel": dataset = dataset.to_multilabel() diff --git a/tests/pipeline/test_presets.py b/tests/pipeline/test_presets.py index e987ff62d..2cfb72ba0 100644 --- a/tests/pipeline/test_presets.py +++ b/tests/pipeline/test_presets.py @@ -18,6 +18,6 @@ def test_presets(dataset, preset): return pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True)) - pipeline_optimizer.set_config(DataConfig(scheme="ho", separate_nodes=True)) + pipeline_optimizer.set_config(DataConfig(scheme="ho")) pipeline_optimizer.fit(dataset, refit_after=False) diff --git a/user_guides/advanced/04_reporting.py b/user_guides/advanced/04_reporting.py index 2d092e819..85c406086 100644 --- a/user_guides/advanced/04_reporting.py +++ b/user_guides/advanced/04_reporting.py @@ -74,7 +74,7 @@ from pathlib import Path log_config = LoggingConfig( - run_name="test_tensorboard", report_to=["tensorboard"], dirpath=Path("test_tensorboard"), dump_modules=False + run_name="test_tensorboard", report_to=["tensorboard"], project_dir=Path("my_projects"), dump_modules=False ) pipeline_optimizer.set_config(log_config) diff --git a/user_guides/advanced/05_logging.py b/user_guides/advanced/05_logging.py index c287e2258..e80c385fb 100644 --- a/user_guides/advanced/05_logging.py +++ b/user_guides/advanced/05_logging.py @@ -36,7 +36,7 @@ }, ] -log_config = LoggingConfig(dirpath=Path("logging_tutorial")) +log_config = LoggingConfig(project_dir=Path("logging_tutorial")) pipeline_optimizer = Pipeline.from_search_space(search_space) pipeline_optimizer.set_config(log_config)