Skip to content
2 changes: 1 addition & 1 deletion autointent/_pipeline/_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def __init__(
self.sampler = sampler

if isinstance(nodes[0], NodeOptimizer):
self.logging_config = LoggingConfig(dump_dir=None)
self.logging_config = LoggingConfig()
self.embedder_config = EmbedderConfig()
self.cross_encoder_config = CrossEncoderConfig()
self.data_config = DataConfig()
Expand Down
16 changes: 8 additions & 8 deletions autointent/_ranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,14 @@ class CrossEncoderMetadata(TypedDict):

Attributes:
model_name: Name of the model
train_classifier: Whether to train a classifier
train_head: Whether to train a classifier head on top of the cross-encoder
device: Device to use for inference
max_length: Maximum sequence length
batch_size: Batch size for inference
"""

model_name: str
train_classifier: bool
train_head: bool
device: str | None
max_length: int | None
batch_size: int
Expand Down Expand Up @@ -119,11 +119,11 @@ def __init__(
device=self.cross_encoder_config.device,
max_length=self.cross_encoder_config.max_length, # type: ignore[arg-type]
)
self.train_classifier = False
self.train_head = False
self._clf = classifier_head

if classifier_head is not None or self.cross_encoder_config.train_head:
self.train_classifier = True
self.train_head = True
self._activations_list: list[npt.NDArray[Any]] = []
self._hook_handler = self.cross_encoder.model.classifier.register_forward_hook(self._classifier_hook)

Expand All @@ -147,7 +147,7 @@ def _get_features_or_predictions(self, pairs: list[tuple[str, str]]) -> npt.NDAr
Returns:
Array of extracted features or predictions
"""
if not self.train_classifier:
if not self.train_head:
return np.array(
self.cross_encoder.predict(
pairs,
Expand Down Expand Up @@ -189,7 +189,7 @@ def fit(self, utterances: list[str], labels: ListOfLabels) -> None:
utterances: List of utterances (texts)
labels: Intent class labels corresponding to the utterances
"""
if not self.train_classifier:
if not self.train_head:
return

pairs, labels_ = construct_samples(utterances, labels, balancing_factor=1)
Expand All @@ -207,7 +207,7 @@ def predict(self, pairs: list[tuple[str, str]]) -> npt.NDArray[Any]:
Raises:
ValueError: If classifier is not trained yet
"""
if self.train_classifier and self._clf is None:
if self.train_head and self._clf is None:
msg = "Classifier is not trained yet"
raise ValueError(msg)

Expand Down Expand Up @@ -254,7 +254,7 @@ def save(self, path: str) -> None:

metadata = CrossEncoderMetadata(
model_name=self.cross_encoder_config.model_name,
train_classifier=self.train_classifier,
train_head=self.train_head,
device=self.cross_encoder_config.device,
max_length=self.cross_encoder_config.max_length,
batch_size=self.cross_encoder_config.batch_size,
Expand Down
5 changes: 4 additions & 1 deletion autointent/configs/_optimization.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pathlib import Path

from pydantic import BaseModel, Field, PositiveInt
from pydantic import BaseModel, ConfigDict, Field, PositiveInt

from autointent._callbacks import REPORTERS_NAMES
from autointent.custom_types import FloatFromZeroToOne, ValidationScheme
Expand All @@ -13,6 +13,7 @@
class DataConfig(BaseModel):
"""Configuration for the data used in the optimization process."""

model_config = ConfigDict(extra="forbid")
scheme: ValidationScheme = Field("ho", description="Validation scheme to use.")
"""Hold-out or cross-validation."""
n_folds: PositiveInt = Field(3, description="Number of folds in cross-validation.")
Expand All @@ -33,6 +34,8 @@ class DataConfig(BaseModel):
class LoggingConfig(BaseModel):
"""Configuration for the logging."""

model_config = ConfigDict(extra="forbid")

_dirpath: Path | None = None
_dump_dir: Path | None = None

Expand Down
7 changes: 2 additions & 5 deletions autointent/configs/_transformers.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
from enum import Enum
from typing import Any

from pydantic import (
BaseModel,
Field,
PositiveInt,
)
from pydantic import BaseModel, ConfigDict, Field, PositiveInt
from typing_extensions import Self, assert_never


class ModelConfig(BaseModel):
model_config = ConfigDict(extra="forbid")
batch_size: PositiveInt = Field(32, description="Batch size for model inference.")
max_length: PositiveInt | None = Field(None, description="Maximum length of input sequences.")

Expand Down
47 changes: 43 additions & 4 deletions autointent/modules/regex/_simple.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
"""Module for regular expressions based intent detection."""

import re
from collections.abc import Iterable
from typing import Any, TypedDict

import numpy as np
import numpy.typing as npt

from autointent import Context
from autointent.context.data_handler._data_handler import RegexPatterns
from autointent.context.optimization_info import Artifact
from autointent.custom_types import LabelType
from autointent.custom_types import LabelType, ListOfGenericLabels, ListOfLabels
from autointent.metrics import REGEX_METRICS
from autointent.modules.base import BaseRegex
from autointent.schemas import Intent
Expand Down Expand Up @@ -36,7 +40,10 @@ class Regex(BaseRegex):
name: Name of the module, defaults to "simple"
"""

name = "regex"
name = "simple"
supports_multiclass = True
supports_multilabel = True
supports_oos = False

@classmethod
def from_context(cls, context: Context) -> "Regex":
Expand Down Expand Up @@ -158,7 +165,7 @@ def score_ho(self, context: Context, metrics: list[str]) -> dict[str, float]:
return self.score_metrics_ho((val_labels, pred_labels), chosen_metrics)

def score_cv(self, context: Context, metrics: list[str]) -> dict[str, float]:
"""Score the model using cross-validation.
"""Score the model in cross-validation mode.

Args:
context: Context containing validation data
Expand All @@ -169,10 +176,42 @@ def score_cv(self, context: Context, metrics: list[str]) -> dict[str, float]:
"""
chosen_metrics = {name: fn for name, fn in REGEX_METRICS.items() if name in metrics}

metrics_calculated, _ = self.score_metrics_cv(chosen_metrics, context.data_handler.validation_iterator())
metrics_calculated, _ = self.score_metrics_cv(
chosen_metrics, context.data_handler.validation_iterator(), intents=context.data_handler.dataset.intents
)

return metrics_calculated

def score_metrics_cv(
    self,
    metrics_dict: dict[str, Any],
    cv_iterator: Iterable[tuple[list[str], ListOfLabels, list[str], ListOfLabels]],
    intents: list[Intent],
) -> tuple[dict[str, float], list[ListOfGenericLabels] | list[npt.NDArray[Any]]]:
    """Evaluate the module on every cross-validation fold and average the metrics.

    The module is fitted a single time on ``intents`` before the folds are
    visited; the training part of each fold is ignored and only its validation
    utterances and labels are used.

    Args:
        metrics_dict: Mapping from metric name to a callable invoked as
            ``fn(val_labels, val_preds)``
        cv_iterator: Iterable of folds, each unpacked as
            ``(train_utterances, train_labels, val_utterances, val_labels)``
        intents: Intents passed to ``self.fit``

    Returns:
        Tuple of (metric name -> mean value over folds, list of per-fold predictions)
    """
    # One fit up front: the per-fold training split is never used below.
    self.fit(intents)

    fold_predictions = []
    per_metric_scores: dict[str, list[float]] = {metric_name: [] for metric_name in metrics_dict}

    for fold in cv_iterator:
        _train_utterances, _train_labels, val_utterances, val_labels = fold
        fold_pred = self.predict(val_utterances)
        for metric_name, metric_fn in metrics_dict.items():
            per_metric_scores[metric_name].append(metric_fn(val_labels, fold_pred))
        fold_predictions.append(fold_pred)

    averaged: dict[str, float] = {}
    for metric_name, scores in per_metric_scores.items():
        averaged[metric_name] = float(np.mean(scores))
    return averaged, fold_predictions  # type: ignore[return-value]

def clear_cache(self) -> None:
"""Clear cached regex patterns."""
del self.regex_patterns
Expand Down
7 changes: 0 additions & 7 deletions autointent/modules/scoring/_dnnc/dnnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,6 @@ class DNNCScorer(BaseScorer):

test_utterances = ["Hello!", "What's up?"]
scores = scorer.predict(test_utterances)
print(scores) # Outputs similarity scores for the utterances


.. testoutput::

[[0.00013581 0. ]
[0.00030066 0. ]]

"""

Expand Down
6 changes: 0 additions & 6 deletions autointent/modules/scoring/_knn/knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,6 @@ class KNNScorer(BaseScorer):
scorer.fit(utterances, labels)
test_utterances = ["hi", "what's up?"]
probabilities = scorer.predict(test_utterances)
print(probabilities) # Outputs predicted class probabilities for the utterances

.. testoutput::

[[0.67297815 0.32702185]
[0.44031667 0.55968333]]

"""

Expand Down
4 changes: 2 additions & 2 deletions autointent/nodes/info/_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from autointent.custom_types import NodeType
from autointent.metrics import REGEX_METRICS
from autointent.metrics.regex import RegexMetricFn
from autointent.modules import REGEX_MODULES
from autointent.modules.base import BaseRegex
from autointent.modules.regex import Regex

from ._base import NodeInfo

Expand All @@ -17,6 +17,6 @@ class RegexNodeInfo(NodeInfo):

metrics_available: ClassVar[Mapping[str, RegexMetricFn]] = REGEX_METRICS

modules_available: ClassVar[Mapping[str, type[BaseRegex]]] = {NodeType.regex: Regex}
modules_available: ClassVar[Mapping[str, type[BaseRegex]]] = REGEX_MODULES

node_type = NodeType.regex
4 changes: 4 additions & 0 deletions docs/optimizer_config.schema.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"$defs": {
"CrossEncoderConfig": {
"additionalProperties": false,
"properties": {
"batch_size": {
"default": 32,
Expand Down Expand Up @@ -53,6 +54,7 @@
"type": "object"
},
"DataConfig": {
"additionalProperties": false,
"description": "Configuration for the data used in the optimization process.",
"properties": {
"scheme": {
Expand Down Expand Up @@ -100,6 +102,7 @@
"type": "object"
},
"EmbedderConfig": {
"additionalProperties": false,
"properties": {
"batch_size": {
"default": 32,
Expand Down Expand Up @@ -230,6 +233,7 @@
"type": "object"
},
"LoggingConfig": {
"additionalProperties": false,
"description": "Configuration for the logging.",
"properties": {
"project_dir": {
Expand Down
14 changes: 14 additions & 0 deletions tests/assets/configs/regex.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
- node_type: regex
target_metric: regex_partial_accuracy
search_space:
- module_name: simple
- node_type: scoring
target_metric: scoring_roc_auc
search_space:
- module_name: linear
embedder_config:
- model_name: sentence-transformers/all-MiniLM-L6-v2
- node_type: decision
target_metric: decision_accuracy
search_space:
- module_name: argmax
2 changes: 1 addition & 1 deletion tests/callback/test_callback.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def test_pipeline_callbacks(dataset):
context = Context()
context.configure_logging(LoggingConfig(run_name="dummy_run_name", project_dir=project_dir, dump_modules=False))
context.callback_handler = CallbackHandler([DummyCallback])
context.set_dataset(dataset, DataConfig(scheme="ho", separate_nodes=True))
context.set_dataset(dataset, DataConfig(scheme="ho"))

pipeline_optimizer._fit(context, "brute")

Expand Down
2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def dataset_no_oos():
return Dataset.from_json(path)


TaskType = Literal["multiclass", "multilabel", "description", "optuna", "light"]
TaskType = Literal["multiclass", "multilabel", "description", "optuna", "light", "regex"]


def get_search_space_path(task_type: TaskType):
Expand Down
2 changes: 1 addition & 1 deletion tests/nodes/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,6 @@ def get_context(multilabel):
dataset = Dataset.from_json(get_dataset_path())
if multilabel:
dataset = dataset.to_multilabel()
res.set_dataset(dataset, DataConfig(scheme="ho", separate_nodes=True))
res.set_dataset(dataset, DataConfig(scheme="ho"))
res.configure_logging(LoggingConfig(project_dir=project_dir, dump_modules=True))
return res
33 changes: 29 additions & 4 deletions tests/pipeline/test_optimization.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,39 @@
from tests.conftest import get_search_space, setup_environment


@pytest.mark.parametrize(
    ("data_config", "refit_after"),
    [
        (DataConfig(scheme=scheme, separation_ratio=ratio), refit)
        for refit in (False, True)
        for scheme in ("ho", "cv")
        for ratio in (None, 0.5)
    ],
)
def test_with_regex(dataset, data_config, refit_after):
    """Smoke-test pipeline optimization on the "regex" search space.

    Runs once for every combination of validation scheme ("ho" / "cv"),
    separation ratio (None / 0.5) and ``refit_after`` flag; the test passes
    when ``fit`` completes without raising.
    """
    workdir = setup_environment()
    pipeline_optimizer = Pipeline.from_search_space(get_search_space("regex"))

    # Logging config is applied first, then the parametrized data config.
    for config in (
        LoggingConfig(project_dir=workdir, dump_modules=True, clear_ram=True),
        data_config,
    ):
        pipeline_optimizer.set_config(config)

    pipeline_optimizer.fit(dataset, refit_after=refit_after)


def test_no_node_separation(dataset_no_oos):
project_dir = setup_environment()
search_space = get_search_space("light")

pipeline_optimizer = Pipeline.from_search_space(search_space)

pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True))
pipeline_optimizer.set_config(DataConfig(scheme="ho", separate_nodes=False))
pipeline_optimizer.set_config(DataConfig(scheme="ho", separation_ratio=None))

pipeline_optimizer.fit(dataset_no_oos, refit_after=False)

Expand All @@ -37,7 +62,7 @@ def test_bayes(dataset, sampler):
pipeline_optimizer = Pipeline.from_search_space(search_space)

pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True))
pipeline_optimizer.set_config(DataConfig(scheme="ho", separate_nodes=True))
pipeline_optimizer.set_config(DataConfig(scheme="ho", separation_ratio=0.5))

pipeline_optimizer.fit(dataset, refit_after=False, sampler=sampler)

Expand All @@ -53,7 +78,7 @@ def test_cv(dataset, task_type):
pipeline_optimizer = Pipeline.from_search_space(search_space)

pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True))
pipeline_optimizer.set_config(DataConfig(scheme="cv", separate_nodes=True))
pipeline_optimizer.set_config(DataConfig(scheme="cv", separation_ratio=0.5))

if task_type == "multilabel":
dataset = dataset.to_multilabel()
Expand All @@ -75,7 +100,7 @@ def test_no_context_optimization(dataset, task_type):
pipeline_optimizer = Pipeline.from_search_space(search_space)

pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=False, clear_ram=False))
pipeline_optimizer.set_config(DataConfig(scheme="ho", separate_nodes=True))
pipeline_optimizer.set_config(DataConfig(scheme="ho", separation_ratio=0.5))

if task_type == "multilabel":
dataset = dataset.to_multilabel()
Expand Down
2 changes: 1 addition & 1 deletion tests/pipeline/test_presets.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ def test_presets(dataset, preset):
return

pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True))
pipeline_optimizer.set_config(DataConfig(scheme="ho", separate_nodes=True))
pipeline_optimizer.set_config(DataConfig(scheme="ho"))

pipeline_optimizer.fit(dataset, refit_after=False)
Loading