Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion autointent/modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,25 @@ def _create_modules_dict(modules: list[type[T]]) -> dict[str, type[T]]:
)


__all__ = [] # type: ignore[var-annotated]
__all__ = [
"AdaptiveDecision",
"ArgmaxDecision",
"BaseDecision",
"BaseEmbedding",
"BaseModule",
"BaseRegex",
"BaseScorer",
"DNNCScorer",
"DescriptionScorer",
"JinoosDecision",
"KNNScorer",
"LinearScorer",
"LogregAimedEmbedding",
"MLKnnScorer",
"RerankScorer",
"RetrievalAimedEmbedding",
"SimpleRegex",
"SklearnScorer",
"ThresholdDecision",
"TunableDecision",
]
24 changes: 13 additions & 11 deletions autointent/modules/scoring/_bert.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,14 @@ class BertScorer(BaseScorer):

def __init__(
self,
model_config: HFModelConfig | str | dict[str, Any] | None = None,
classification_model_config: HFModelConfig | str | dict[str, Any] | None = None,
num_train_epochs: int = 3,
batch_size: int = 8,
learning_rate: float = 5e-5,
seed: int = 0,
report_to: REPORTERS_NAMES | None = None, # type: ignore # noqa: PGH003
) -> None:
self.model_config = HFModelConfig.from_search_config(model_config)
self.classification_model_config = HFModelConfig.from_search_config(classification_model_config)
self.num_train_epochs = num_train_epochs
self.batch_size = batch_size
self.learning_rate = learning_rate
Expand All @@ -49,19 +49,19 @@ def __init__(
def from_context(
cls,
context: Context,
model_config: HFModelConfig | str | dict[str, Any] | None = None,
classification_model_config: HFModelConfig | str | dict[str, Any] | None = None,
num_train_epochs: int = 3,
batch_size: int = 8,
learning_rate: float = 5e-5,
seed: int = 0,
) -> "BertScorer":
if model_config is None:
model_config = context.resolve_embedder()
if classification_model_config is None:
classification_model_config = context.resolve_embedder()

report_to = context.logging_config.report_to

return cls(
model_config=model_config,
classification_model_config=classification_model_config,
num_train_epochs=num_train_epochs,
batch_size=batch_size,
learning_rate=learning_rate,
Expand All @@ -70,7 +70,7 @@ def from_context(
)

def get_embedder_config(self) -> dict[str, Any]:
return self.model_config.model_dump()
return self.classification_model_config.model_dump()

def fit(
self,
Expand All @@ -81,7 +81,7 @@ def fit(
self.clear_cache()
self._validate_task(labels)

model_name = self.model_config.model_name
model_name = self.classification_model_config.model_name
self._tokenizer = AutoTokenizer.from_pretrained(model_name)

label2id = {i: i for i in range(self._n_classes)}
Expand All @@ -95,11 +95,11 @@ def fit(
problem_type="multi_label_classification" if self._multilabel else "single_label_classification",
)

use_cpu = self.model_config.device == "cpu"
use_cpu = self.classification_model_config.device == "cpu"

def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]:
return self._tokenizer( # type: ignore[no-any-return]
examples["text"], return_tensors="pt", **self.model_config.tokenizer_config.model_dump()
examples["text"], return_tensors="pt", **self.classification_model_config.tokenizer_config.model_dump()
)

dataset = Dataset.from_dict({"text": utterances, "labels": labels})
Expand Down Expand Up @@ -148,7 +148,9 @@ def predict(self, utterances: list[str]) -> npt.NDArray[Any]:
all_predictions = []
for i in range(0, len(utterances), self.batch_size):
batch = utterances[i : i + self.batch_size]
inputs = self._tokenizer(batch, return_tensors="pt", **self.model_config.tokenizer_config.model_dump())
inputs = self._tokenizer(
batch, return_tensors="pt", **self.classification_model_config.tokenizer_config.model_dump()
)
inputs = {k: v.to(device) for k, v in inputs.items()}
with torch.no_grad():
outputs = self._model(**inputs)
Expand Down
5 changes: 4 additions & 1 deletion autointent/modules/scoring/_linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import numpy as np
import numpy.typing as npt
from pydantic import PositiveInt
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.multioutput import MultiOutputClassifier

Expand All @@ -22,7 +23,6 @@ class LinearScorer(BaseScorer):
Args:
embedder_config: Config of the embedder model
cv: Number of cross-validation folds, defaults to 3
n_jobs: Number of parallel jobs for cross-validation, defaults to None
seed: Random seed for reproducibility, defaults to 0

Example:
Expand Down Expand Up @@ -72,18 +72,21 @@ def __init__(
def from_context(
cls,
context: Context,
cv: PositiveInt = 3,
embedder_config: EmbedderConfig | str | None = None,
) -> "LinearScorer":
"""Create a LinearScorer instance using a Context object.

Args:
context: Context containing configurations and utilities
cv: Number of cross-validation folds, defaults to 3
embedder_config: Config of the embedder, or None to use the best embedder
"""
if embedder_config is None:
embedder_config = context.resolve_embedder()

return cls(
cv=cv,
embedder_config=embedder_config,
)

Expand Down
24 changes: 3 additions & 21 deletions autointent/nodes/_node_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,33 +11,14 @@
import optuna
import torch
from optuna.trial import Trial
from pydantic import BaseModel, Field
from typing_extensions import assert_never

from autointent import Dataset
from autointent.context import Context
from autointent.custom_types import NodeType, SamplerType, SearchSpaceValidationMode
from autointent.nodes.emissions_tracker import EmissionsTracker
from autointent.nodes.info import NODES_INFO


class ParamSpaceInt(BaseModel):
"""Integer parameter search space configuration."""

low: int = Field(..., description="Lower boundary of the search space.")
high: int = Field(..., description="Upper boundary of the search space.")
step: int = Field(1, description="Step size for the search space.")
log: bool = Field(False, description="Indicates whether to use a logarithmic scale.")


class ParamSpaceFloat(BaseModel):
"""Float parameter search space configuration."""

low: float = Field(..., description="Lower boundary of the search space.")
high: float = Field(..., description="Upper boundary of the search space.")
step: float | None = Field(None, description="Step size for the search space (if applicable).")
log: bool = Field(False, description="Indicates whether to use a logarithmic scale.")

from autointent.schemas.node_validation import ParamSpaceFloat, ParamSpaceInt, SearchSpaceConfig

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -277,7 +258,8 @@ def validate_nodes_with_dataset(self, dataset: Dataset, mode: SearchSpaceValidat

def validate_search_space(self, search_space: list[dict[str, Any]]) -> None:
"""Check if search space is configured correctly."""
for module_search_space in search_space:
validated_search_space = SearchSpaceConfig(search_space).model_dump()
for module_search_space in validated_search_space:
module_search_space_no_optuna, module_name = self._reformat_search_space(deepcopy(module_search_space))

for params_combination in it.product(*module_search_space_no_optuna.values()):
Expand Down
Loading
Loading