67 changes: 43 additions & 24 deletions autointent/_dump_tools.py
@@ -7,8 +7,16 @@
import joblib
import numpy as np
import numpy.typing as npt
from peft import PeftModel
from pydantic import BaseModel
from sklearn.base import BaseEstimator
from transformers import ( # type: ignore[attr-defined]
AutoModelForSequenceClassification,
AutoTokenizer,
PreTrainedModel,
PreTrainedTokenizer,
PreTrainedTokenizerFast,
)

from autointent import Embedder, Ranker, VectorIndex
from autointent.configs import CrossEncoderConfig, EmbedderConfig
@@ -34,6 +42,7 @@ class Dumper:
pydantic_models: str = "pydantic"
hf_models = "hf_models"
hf_tokenizers = "hf_tokenizers"
ptuning_models = "ptuning_models"

@staticmethod
def make_subdirectories(path: Path, exists_ok: bool = False) -> None:
@@ -52,6 +61,7 @@ def make_subdirectories(path: Path, exists_ok: bool = False) -> None:
path / Dumper.pydantic_models,
path / Dumper.hf_models,
path / Dumper.hf_tokenizers,
path / Dumper.ptuning_models,
]
for subdir in subdirectories:
subdir.mkdir(parents=True, exist_ok=exists_ok)
@@ -101,25 +111,38 @@ def dump(obj: Any, path: Path, exists_ok: bool = False, exclude: list[type[Any]]
except Exception as e:
msg = f"Error dumping pydantic model {key}: {e}"
logging.exception(msg)
elif (key == "_model" or "model" in key.lower()) and hasattr(val, "save_pretrained"):
elif isinstance(val, PeftModel):
# dumping peft models is a nightmare...
# this might break with new versions of peft
try:
if val._is_prompt_learning: # noqa: SLF001
# strategy to save prompt learning models: save prompt encoder and bert classifier separately
model_path = path / Dumper.ptuning_models / key
model_path.mkdir(parents=True, exist_ok=True)
val.save_pretrained(str(model_path / "peft"))
val.base_model.save_pretrained(model_path / "base_model") # type: ignore[attr-defined]
else:
# strategy to save lora models: merge adapters and save as usual hugging face model
model_path = path / Dumper.hf_models / key
model_path.mkdir(parents=True, exist_ok=True)
merged_model: PreTrainedModel = val.merge_and_unload()
merged_model.save_pretrained(model_path) # type: ignore[attr-defined]
except Exception as e:
msg = f"Error dumping PeftModel {key}: {e}"
logger.exception(msg)
elif isinstance(val, PreTrainedModel):
model_path = path / Dumper.hf_models / key
model_path.mkdir(parents=True, exist_ok=True)
try:
val.save_pretrained(model_path)
class_info = {"module": val.__class__.__module__, "name": val.__class__.__name__}
with (model_path / "class_info.json").open("w") as f:
json.dump(class_info, f)
val.save_pretrained(model_path) # type: ignore[attr-defined]
except Exception as e:
msg = f"Error dumping HF model {key}: {e}"
logger.exception(msg)
elif (key == "_tokenizer" or "tokenizer" in key.lower()) and hasattr(val, "save_pretrained"):
elif isinstance(val, PreTrainedTokenizer | PreTrainedTokenizerFast):
tokenizer_path = path / Dumper.hf_tokenizers / key
tokenizer_path.mkdir(parents=True, exist_ok=True)
try:
val.save_pretrained(tokenizer_path)
class_info = {"module": val.__class__.__module__, "name": val.__class__.__name__}
with (tokenizer_path / "class_info.json").open("w") as f:
json.dump(class_info, f)
val.save_pretrained(tokenizer_path) # type: ignore[union-attr]
except Exception as e:
msg = f"Error dumping HF tokenizer {key}: {e}"
logger.exception(msg)
@@ -202,29 +225,25 @@ def load( # noqa: C901, PLR0912, PLR0915
msg = f"Error loading Pydantic model from {model_dir}: {e}"
logger.exception(msg)
continue
elif child.name == Dumper.ptuning_models:
for model_dir in child.iterdir():
try:
model = AutoModelForSequenceClassification.from_pretrained(model_dir / "base_model")
hf_models[model_dir.name] = PeftModel.from_pretrained(model, model_dir / "peft")
except Exception as e: # noqa: PERF203
msg = f"Error loading PeftModel {model_dir.name}: {e}"
logger.exception(msg)
elif child.name == Dumper.hf_models:
for model_dir in child.iterdir():
try:
with (model_dir / "class_info.json").open("r") as f:
class_info = json.load(f)

module = __import__(class_info["module"], fromlist=[class_info["name"]])
model_class = getattr(module, class_info["name"])

hf_models[model_dir.name] = model_class.from_pretrained(model_dir)
hf_models[model_dir.name] = AutoModelForSequenceClassification.from_pretrained(model_dir)
except Exception as e: # noqa: PERF203
msg = f"Error loading HF model {model_dir.name}: {e}"
logger.exception(msg)
elif child.name == Dumper.hf_tokenizers:
for tokenizer_dir in child.iterdir():
try:
with (tokenizer_dir / "class_info.json").open("r") as f:
class_info = json.load(f)

module = __import__(class_info["module"], fromlist=[class_info["name"]])
tokenizer_class = getattr(module, class_info["name"])

hf_tokenizers[tokenizer_dir.name] = tokenizer_class.from_pretrained(tokenizer_dir)
hf_tokenizers[tokenizer_dir.name] = AutoTokenizer.from_pretrained(tokenizer_dir)
except Exception as e: # noqa: PERF203
msg = f"Error loading HF tokenizer {tokenizer_dir.name}: {e}"
logger.exception(msg)
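The two save strategies above can be exercised outside of Dumper as well: prompt-learning models keep the prompt encoder and the backbone side by side, while LoRA-style adapters are merged into an ordinary Hugging Face checkpoint. A minimal round-trip sketch assuming a PeftModel that wraps a sequence-classification backbone (paths are illustrative, and the private _is_prompt_learning flag may change between peft releases):

from pathlib import Path

from peft import PeftModel
from transformers import AutoModelForSequenceClassification


def save_peft_classifier(model: PeftModel, target: Path) -> None:
    if model._is_prompt_learning:
        # prompt learning: store the adapters and the backbone separately
        model.save_pretrained(str(target / "peft"))
        model.base_model.save_pretrained(target / "base_model")
    else:
        # LoRA: fold the adapters into the backbone and save a plain HF model
        model.merge_and_unload().save_pretrained(target)


def load_prompt_learning_classifier(target: Path) -> PeftModel:
    # rebuild the backbone first, then re-attach the saved prompt encoder
    base = AutoModelForSequenceClassification.from_pretrained(target / "base_model")
    return PeftModel.from_pretrained(base, target / "peft")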
47 changes: 29 additions & 18 deletions autointent/context/_context.py
@@ -7,7 +7,7 @@

from autointent import Dataset
from autointent._callbacks import CallbackHandler, get_callbacks
from autointent.configs import CrossEncoderConfig, DataConfig, EmbedderConfig, LoggingConfig
from autointent.configs import CrossEncoderConfig, DataConfig, EmbedderConfig, HFModelConfig, LoggingConfig

from .data_handler import DataHandler
from .optimization_info import OptimizationInfo
@@ -49,7 +49,7 @@ def configure_logging(self, config: LoggingConfig) -> None:
self.callback_handler = get_callbacks(config.report_to)
self.optimization_info = OptimizationInfo()

def configure_transformer(self, config: EmbedderConfig | CrossEncoderConfig) -> None:
def configure_transformer(self, config: EmbedderConfig | CrossEncoderConfig | HFModelConfig) -> None:
"""Configure the vector index client and embedder.

Args:
@@ -59,6 +59,8 @@ def configure_transformer(self, config: EmbedderConfig | CrossEncoderConfig) ->
self.embedder_config = config
elif isinstance(config, CrossEncoderConfig):
self.cross_encoder_config = config
elif isinstance(config, HFModelConfig):
self.transformer_config = config

def set_dataset(self, dataset: Dataset, config: DataConfig) -> None:
"""Set the datasets for training, validation and testing.
@@ -133,31 +135,40 @@ def has_saved_modules(self) -> bool:
def resolve_embedder(self) -> EmbedderConfig:
"""Resolve the embedder configuration.

Returns the best embedder configuration or default configuration.

Raises:
RuntimeError: If embedder configuration cannot be resolved.
This method returns the configuration with the following priorities:
- the best embedder configuration obtained during embedding node optimization
- default configuration preset by user with :py:meth:`Context.configure_transformer`
- default configuration preset by AutoIntent in :py:class:`autointent.configs.EmbedderConfig`
"""
try:
return self.optimization_info.get_best_embedder()
except ValueError as e:
except ValueError:
if hasattr(self, "embedder_config"):
return self.embedder_config
msg = (
"Embedder could't be resolved. Either include embedding node into the "
"search space or set default config with Context.configure_transformer."
)
raise RuntimeError(msg) from e
return EmbedderConfig()

def resolve_ranker(self) -> CrossEncoderConfig:
"""Resolve the cross-encoder configuration.

Returns default config if set.

Raises:
RuntimeError: If cross-encoder configuration cannot be resolved.
This method returns the configuration with the following priorities:
- default configuration preset by user with :py:meth:`Context.configure_transformer`
- default configuration preset by AutoIntent in :py:class:`autointent.configs.CrossEncoderConfig`
"""
if hasattr(self, "cross_encoder_config"):
return self.cross_encoder_config
msg = "Cross-encoder could't be resolved. Set default config with Context.configure_transformer."
raise RuntimeError(msg)
return CrossEncoderConfig()

def resolve_transformer(self) -> HFModelConfig:
"""Resolve the transformer configuration.

This method returns the configuration with the following priorities:
- the best transformer configuration obtained during embedding node optimization
- default configuration preset by user with :py:meth:`Context.configure_transformer`
- default configuration preset by AutoIntent in :py:class:`autointent.configs.HFModelConfig`
"""
try:
return self.optimization_info.get_best_embedder()
except ValueError:
if hasattr(self, "transformer_config"):
return self.transformer_config
return HFModelConfig()
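Taken together, configure_transformer and the resolve_* helpers now fall back to library defaults instead of raising. A short usage sketch, assuming context is an already-constructed autointent Context:

from autointent.configs import HFModelConfig

context.configure_transformer(HFModelConfig())  # user-level default for transformer-based modules

# Resolution order inside resolve_transformer():
# 1. best configuration found during embedding node optimization, if available;
# 2. otherwise the config registered via configure_transformer above;
# 3. otherwise a fresh HFModelConfig() with AutoIntent's built-in defaults.
config = context.resolve_transformer()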
13 changes: 10 additions & 3 deletions autointent/modules/base/_base.py
@@ -138,9 +138,16 @@ def from_context(cls, context: Context, **kwargs: dict[str, Any]) -> "BaseModule
Initialized module
"""

def get_embedder_config(self) -> dict[str, Any] | None:
"""Get the config of the embedder."""
return None
@abstractmethod
def get_implicit_initialization_params(self) -> dict[str, Any]:
"""Return default params used in ``__init__`` method.

Some parameters of the module may be inferred using context rather from ``__init__`` method.
But they need to be logged for reproducibility during loading from disk.

Returns:
Dictionary of default params
"""

@staticmethod
def score_metrics_ho(params: tuple[Any, Any], metrics_dict: dict[str, Any]) -> dict[str, float]:
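A concrete module typically echoes whatever configuration from_context filled in on its behalf, mirroring the knn.py change later in this diff. A hypothetical override (class and attribute names are illustrative, not part of the PR):

from typing import Any


class MyScorer(BaseScorer):
    def get_implicit_initialization_params(self) -> dict[str, Any]:
        # log the context-resolved config so loading from disk can rebuild the module
        return {"embedder_config": self.embedder_config.model_dump()}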
3 changes: 3 additions & 0 deletions autointent/modules/base/_decision.py
@@ -18,6 +18,9 @@
class BaseDecision(BaseModule, ABC):
"""Base class for decision modules."""

def get_implicit_initialization_params(self) -> dict[str, Any]:
return {}

@abstractmethod
def fit(
self,
4 changes: 4 additions & 0 deletions autointent/modules/base/_embedding.py
@@ -1,6 +1,7 @@
"""Base class for embedding modules."""

from abc import ABC
from typing import Any

from autointent import Context
from autointent.custom_types import ListOfLabels
@@ -10,6 +11,9 @@
class BaseEmbedding(BaseModule, ABC):
"""Base class for embedding modules."""

def get_implicit_initialization_params(self) -> dict[str, Any]:
return {}

def get_train_data(self, context: Context) -> tuple[list[str], ListOfLabels]:
"""Get train data.

4 changes: 4 additions & 0 deletions autointent/modules/base/_regex.py
@@ -1,9 +1,13 @@
"""Base class for embedding modules."""

from abc import ABC
from typing import Any

from autointent.modules.base import BaseModule


class BaseRegex(BaseModule, ABC):
"""Base class for rule-based modules."""

def get_implicit_initialization_params(self) -> dict[str, Any]:
return {}
18 changes: 9 additions & 9 deletions autointent/modules/scoring/_bert.py
@@ -26,8 +26,8 @@ class BertScorer(BaseScorer):
name = "bert"
supports_multiclass = True
supports_multilabel = True
_model: Any
_tokenizer: Any
_model: Any # transformers AutoModel factory returns Any
_tokenizer: Any # transformers AutoTokenizer factory returns Any

def __init__(
self,
@@ -56,7 +56,7 @@ def from_context(
seed: int = 0,
) -> "BertScorer":
if classification_model_config is None:
classification_model_config = context.resolve_embedder()
classification_model_config = context.resolve_transformer()

report_to = context.logging_config.report_to

@@ -69,14 +69,14 @@
report_to=report_to,
)

def get_embedder_config(self) -> dict[str, Any]:
return self.classification_model_config.model_dump()
def get_implicit_initialization_params(self) -> dict[str, Any]:
return {"classification_model_config": self.classification_model_config.model_dump()}

def __initialize_model(self) -> None:
def _initialize_model(self) -> Any: # noqa: ANN401
label2id = {i: i for i in range(self._n_classes)}
id2label = {i: i for i in range(self._n_classes)}

self._model = AutoModelForSequenceClassification.from_pretrained(
return AutoModelForSequenceClassification.from_pretrained(
self.classification_model_config.model_name,
trust_remote_code=self.classification_model_config.trust_remote_code,
num_labels=self._n_classes,
@@ -96,7 +96,7 @@ def fit(

self._tokenizer = AutoTokenizer.from_pretrained(self.classification_model_config.model_name)

self.__initialize_model()
self._model = self._initialize_model()

use_cpu = self.classification_model_config.device == "cpu"

@@ -126,7 +126,7 @@ def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]:
save_strategy="no",
logging_strategy="steps",
logging_steps=10,
report_to=self.report_to,
report_to=self.report_to if self.report_to is not None else "none",
use_cpu=use_cpu,
)

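The report_to guard above avoids passing None through to TrainingArguments, where (depending on the transformers version) None has meant "report to all installed integrations"; the explicit "none" string disables integration logging. A minimal sketch under that assumption:

from transformers import TrainingArguments

# "none" turns off wandb/tensorboard/etc.; None may silently enable every installed integration
args = TrainingArguments(output_dir="trainer_output", report_to="none")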
26 changes: 9 additions & 17 deletions autointent/modules/scoring/_description/description.py
@@ -76,9 +76,9 @@ def from_context(
Returns:
Initialized DescriptionScorer instance
"""
if embedder_config is None:
if embedder_config is None and encoder_type == "bi":
embedder_config = context.resolve_embedder()
if cross_encoder_config is None:
if cross_encoder_config is None and encoder_type == "cross":
cross_encoder_config = context.resolve_ranker()

return cls(
@@ -88,21 +88,13 @@
encoder_type=encoder_type,
)

def get_embedder_config(self) -> dict[str, Any]:
"""Get the configuration of the embedder.

Returns:
Embedder configuration
"""
return self.embedder_config.model_dump()

def get_cross_encoder_config(self) -> dict[str, Any]:
"""Get the configuration of the cross-encoder.

Returns:
Cross-encoder configuration
"""
return self.cross_encoder_config.model_dump()
def get_implicit_initialization_params(self) -> dict[str, Any]:
res = {}
if self._encoder_type == "bi":
res["embedder_config"] = self.embedder_config.model_dump()
else:
res["cross_encoder_config"] = self.cross_encoder_config.model_dump()
return res

def fit(
self,
6 changes: 6 additions & 0 deletions autointent/modules/scoring/_dnnc/dnnc.py
@@ -101,6 +101,12 @@ def from_context(
cross_encoder_config=cross_encoder_config,
)

def get_implicit_initialization_params(self) -> dict[str, Any]:
return {
"embedder_config": self.embedder_config.model_dump(),
"cross_encoder_config": self.cross_encoder_config.model_dump(),
}

def fit(self, utterances: list[str], labels: ListOfLabels) -> None:
"""Fit the scorer by training or loading the vector index.

9 changes: 2 additions & 7 deletions autointent/modules/scoring/_knn/knn.py
@@ -97,13 +97,8 @@ def from_context(
weights=weights,
)

def get_embedder_config(self) -> dict[str, Any]:
"""Get the name of the embedder.

Returns:
Embedder name
"""
return self.embedder_config.model_dump()
def get_implicit_initialization_params(self) -> dict[str, Any]:
return {"embedder_config": self.embedder_config.model_dump()}

def fit(self, utterances: list[str], labels: ListOfLabels, clear_cache: bool = False) -> None:
"""Fit the scorer by training or loading the vector index.