Commit 5637fb8

added main code for saving models

1 parent 29de65d

2 files changed: +184 / -99 lines


autointent/_dump_tools.py

Lines changed: 165 additions & 80 deletions
@@ -1,6 +1,7 @@
 import inspect
 import json
 import logging
+import types
 from pathlib import Path
 from types import UnionType
 from typing import Any, TypeAlias, Union, get_args, get_origin
@@ -10,6 +11,13 @@
 import numpy.typing as npt
 from pydantic import BaseModel
 from sklearn.base import BaseEstimator
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerFast,
+)

 from autointent import Embedder, Ranker, VectorIndex
 from autointent.configs import CrossEncoderConfig, EmbedderConfig
@@ -18,7 +26,17 @@
 ModuleSimpleAttributes = None | str | int | float | bool | list  # type: ignore[type-arg]

 ModuleAttributes: TypeAlias = (
-    ModuleSimpleAttributes | TagsList | np.ndarray | Embedder | VectorIndex | BaseEstimator | Ranker  # type: ignore[type-arg]
+    ModuleSimpleAttributes
+    | TagsList
+    | np.ndarray
+    | Embedder
+    | VectorIndex
+    | BaseEstimator
+    | Ranker
+    | BaseModel
+    | PreTrainedModel
+    | PreTrainedTokenizer
+    | PreTrainedTokenizerFast
 )

 logger = logging.getLogger(__name__)
@@ -33,6 +51,8 @@ class Dumper:
     estimators = "estimators"
     cross_encoders = "cross_encoders"
     pydantic_models: str = "pydantic"
+    hf_models = "hf_models"
+    hf_tokenizers = "hf_tokenizers"

     @staticmethod
     def make_subdirectories(path: Path) -> None:
@@ -48,12 +68,14 @@ def make_subdirectories(path: Path) -> None:
             path / Dumper.estimators,
             path / Dumper.cross_encoders,
             path / Dumper.pydantic_models,
+            path / Dumper.hf_models,
+            path / Dumper.hf_tokenizers,
         ]
         for subdir in subdirectories:
             subdir.mkdir(parents=True, exist_ok=True)

     @staticmethod
-    def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901
+    def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901, PLR0912
         """Dump modules attributes to filestystem.

         Args:
@@ -67,7 +89,26 @@ def dump(obj: Any, path: Path) -> None: # noqa: ANN401, C901
         Dumper.make_subdirectories(path)

         for key, val in attrs.items():
-            if isinstance(val, TagsList):
+            if isinstance(val, PreTrainedModel):
+                try:
+                    model_path = path / Dumper.hf_models / key
+                    val.save_pretrained(model_path)
+                except Exception:
+                    logger.exception("Error dumping Hugging Face model %s", key)
+            elif isinstance(val, PreTrainedTokenizer | PreTrainedTokenizerFast):
+                try:
+                    tokenizer_path = path / Dumper.hf_tokenizers / key
+                    val.save_pretrained(tokenizer_path)
+                except Exception:
+                    logger.exception("Error dumping Hugging Face tokenizer %s", key)
+            elif isinstance(val, BaseModel):
+                try:
+                    pydantic_path = path / Dumper.pydantic_models / f"{key}.json"
+                    with pydantic_path.open("w", encoding="utf-8") as file:
+                        json.dump(val.model_dump(), file, ensure_ascii=False, indent=4)
+                except Exception:
+                    logger.exception("Error dumping pydantic model %s", key)
+            elif isinstance(val, TagsList):
                 val.dump(path / Dumper.tags / key)
             elif isinstance(val, ModuleSimpleAttributes):
                 simple_attrs[key] = val
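The new branches in dump() persist Hugging Face objects via save_pretrained and serialize any pydantic BaseModel to JSON. Below is a rough, self-contained sketch of the round-trip these branches rely on; the checkpoint name, directory layout, and ExampleConfig class are illustrative assumptions, not part of this commit:

from pathlib import Path

from pydantic import BaseModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer


class ExampleConfig(BaseModel):  # hypothetical config, stands in for EmbedderConfig and friends
    model_name: str
    batch_size: int = 32


dump_root = Path("dump_example")
for sub in ("hf_models/_model", "hf_tokenizers/_tokenizer", "pydantic"):
    (dump_root / sub).mkdir(parents=True, exist_ok=True)

# Hugging Face models and tokenizers serialize to a directory of config and weight files.
checkpoint = "prajjwal1/bert-tiny"  # arbitrary small public checkpoint, only for illustration
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model.save_pretrained(dump_root / "hf_models" / "_model")
tokenizer.save_pretrained(dump_root / "hf_tokenizers" / "_tokenizer")

# Pydantic models round-trip through plain JSON via model_dump() / model_dump_json().
config = ExampleConfig(model_name=checkpoint)
(dump_root / "pydantic" / "config.json").write_text(config.model_dump_json(indent=4), encoding="utf-8")

# Loading mirrors the dump: from_pretrained on the directories, JSON content back into the config class.
restored_model = AutoModelForSequenceClassification.from_pretrained(dump_root / "hf_models" / "_model")
restored_tokenizer = AutoTokenizer.from_pretrained(dump_root / "hf_tokenizers" / "_tokenizer")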
@@ -78,25 +119,23 @@ def dump(obj: Any, path: Path) -> None: # noqa: ANN401, C901
             elif isinstance(val, VectorIndex):
                 val.dump(path / Dumper.indexes / key)
             elif isinstance(val, BaseEstimator):
-                joblib.dump(val, path / Dumper.estimators / key)
+                try:
+                    joblib.dump(val, path / Dumper.estimators / f"{key}.joblib")
+                except Exception:
+                    logger.exception("Error dumping BaseEstimator %s", key)
             elif isinstance(val, Ranker):
                 val.save(str(path / Dumper.cross_encoders / key))
-            elif isinstance(val, CrossEncoderConfig | EmbedderConfig):
-                try:
-                    pydantic_path = path / Dumper.pydantic_models / f"{key}.json"
-                    with pydantic_path.open("w", encoding="utf-8") as file:
-                        json.dump(val.model_dump(), file, ensure_ascii=False, indent=4)
-                except Exception as e:
-                    msg = f"Error dumping pydantic model {key}: {e}"
-                    logging.exception(msg)
-            else:
-                msg = f"Attribute {key} of type {type(val)} cannot be dumped to file system."
-                logger.error(msg)
-
-        with (path / Dumper.simple_attrs).open("w") as file:
+            elif not isinstance(val, type | types.ModuleType | types.FunctionType | types.MethodType):
+                logger.warning("Attribute '%s' of type %s cannot be dumped and will be skipped.", key, type(val))
+
+        with (path / Dumper.simple_attrs).open("w", encoding="utf-8") as file:
             json.dump(simple_attrs, file, ensure_ascii=False, indent=4)

-        np.savez(path / Dumper.arrays, allow_pickle=False, **arrays)
+        if arrays:
+            try:
+                np.savez(path / Dumper.arrays, allow_pickle=False, **arrays)
+            except Exception:
+                logger.exception("Error saving numpy arrays to %s", path / Dumper.arrays)

     @staticmethod
     def load(  # noqa: PLR0912, C901, PLR0915
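Estimators now get an explicit .joblib suffix and the arrays archive is only written when there is something to save. A small sketch of the joblib / np.savez persistence pattern assumed here (paths and the toy estimator are illustrative):

from pathlib import Path

import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression

root = Path("dump_example")
root.mkdir(exist_ok=True)

# One estimator per *.joblib file, so the loader can filter on the suffix.
clf = LogisticRegression().fit([[0.0], [1.0]], [0, 1])
joblib.dump(clf, root / "clf.joblib")
restored = joblib.load(root / "clf.joblib")

# Arrays are bundled into a single .npz archive, skipped entirely when empty.
arrays = {"centroids": np.zeros((2, 3))}
if arrays:
    np.savez(root / "arrays", allow_pickle=False, **arrays)  # np.savez appends the .npz suffix
loaded = dict(np.load(root / "arrays.npz", allow_pickle=False))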
@@ -114,69 +153,115 @@ def load( # noqa: PLR0912, C901, PLR0915
         estimators: dict[str, Any] = {}
         cross_encoders: dict[str, Any] = {}
         pydantic_models: dict[str, Any] = {}
+        hf_models: dict[str, Any] = {}
+        hf_tokenizers: dict[str, Any] = {}

         for child in path.iterdir():
-            if child.name == Dumper.tags:
-                tags = {tags_dump.name: TagsList.load(tags_dump) for tags_dump in child.iterdir()}
-            elif child.name == Dumper.simple_attrs:
-                with child.open() as file:
-                    simple_attrs = json.load(file)
-            elif child.name == Dumper.arrays:
-                arrays = dict(np.load(child))
-            elif child.name == Dumper.embedders:
-                embedders = {
-                    embedder_dump.name: Embedder.load(embedder_dump, override_config=embedder_config)
-                    for embedder_dump in child.iterdir()
-                }
-            elif child.name == Dumper.indexes:
-                indexes = {index_dump.name: VectorIndex.load(index_dump) for index_dump in child.iterdir()}
-            elif child.name == Dumper.estimators:
-                estimators = {estimator_dump.name: joblib.load(estimator_dump) for estimator_dump in child.iterdir()}
-            elif child.name == Dumper.cross_encoders:
-                cross_encoders = {
-                    cross_encoder_dump.name: Ranker.load(cross_encoder_dump, override_config=cross_encoder_config)
-                    for cross_encoder_dump in child.iterdir()
-                }
-            elif child.name == Dumper.pydantic_models:
-                for model_file in child.iterdir():
-                    with model_file.open("r", encoding="utf-8") as file:
-                        content = json.load(file)
-                    variable_name = model_file.stem
-
-                    # First try to get the type annotation from the class annotations.
-                    model_type = obj.__class__.__annotations__.get(variable_name)
-
-                    # Fallback: inspect __init__ signature if not found in class-level annotations.
-                    if model_type is None:
-                        sig = inspect.signature(obj.__init__)
-                        if variable_name in sig.parameters:
-                            model_type = sig.parameters[variable_name].annotation
-
-                    if model_type is None:
-                        msg = f"No type annotation found for {variable_name}"
-                        logger.error(msg)
-                        continue
-
-                    # If the annotation is a Union, extract the pydantic model type.
-                    if get_origin(model_type) in (UnionType, Union):
-                        for arg in get_args(model_type):
-                            if isinstance(arg, type) and issubclass(arg, BaseModel):
-                                model_type = arg
-                                break
-                        else:
-                            msg = f"No pydantic type found in Union for {variable_name}"
-                            logger.error(msg)
-                            continue
-
-                    if not (isinstance(model_type, type) and issubclass(model_type, BaseModel)):
-                        msg = f"Type for {variable_name} is not a pydantic model: {model_type}"
-                        logger.error(msg)
-                        continue
-
-                    pydantic_models[variable_name] = model_type(**content)
-            else:
-                msg = f"Found unexpected child {child}"
-                logger.error(msg)
+            if child.is_file():
+                if child.name == Dumper.simple_attrs:
+                    try:
+                        with child.open(encoding="utf-8") as file:
+                            simple_attrs = json.load(file)
+                    except Exception:
+                        logger.exception("Error loading simple attributes from %s", child)
+                elif child.name == Dumper.arrays:
+                    try:
+                        arrays = dict(np.load(child, allow_pickle=False))
+                    except Exception as e:  # noqa: BLE001
+                        logger.warning("Could not load numpy arrays from %s: %s", child, e)
+
+            elif child.is_dir():
+                if child.name == Dumper.hf_models:
+                    for model_dir in child.iterdir():
+                        if model_dir.is_dir():
+                            attr_name = model_dir.name
+                            try:
+                                hf_models[attr_name] = AutoModelForSequenceClassification.from_pretrained(model_dir)
+                            except Exception:
+                                logger.exception("Error loading Hugging Face model '%s' from %s", attr_name, model_dir)
+                elif child.name == Dumper.hf_tokenizers:
+                    for tokenizer_dir in child.iterdir():
+                        if tokenizer_dir.is_dir():
+                            attr_name = tokenizer_dir.name
+                            try:
+                                hf_tokenizers[attr_name] = AutoTokenizer.from_pretrained(tokenizer_dir)
+                            except Exception:
+                                logger.exception(
+                                    "Error loading Hugging Face tokenizer '%s' from %s", attr_name, tokenizer_dir
+                                )
+                elif child.name == Dumper.pydantic_models:
+                    for model_file in child.iterdir():
+                        if model_file.is_file() and model_file.suffix == ".json":
+                            variable_name = model_file.stem
+                            try:
+                                with model_file.open("r", encoding="utf-8") as file:
+                                    content = json.load(file)
+
+                                model_type = obj.__class__.__annotations__.get(variable_name)
+
+                                if model_type is None:
+                                    sig = inspect.signature(obj.__init__)
+                                    if variable_name in sig.parameters:
+                                        model_type = sig.parameters[variable_name].annotation
+
+                                if model_type is None:
+                                    logger.error("No type annotation found for pydantic model %s", variable_name)
+                                    continue
+
+                                potential_types = []
+                                if get_origin(model_type) in (UnionType, Union):
+                                    potential_types.extend(get_args(model_type))
+                                else:
+                                    potential_types.append(model_type)
+
+                                pydantic_type = None
+                                for p_type in potential_types:
+                                    if inspect.isclass(p_type) and issubclass(p_type, BaseModel):
+                                        pydantic_type = p_type
+                                        break
+
+                                if pydantic_type is None:
+                                    logger.error("No pydantic type found in annotation for %s", variable_name)
+                                    continue
+
+                                pydantic_models[variable_name] = pydantic_type(**content)
+                            except Exception:
+                                logger.exception("Error loading pydantic model %s from %s", variable_name, model_file)
+
+                elif child.name == Dumper.tags:
+                    tags = {tags_dump.name: TagsList.load(tags_dump) for tags_dump in child.iterdir()}
+                elif child.name == Dumper.embedders:
+                    embedders = {
+                        embedder_dump.name: Embedder.load(embedder_dump, override_config=embedder_config)
+                        for embedder_dump in child.iterdir()
+                    }
+                elif child.name == Dumper.indexes:
+                    indexes = {index_dump.name: VectorIndex.load(index_dump) for index_dump in child.iterdir()}
+                elif child.name == Dumper.estimators:
+                    estimators = {}
+                    for estimator_dump in child.iterdir():
+                        if estimator_dump.is_file() and estimator_dump.suffix == ".joblib":
+                            try:
+                                estimators[estimator_dump.stem] = joblib.load(estimator_dump)
+                            except Exception:
+                                logger.exception(
+                                    "Error loading estimator %s from %s", estimator_dump.stem, estimator_dump
+                                )
+                elif child.name == Dumper.cross_encoders:
+                    cross_encoders = {
+                        cross_encoder_dump.name: Ranker.load(cross_encoder_dump, override_config=cross_encoder_config)
+                        for cross_encoder_dump in child.iterdir()
+                    }
+
         obj.__dict__.update(
-            tags | simple_attrs | arrays | embedders | indexes | estimators | cross_encoders | pydantic_models
+            tags
+            | simple_attrs
+            | arrays
+            | embedders
+            | indexes
+            | estimators
+            | cross_encoders
+            | pydantic_models
+            | hf_models
+            | hf_tokenizers
         )
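The rewritten pydantic branch of load() recovers the target class from the attribute's type annotation, unwrapping Optional/Union annotations before instantiating. A minimal standalone sketch of that resolution step, with a hypothetical Holder class and ExampleConfig standing in for the real module and config types:

import inspect
from types import UnionType
from typing import Union, get_args, get_origin

from pydantic import BaseModel


class ExampleConfig(BaseModel):  # hypothetical, stands in for EmbedderConfig / CrossEncoderConfig
    model_name: str


class Holder:  # hypothetical module-like class with an annotated attribute
    config: ExampleConfig | None = None


annotation = Holder.__annotations__["config"]  # ExampleConfig | None

# Unwrap Optional/Union annotations, then pick the first pydantic class among the candidates.
if get_origin(annotation) in (UnionType, Union):
    candidates = list(get_args(annotation))
else:
    candidates = [annotation]
pydantic_type = next((t for t in candidates if inspect.isclass(t) and issubclass(t, BaseModel)), None)

assert pydantic_type is ExampleConfig
restored = pydantic_type(**{"model_name": "some-model"})  # content as it would come from the dumped JSON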

autointent/modules/scoring/_bert.py

Lines changed: 19 additions & 19 deletions
@@ -7,6 +7,7 @@
 import numpy.typing as npt
 import torch
 from datasets import Dataset
+from sklearn.preprocessing import LabelEncoder
 from transformers import (
     AutoModelForSequenceClassification,
     AutoTokenizer,
@@ -79,6 +80,10 @@ def fit(
     ) -> None:
         if hasattr(self, "_model"):
             self.clear_cache()
+        if not isinstance(labels[0], list) and isinstance(labels[0], str):
+            self._label_encoder = LabelEncoder()
+            encoded_labels = self._label_encoder.fit_transform(labels)
+            labels = encoded_labels.tolist()
         self._validate_task(labels)

         model_name = self.model_config.model_name
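The added guard in fit() maps plain string labels to integer ids before validation and training. Roughly what sklearn's LabelEncoder does here (the example labels are made up):

from sklearn.preprocessing import LabelEncoder

labels = ["greeting", "refund", "greeting", "goodbye"]  # made-up single-label string targets

if not isinstance(labels[0], list) and isinstance(labels[0], str):
    encoder = LabelEncoder()
    labels = encoder.fit_transform(labels).tolist()

print(labels)                  # [1, 2, 1, 0] -- classes are sorted alphabetically
print(list(encoder.classes_))  # ['goodbye', 'greeting', 'refund']
# encoder.inverse_transform([1]) maps back to 'greeting' when decoding predictions later.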
@@ -88,30 +93,20 @@ def fit(
         id2label = {i: i for i in range(self._n_classes)}

         self._model = AutoModelForSequenceClassification.from_pretrained(
-            model_name,
-            num_labels=self._n_classes,
-            label2id=label2id,
-            id2label=id2label,
-            problem_type="multi_label_classification" if self._multilabel else "single_label_classification",
+            model_name, num_labels=self._n_classes, label2id=label2id, id2label=id2label
         )

         use_cpu = self.model_config.device == "cpu"

-        def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]:
-            return self._tokenizer(  # type: ignore[no-any-return]
-                examples["text"], return_tensors="pt", **self.model_config.tokenizer_config.model_dump()
-            )
-
         dataset = Dataset.from_dict({"text": utterances, "labels": labels})

-        if self._multilabel:
-            # hugging face uses F.binary_cross_entropy_with_logits under the hood
-            # which requires target labels to be of float type
-            dataset = dataset.map(
-                lambda example: {"label": torch.tensor(example["labels"], dtype=torch.float)}, remove_columns="labels"
-            )
+        def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]:
+            tokenizer_options = self.model_config.tokenizer_config.model_dump()
+            tokenizer_options.pop("padding", None)
+            tokenizer_options.pop("truncation", None)
+            return self._tokenizer(examples["text"], truncation=True, padding=False, **tokenizer_options)

-        tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=self.batch_size)
+        tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

         with tempfile.TemporaryDirectory() as tmp_dir:
             training_args = TrainingArguments(
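Tokenization now truncates but no longer pads, leaving padding to the collator at batch time. A sketch of that dynamic-padding pattern (checkpoint and texts are illustrative; DataCollatorWithPadding pads each batch to its own longest sequence):

from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")  # arbitrary small checkpoint

texts = ["hi", "please cancel my subscription and refund the last charge"]
features = [tokenizer(t, truncation=True, padding=False) for t in texts]  # ragged lengths at this point

collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch = collator(features)          # pads only up to the longest sequence in this particular batch
print(batch["input_ids"].shape)     # e.g. torch.Size([2, 12])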
@@ -127,12 +122,14 @@ def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]:
                 use_cpu=use_cpu,
             )

+            data_collator = DataCollatorWithPadding(tokenizer=self._tokenizer)
+
             trainer = Trainer(
                 model=self._model,
                 args=training_args,
                 train_dataset=tokenized_dataset,
                 tokenizer=self._tokenizer,
-                data_collator=DataCollatorWithPadding(tokenizer=self._tokenizer),
+                data_collator=data_collator,
             )

             trainer.train()
@@ -146,9 +143,12 @@ def predict(self, utterances: list[str]) -> npt.NDArray[Any]:

         device = next(self._model.parameters()).device
         all_predictions = []
+        tokenizer_options = self.model_config.tokenizer_config.model_dump()
+        tokenizer_options.pop("padding", None)
+        tokenizer_options.pop("truncation", None)
         for i in range(0, len(utterances), self.batch_size):
             batch = utterances[i : i + self.batch_size]
-            inputs = self._tokenizer(batch, return_tensors="pt", **self.model_config.tokenizer_config.model_dump())
+            inputs = self._tokenizer(batch, return_tensors="pt", padding=True, truncation=True, **tokenizer_options)
             inputs = {k: v.to(device) for k, v in inputs.items()}
             with torch.no_grad():
                 outputs = self._model(**inputs)
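At predict time, padding and truncation are forced explicitly, so any conflicting keys from the stored tokenizer config are dropped first. A self-contained sketch of this batched, no-grad scoring loop under assumed names (checkpoint, texts, and batch size are illustrative):

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "prajjwal1/bert-tiny"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

tokenizer_options = {"max_length": 64}  # stands in for tokenizer_config.model_dump() minus padding/truncation
utterances = ["hello there", "what is my account balance", "cancel my card"]
batch_size = 2

device = next(model.parameters()).device
all_scores = []
for i in range(0, len(utterances), batch_size):
    batch = utterances[i : i + batch_size]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, **tokenizer_options)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():  # pure inference, no gradients needed
        logits = model(**inputs).logits
    all_scores.append(torch.softmax(logits, dim=-1).cpu())
scores = torch.cat(all_scores)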
