Commit 2c6ace8

Authored by SeBorgey, github-actions[bot], voorhs, Darinochka, and Samoed
dumper saving (#180)
* added main code for saving models
* Update optimizer_config.schema.json
* checker fixes
* Revert "checker fixes" (this reverts commit 6e32eb9)
* Revert "added main code for saving models" (this reverts commit 5637fb8)
* drat main code for new dumper
* ruf fix
* comments
* added code for test dumper
* Check dumper (#182)
  * Feat/code carbon each node (#175)
    * feat: update codecarbon
    * feat: update codecarbon
    * feat: added codecarbon
    * Update optimizer_config.schema.json
    * fix: fixed import mypy
    * fix: codecarbon package
    * fix: only float\integer log
    * fix: codecarbon package
    * fix: mypy
    * fix: test
    * fix: delete emissions
    * fix: test
    * Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
  * standartize pyproject & speedup tests (#176)
    * speedup tests
    * fix pyproject
    * Update optimizer_config.schema.json
    * move optional dependencies
    * fixes
    * add xdist
    * fix ci
    * download data from hub in doc
    * add caching
    * add doc cache
    * Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
    * Co-authored-by: voorhs <[email protected]>
  * add proper `omit` definition for tests coverage report (#179)
    * add proper `omit` definition
    * Update optimizer_config.schema.json
    * exclude tmp from coverage report
    * Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
  * add node validators (#177)
    * add node validators
    * add comments
    * Update optimizer_config.schema.json
    * rename bert model
    * lint
    * fixes
    * fix test
    * Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
    * Co-authored-by: voorhs <[email protected]>
  * update makefile
  * update bert test
  * mypy workaround
  * attempt to fix windows permission error
  * workaround
  * Co-authored-by: Darinochka <[email protected]>
  * Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
  * Co-authored-by: Roman Solomatin <[email protected]>
* Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
* Co-authored-by: Алексеев Илья <[email protected]>
* Co-authored-by: Darinochka <[email protected]>
* Co-authored-by: Roman Solomatin <[email protected]>
1 parent: 86384cf

File tree (5 files changed, +123 −10 lines):

- Makefile
- autointent/_dump_tools.py
- autointent/context/data_handler/_stratification.py
- autointent/modules/scoring/_bert.py
- tests/modules/scoring/test_bert.py

Makefile
Lines changed: 2 additions & 2 deletions

@@ -3,7 +3,7 @@ poetry = poetry run
 
 .PHONY: install
 install:
-	poetry install --with dev,test,typing,docs
+	poetry install --extras "dev test typing docs"
 
 .PHONY: test
 test:
@@ -24,7 +24,7 @@ lint:
 
 .PHONY: sync
 sync:
-	poetry sync --with dev,test,typing,docs
+	poetry sync --extras "dev test typing docs"
 
 .PHONY: docs
 docs:

autointent/_dump_tools.py
Lines changed: 66 additions & 3 deletions

@@ -33,6 +33,8 @@ class Dumper:
     estimators = "estimators"
     cross_encoders = "cross_encoders"
     pydantic_models: str = "pydantic"
+    hf_models = "hf_models"
+    hf_tokenizers = "hf_tokenizers"
 
     @staticmethod
     def make_subdirectories(path: Path) -> None:
@@ -48,12 +50,14 @@ def make_subdirectories(path: Path) -> None:
             path / Dumper.estimators,
             path / Dumper.cross_encoders,
             path / Dumper.pydantic_models,
+            path / Dumper.hf_models,
+            path / Dumper.hf_tokenizers,
         ]
         for subdir in subdirectories:
             subdir.mkdir(parents=True, exist_ok=True)
 
     @staticmethod
-    def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901
+    def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901, PLR0912, PLR0915
         """Dump modules attributes to filestystem.
 
         Args:
@@ -89,6 +93,28 @@ def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901
                 except Exception as e:
                     msg = f"Error dumping pydantic model {key}: {e}"
                     logging.exception(msg)
+            elif (key == "_model" or "model" in key.lower()) and hasattr(val, "save_pretrained"):
+                model_path = path / Dumper.hf_models / key
+                model_path.mkdir(parents=True, exist_ok=True)
+                try:
+                    val.save_pretrained(model_path)
+                    class_info = {"module": val.__class__.__module__, "name": val.__class__.__name__}
+                    with (model_path / "class_info.json").open("w") as f:
+                        json.dump(class_info, f)
+                except Exception as e:
+                    msg = f"Error dumping HF model {key}: {e}"
+                    logger.exception(msg)
+            elif (key == "_tokenizer" or "tokenizer" in key.lower()) and hasattr(val, "save_pretrained"):
+                tokenizer_path = path / Dumper.hf_tokenizers / key
+                tokenizer_path.mkdir(parents=True, exist_ok=True)
+                try:
+                    val.save_pretrained(tokenizer_path)
+                    class_info = {"module": val.__class__.__module__, "name": val.__class__.__name__}
+                    with (tokenizer_path / "class_info.json").open("w") as f:
+                        json.dump(class_info, f)
+                except Exception as e:
+                    msg = f"Error dumping HF tokenizer {key}: {e}"
+                    logger.exception(msg)
             else:
                 msg = f"Attribute {key} of type {type(val)} cannot be dumped to file system."
                 logger.error(msg)
@@ -114,6 +140,8 @@ def load(  # noqa: PLR0912, C901, PLR0915
         estimators: dict[str, Any] = {}
         cross_encoders: dict[str, Any] = {}
         pydantic_models: dict[str, Any] = {}
+        hf_models: dict[str, Any] = {}
+        hf_tokenizers: dict[str, Any] = {}
 
         for child in path.iterdir():
             if child.name == Dumper.tags:
@@ -151,7 +179,6 @@ def load(  # noqa: PLR0912, C901, PLR0915
                     sig = inspect.signature(obj.__init__)
                     if variable_name in sig.parameters:
                         model_type = sig.parameters[variable_name].annotation
-
                         if model_type is None:
                             msg = f"No type annotation found for {variable_name}"
                             logger.error(msg)
@@ -174,9 +201,45 @@ def load(  # noqa: PLR0912, C901, PLR0915
                             continue
 
                         pydantic_models[variable_name] = model_type(**content)
+            elif child.name == Dumper.hf_models:
+                for model_dir in child.iterdir():
+                    try:
+                        with (model_dir / "class_info.json").open("r") as f:
+                            class_info = json.load(f)
+
+                        module = __import__(class_info["module"], fromlist=[class_info["name"]])
+                        model_class = getattr(module, class_info["name"])
+
+                        hf_models[model_dir.name] = model_class.from_pretrained(model_dir)
+                    except Exception as e:  # noqa: PERF203
+                        msg = f"Error loading HF model {model_dir.name}: {e}"
+                        logger.exception(msg)
+            elif child.name == Dumper.hf_tokenizers:
+                for tokenizer_dir in child.iterdir():
+                    try:
+                        with (tokenizer_dir / "class_info.json").open("r") as f:
+                            class_info = json.load(f)
+
+                        module = __import__(class_info["module"], fromlist=[class_info["name"]])
+                        tokenizer_class = getattr(module, class_info["name"])
+
+                        hf_tokenizers[tokenizer_dir.name] = tokenizer_class.from_pretrained(tokenizer_dir)
+                    except Exception as e:  # noqa: PERF203
+                        msg = f"Error loading HF tokenizer {tokenizer_dir.name}: {e}"
+                        logger.exception(msg)
             else:
                 msg = f"Found unexpected child {child}"
                 logger.error(msg)
+
         obj.__dict__.update(
-            tags | simple_attrs | arrays | embedders | indexes | estimators | cross_encoders | pydantic_models
+            tags
+            | simple_attrs
+            | arrays
+            | embedders
+            | indexes
+            | estimators
+            | cross_encoders
+            | pydantic_models
+            | hf_models
+            | hf_tokenizers
         )
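
In short, the diff makes Dumper treat any attribute whose name suggests a model or tokenizer and that exposes save_pretrained as a Hugging Face object: it is written under hf_models/ or hf_tokenizers/ together with a class_info.json recording the concrete class, which load later re-imports and hands to from_pretrained. Below is a minimal sketch of driving that path directly; the TinyModule stand-in class and the dump_example directory are purely illustrative, and the import path for Dumper is assumed from the file location shown above.

from pathlib import Path

from transformers import AutoModelForSequenceClassification, AutoTokenizer

from autointent._dump_tools import Dumper  # import path assumed from this diff's file location


class TinyModule:
    """Hypothetical stand-in for an autointent module holding HF objects as attributes."""

    def __init__(self) -> None:
        self._model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=2)
        self._tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")


dump_dir = Path("dump_example")
Dumper.make_subdirectories(dump_dir)
Dumper.dump(TinyModule(), dump_dir)

# Expected layout per the diff:
#   dump_example/hf_models/_model/          weights from save_pretrained plus class_info.json
#   dump_example/hf_tokenizers/_tokenizer/  tokenizer files plus class_info.json
# On load, class_info.json supplies {"module": ..., "name": ...}; the class is re-imported
# with __import__, resolved with getattr, and restored via from_pretrained on the directory.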

autointent/context/data_handler/_stratification.py
Lines changed: 3 additions & 2 deletions

@@ -12,7 +12,7 @@
 from numpy import typing as npt
 from sklearn.model_selection import train_test_split
 from skmultilearn.model_selection import IterativeStratification
-from transformers import set_seed
+from transformers import set_seed  # type: ignore[attr-defined]
 
 from autointent import Dataset
 from autointent.custom_types import LabelType
@@ -128,7 +128,8 @@ def _split_multilabel(self, dataset: HFDataset, test_size: float) -> Sequence[np
         Returns:
             A sequence containing indices for train and test splits.
         """
-        set_seed(self.random_seed)  # workaround for buggy nature of IterativeStratification from skmultilearn
+        if self.random_seed is not None:
+            set_seed(self.random_seed)  # workaround for buggy nature of IterativeStratification from skmultilearn
         splitter = IterativeStratification(
             n_splits=2,
             order=2,
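
For context, the added guard matters because the seed is optional: transformers.set_seed expects an integer, so calling it with a None seed would fail rather than mean "leave randomness unfixed". A minimal sketch of the guarded pattern in isolation (the seed value here is illustrative):

from typing import Optional

from transformers import set_seed

random_seed: Optional[int] = 42  # None means randomness is left unfixed

if random_seed is not None:
    # IterativeStratification from skmultilearn exposes no seed argument of its own,
    # so global seeding before the split is used as the workaround noted above.
    set_seed(random_seed)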

autointent/modules/scoring/_bert.py
Lines changed: 3 additions & 3 deletions

@@ -7,7 +7,7 @@
 import numpy.typing as npt
 import torch
 from datasets import Dataset
-from transformers import (
+from transformers import (  # type: ignore[attr-defined]
     AutoModelForSequenceClassification,
     AutoTokenizer,
     DataCollatorWithPadding,
@@ -127,15 +127,15 @@ def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]:
             use_cpu=use_cpu,
         )
 
-        trainer = Trainer(
+        trainer = Trainer(  # type: ignore[no-untyped-call]
             model=self._model,
             args=training_args,
             train_dataset=tokenized_dataset,
             tokenizer=self._tokenizer,
             data_collator=DataCollatorWithPadding(tokenizer=self._tokenizer),
         )
 
-        trainer.train()
+        trainer.train()  # type: ignore[attr-defined]
 
         self._model.eval()

tests/modules/scoring/test_bert.py
Lines changed: 49 additions & 0 deletions

@@ -1,10 +1,59 @@
+import shutil
+import tempfile
+from pathlib import Path
+
 import numpy as np
 import pytest
 
 from autointent.context.data_handler import DataHandler
 from autointent.modules import BertScorer
 
 
+def test_bert_scorer_dump_load(dataset):
+    """Test that BertScorer can be saved and loaded while preserving predictions."""
+    data_handler = DataHandler(dataset)
+
+    # Create and train scorer
+    scorer_original = BertScorer(classification_model_config="prajjwal1/bert-tiny", num_train_epochs=1, batch_size=8)
+    scorer_original.fit(data_handler.train_utterances(0), data_handler.train_labels(0))
+
+    # Test data
+    test_data = [
+        "why is there a hold on my account",
+        "why is my bank account frozen",
+    ]
+
+    # Get predictions before saving
+    predictions_before = scorer_original.predict(test_data)
+
+    # Create temp directory and save model
+    temp_dir_path = Path(tempfile.mkdtemp(prefix="bert_scorer_test_"))
+    try:
+        # Save the model
+        scorer_original.dump(str(temp_dir_path))
+
+        # Create a new scorer and load saved model
+        scorer_loaded = BertScorer(classification_model_config="prajjwal1/bert-tiny", num_train_epochs=1, batch_size=8)
+        scorer_loaded.load(str(temp_dir_path))
+
+        # Verify model and tokenizer are loaded
+        assert hasattr(scorer_loaded, "_model")
+        assert scorer_loaded._model is not None
+        assert hasattr(scorer_loaded, "_tokenizer")
+        assert scorer_loaded._tokenizer is not None
+
+        # Get predictions after loading
+        predictions_after = scorer_loaded.predict(test_data)
+
+        # Verify predictions match
+        assert predictions_before.shape == predictions_after.shape
+        np.testing.assert_allclose(predictions_before, predictions_after, atol=1e-6)
+
+    finally:
+        # Clean up
+        shutil.rmtree(temp_dir_path, ignore_errors=True)  # workaround for windows permission error
+
+
 def test_bert_prediction(dataset):
     """Test that the transformer model can fit and make predictions."""
     data_handler = DataHandler(dataset)
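
A note on the cleanup choice in the new test: the commit message mentions an attempt to fix a Windows permission error, and the test therefore uses tempfile.mkdtemp together with shutil.rmtree(..., ignore_errors=True) instead of relying on automatic temporary-directory cleanup. If the same pattern were needed in other tests, it could be factored into a fixture; the sketch below is hypothetical and not part of this diff.

import shutil
import tempfile
from collections.abc import Iterator
from pathlib import Path

import pytest


@pytest.fixture
def lenient_tmp_dir() -> Iterator[Path]:
    """Temporary directory whose cleanup tolerates files still held open on Windows."""
    path = Path(tempfile.mkdtemp(prefix="bert_scorer_test_"))
    try:
        yield path
    finally:
        # ignore_errors=True mirrors the workaround used in test_bert_scorer_dump_load above.
        shutil.rmtree(path, ignore_errors=True)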
