
Commit c6cab95: fix dump tools
1 parent b8fb273

7 files changed: +49 additions, -63 deletions

autointent/_dump_tools.py

Lines changed: 37 additions & 22 deletions
@@ -7,8 +7,16 @@
 import joblib
 import numpy as np
 import numpy.typing as npt
+from peft import PeftModel
 from pydantic import BaseModel
 from sklearn.base import BaseEstimator
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerFast,
+)
 
 from autointent import Embedder, Ranker, VectorIndex
 from autointent.configs import CrossEncoderConfig, EmbedderConfig
@@ -34,6 +42,7 @@ class Dumper:
     pydantic_models: str = "pydantic"
     hf_models = "hf_models"
     hf_tokenizers = "hf_tokenizers"
+    peft_models = "peft_models"
 
     @staticmethod
     def make_subdirectories(path: Path, exists_ok: bool = False) -> None:
@@ -52,6 +61,7 @@ def make_subdirectories(path: Path, exists_ok: bool = False) -> None:
             path / Dumper.pydantic_models,
             path / Dumper.hf_models,
             path / Dumper.hf_tokenizers,
+            path / Dumper.peft_models,
         ]
         for subdir in subdirectories:
             subdir.mkdir(parents=True, exist_ok=exists_ok)
@@ -101,25 +111,34 @@ def dump(obj: Any, path: Path, exists_ok: bool = False, exclude: list[type[Any]]
             except Exception as e:
                 msg = f"Error dumping pydantic model {key}: {e}"
                 logging.exception(msg)
-            elif (key == "_model" or "model" in key.lower()) and hasattr(val, "save_pretrained"):
+            elif isinstance(val, PeftModel):
+                try:
+                    if val._is_prompt_learning:  # noqa: SLF001
+                        model_path = path / Dumper.peft_models / key
+                        model_path.mkdir(parents=True, exist_ok=True)
+                        val.save_pretrained(model_path / "peft")  # save peft config and prompt encoder
+                        val.base_model.save_pretrained(model_path / "base_model")  # save bert classifier
+                    else:
+                        model_path = path / Dumper.hf_models / key
+                        model_path.mkdir(parents=True, exist_ok=True)
+                        merged_model: PreTrainedModel = val.merge_and_unload()
+                        merged_model.save_pretrained(model_path)
+                except Exception as e:
+                    msg = f"Error dumping PeftModel {key}: {e}"
+                    logger.exception(msg)
+            elif isinstance(val, PreTrainedModel):
                 model_path = path / Dumper.hf_models / key
                 model_path.mkdir(parents=True, exist_ok=True)
                 try:
                     val.save_pretrained(model_path)
-                    class_info = {"module": val.__class__.__module__, "name": val.__class__.__name__}
-                    with (model_path / "class_info.json").open("w") as f:
-                        json.dump(class_info, f)
                 except Exception as e:
                     msg = f"Error dumping HF model {key}: {e}"
                     logger.exception(msg)
-            elif (key == "_tokenizer" or "tokenizer" in key.lower()) and hasattr(val, "save_pretrained"):
+            elif isinstance(val, PreTrainedTokenizer | PreTrainedTokenizerFast):
                 tokenizer_path = path / Dumper.hf_tokenizers / key
                 tokenizer_path.mkdir(parents=True, exist_ok=True)
                 try:
                     val.save_pretrained(tokenizer_path)
-                    class_info = {"module": val.__class__.__module__, "name": val.__class__.__name__}
-                    with (tokenizer_path / "class_info.json").open("w") as f:
-                        json.dump(class_info, f)
                 except Exception as e:
                     msg = f"Error dumping HF tokenizer {key}: {e}"
                     logger.exception(msg)
@@ -202,29 +221,25 @@ def load(  # noqa: C901, PLR0912, PLR0915
                     msg = f"Error loading Pydantic model from {model_dir}: {e}"
                     logger.exception(msg)
                     continue
+            elif child.name == Dumper.peft_models:
+                for model_dir in child.iterdir():
+                    try:
+                        model = AutoModelForSequenceClassification.from_pretrained(model_dir / "base_model")
+                        hf_models[model_dir.name] = PeftModel.from_pretrained(model, model_dir / "peft")
+                    except Exception as e:  # noqa: PERF203
+                        msg = f"Error loading PeftModel {model_dir.name}: {e}"
+                        logger.exception(msg)
             elif child.name == Dumper.hf_models:
                 for model_dir in child.iterdir():
                     try:
-                        with (model_dir / "class_info.json").open("r") as f:
-                            class_info = json.load(f)
-
-                        module = __import__(class_info["module"], fromlist=[class_info["name"]])
-                        model_class = getattr(module, class_info["name"])
-
-                        hf_models[model_dir.name] = model_class.from_pretrained(model_dir)
+                        hf_models[model_dir.name] = AutoModelForSequenceClassification.from_pretrained(model_dir)
                     except Exception as e:  # noqa: PERF203
                         msg = f"Error loading HF model {model_dir.name}: {e}"
                         logger.exception(msg)
             elif child.name == Dumper.hf_tokenizers:
                 for tokenizer_dir in child.iterdir():
                     try:
-                        with (tokenizer_dir / "class_info.json").open("r") as f:
-                            class_info = json.load(f)
-
-                        module = __import__(class_info["module"], fromlist=[class_info["name"]])
-                        tokenizer_class = getattr(module, class_info["name"])
-
-                        hf_tokenizers[tokenizer_dir.name] = tokenizer_class.from_pretrained(tokenizer_dir)
+                        hf_tokenizers[tokenizer_dir.name] = AutoTokenizer.from_pretrained(tokenizer_dir)
                     except Exception as e:  # noqa: PERF203
                         msg = f"Error loading HF tokenizer {tokenizer_dir.name}: {e}"
                         logger.exception(msg)
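Taken together, the new branches form a simple round trip. The sketch below replays each path outside Dumper, using prajjwal1/bert-tiny from the test configs; the label count, dump paths, and the attribute name _model are illustrative assumptions, not repo code:

from peft import LoraConfig, PeftModel, PromptEncoderConfig, get_peft_model
from transformers import AutoModelForSequenceClassification

base = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=3)

# LoRA is not prompt learning: Dumper merges the adapter into the base
# weights and writes an ordinary HF checkpoint under hf_models/.
lora = get_peft_model(base, LoraConfig(task_type="SEQ_CLS"))
lora.merge_and_unload().save_pretrained("dump/hf_models/_model")
reloaded = AutoModelForSequenceClassification.from_pretrained("dump/hf_models/_model")

# A prompt-learning adapter (p-tuning) cannot be merged, so Dumper writes
# two artifacts under peft_models/ and load() reassembles them.
pt_base = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=3)
ptuned = get_peft_model(pt_base, PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=10))
ptuned.save_pretrained("dump/peft_models/_model/peft")  # prompt encoder + peft config
ptuned.base_model.save_pretrained("dump/peft_models/_model/base_model")  # plain classifier
restored = PeftModel.from_pretrained(
    AutoModelForSequenceClassification.from_pretrained("dump/peft_models/_model/base_model"),
    "dump/peft_models/_model/peft",
)

Loading now goes through the Auto classes, which is what made the class_info.json bookkeeping deletable: AutoModelForSequenceClassification and AutoTokenizer recover the concrete class from the checkpoint's own config and tokenizer files.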

autointent/modules/scoring/_bert.py

Lines changed: 5 additions & 2 deletions
@@ -11,6 +11,9 @@
     AutoModelForSequenceClassification,
     AutoTokenizer,
     DataCollatorWithPadding,
+    PreTrainedModel,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerFast,
     Trainer,
     TrainingArguments,
 )
@@ -26,8 +29,8 @@ class BertScorer(BaseScorer):
     name = "bert"
     supports_multiclass = True
     supports_multilabel = True
-    _model: Any
-    _tokenizer: Any
+    _model: PreTrainedModel
+    _tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast
 
     def __init__(
         self,
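Typing _model and _tokenizer concretely is what the new isinstance dispatch in Dumper.dump depends on; the old code guessed from attribute names ("model" in key.lower()) plus a hasattr check. A standalone sketch of the branch order (dump_category is a hypothetical helper, not repo code):

from peft import PeftModel
from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast


def dump_category(val: object) -> str:
    # Same order as Dumper.dump: PEFT adapters get special handling,
    # then plain HF models, then tokenizers (slow or fast).
    if isinstance(val, PeftModel):
        return "peft_models"
    if isinstance(val, PreTrainedModel):
        return "hf_models"
    if isinstance(val, PreTrainedTokenizer | PreTrainedTokenizerFast):
        return "hf_tokenizers"
    return "other"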

autointent/modules/scoring/_lora/lora.py

Lines changed: 3 additions & 13 deletions
@@ -3,7 +3,6 @@
 from typing import Any
 
 from peft import LoraConfig, get_peft_model
-from transformers import AutoModelForSequenceClassification
 
 from autointent import Context
 from autointent._callbacks import REPORTERS_NAMES
@@ -59,10 +58,6 @@ class BERTLoRAScorer(BertScorer):
     """
 
     name = "lora"
-    supports_multiclass = True
-    supports_multilabel = True
-    _model: Any
-    _tokenizer: Any
 
     def __init__(
         self,
@@ -72,7 +67,7 @@ def __init__(
         learning_rate: float = 5e-5,
         seed: int = 0,
         report_to: REPORTERS_NAMES | None = None,  # type: ignore[valid-type]
-        **lora_kwargs: dict[str, Any],
+        **lora_kwargs: Any,  # noqa: ANN401
     ) -> None:
         super().__init__(
             classification_model_config=classification_model_config,
@@ -93,7 +88,7 @@ def from_context(
         batch_size: int = 8,
         learning_rate: float = 5e-5,
         seed: int = 0,
-        **lora_kwargs: dict[str, Any],
+        **lora_kwargs: Any,  # noqa: ANN401
     ) -> "BERTLoRAScorer":
         if classification_model_config is None:
             classification_model_config = context.resolve_transformer()
@@ -108,10 +103,5 @@ def from_context(
         )
 
     def _initialize_model(self) -> None:
-        self._model = AutoModelForSequenceClassification.from_pretrained(
-            self.classification_model_config.model_name,
-            num_labels=self._n_classes,
-            problem_type="multi_label_classification" if self._multilabel else "single_label_classification",
-            trust_remote_code=self.classification_model_config.trust_remote_code,
-        )
+        super()._initialize_model()
         self._model = get_peft_model(self._model, self._lora_config)
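Two small fixes here deserve a note. The annotation **lora_kwargs: dict[str, Any] claimed each keyword value is a dict, while the values forwarded into LoraConfig are scalars, so Any (with the ANN401 suppression) is the accurate type. And _initialize_model now delegates the from_pretrained setup to BertScorer, only layering the adapter on top. A hedged usage sketch; the import path and the forwarding of lora_kwargs into LoraConfig are assumptions, and r / lora_alpha are standard LoraConfig fields:

from autointent.modules.scoring import BERTLoRAScorer  # import path assumed

scorer = BERTLoRAScorer(
    classification_model_config="prajjwal1/bert-tiny",
    num_train_epochs=1,
    batch_size=8,
    r=8,            # scalar values, hence **lora_kwargs: Any
    lora_alpha=16,
)
# _initialize_model(): super() builds the plain classifier, then
# self._model = get_peft_model(self._model, self._lora_config)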

autointent/modules/scoring/_ptuning/ptuning.py

Lines changed: 4 additions & 20 deletions
@@ -2,11 +2,7 @@
 
 from typing import Any
 
-import torch
 from peft import PromptEncoderConfig, get_peft_model
-from transformers import (
-    AutoModelForSequenceClassification,
-)
 
 from autointent import Context
 from autointent._callbacks import REPORTERS_NAMES
@@ -54,10 +50,6 @@ class PTuningScorer(BertScorer):
     """
 
     name = "ptuning"
-    supports_multiclass = True
-    supports_multilabel = True
-    _model: Any
-    _tokenizer: Any
 
     def __init__(
         self,
@@ -67,7 +59,7 @@ def __init__(
         learning_rate: float = 5e-5,
         seed: int = 0,
         report_to: REPORTERS_NAMES | None = None,  # type: ignore[valid-type]
-        **ptuning_kwargs: dict[str, Any],
+        **ptuning_kwargs: Any,  # noqa: ANN401
     ) -> None:
         super().__init__(
             classification_model_config=classification_model_config,
@@ -77,8 +69,7 @@ def __init__(
             seed=seed,
             report_to=report_to,
         )
-        self._ptuning_config = PromptEncoderConfig(**ptuning_kwargs)  # type: ignore[arg-type]
-        torch.manual_seed(seed)
+        self._ptuning_config = PromptEncoderConfig(task_type="SEQ_CLS", **ptuning_kwargs)  # type: ignore[arg-type]
 
     @classmethod
     def from_context(
@@ -89,7 +80,7 @@ def from_context(
         batch_size: int = 8,
         learning_rate: float = 5e-5,
         seed: int = 0,
-        **ptuning_kwargs: dict[str, Any],
+        **ptuning_kwargs: Any,  # noqa: ANN401
     ) -> "PTuningScorer":
         """Create a PTuningScorer instance using a Context object.
 
@@ -119,12 +110,5 @@ def from_context(
 
     def _initialize_model(self) -> None:
         """Initialize the model with P-tuning configuration."""
-        model_name = self.classification_model_config.model_name
-        self._model = AutoModelForSequenceClassification.from_pretrained(
-            model_name,
-            num_labels=self._n_classes,
-            problem_type="multi_label_classification" if self._multilabel else "single_label_classification",
-            trust_remote_code=self.classification_model_config.trust_remote_code,
-            return_dict=True,
-        )
+        super()._initialize_model()
         self._model = get_peft_model(self._model, self._ptuning_config)
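Since the scorer now pins task_type="SEQ_CLS" itself (the only sensible task type for a sequence classifier), callers must stop passing it: a caller-supplied task_type would now collide with the pinned keyword inside PromptEncoderConfig. That is the entire content of the YAML and test changes below. The torch.manual_seed(seed) call also leaves __init__, so seeding is no longer a side effect of constructing the scorer. A construction sketch mirroring the updated tests; the import path is assumed:

from autointent.modules.scoring import PTuningScorer  # import path assumed

scorer = PTuningScorer(
    classification_model_config="prajjwal1/bert-tiny",
    num_train_epochs=1,
    batch_size=8,
    num_virtual_tokens=10,  # forwarded via **ptuning_kwargs into PromptEncoderConfig
    seed=42,                # passing task_type here would now raise a TypeError
)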

tests/assets/configs/multiclass.yaml

Lines changed: 0 additions & 1 deletion
@@ -48,7 +48,6 @@
         classification_model_config: ["prajjwal1/bert-tiny"]
         num_train_epochs: [1]
         batch_size: [8, 16]
-        task_type: ["SEQ_CLS"]
         num_virtual_tokens: [10, 20]
   - node_type: decision
     target_metric: decision_accuracy

tests/assets/configs/multilabel.yaml

Lines changed: 0 additions & 1 deletion
@@ -36,7 +36,6 @@
         classification_model_config: ["prajjwal1/bert-tiny"]
         num_train_epochs: [1]
         batch_size: [8]
-        task_type: ["SEQ_CLS"]
         num_virtual_tokens: [10, 20]
       - module_name: lora
         classification_model_config:

tests/modules/scoring/test_ptuning.py

Lines changed: 0 additions & 4 deletions
@@ -17,7 +17,6 @@ def test_ptuning_scorer_dump_load(dataset):
         classification_model_config="prajjwal1/bert-tiny",
         num_train_epochs=1,
         batch_size=8,
-        task_type="SEQ_CLS",
         num_virtual_tokens=10,
         seed=42,
     )
@@ -38,7 +37,6 @@ def test_ptuning_scorer_dump_load(dataset):
         classification_model_config="prajjwal1/bert-tiny",
         num_train_epochs=1,
         batch_size=8,
-        task_type="SEQ_CLS",
         num_virtual_tokens=10,
         seed=42,
     )
@@ -66,7 +64,6 @@ def test_ptuning_prediction(dataset):
         classification_model_config="prajjwal1/bert-tiny",
         num_train_epochs=1,
         batch_size=8,
-        task_type="SEQ_CLS",
         num_virtual_tokens=10,
         seed=42,
     )
@@ -106,7 +103,6 @@ def test_ptuning_cache_clearing(dataset):
         classification_model_config="prajjwal1/bert-tiny",
         num_train_epochs=1,
         batch_size=8,
-        task_type="SEQ_CLS",
         num_virtual_tokens=20,
         seed=42,
     )
