 )
 
 from autointent import Context
-from autointent.configs import EmbedderConfig
+from autointent.configs import HFModelConfig
 from autointent.custom_types import ListOfLabels
 from autointent.modules.base import BaseScorer
 
 
-class TokenizerConfig:
-    """Configuration for tokenizer parameters."""
-
-    def __init__(
-        self,
-        max_length: int = 128,
-        padding: str = "max_length",
-        truncation: bool = True,
-    ) -> None:
-        self.max_length = max_length
-        self.padding = padding
-        self.truncation = truncation
-
-
 class BERTLoRAScorer(BaseScorer):
     name = "lora"
     supports_multiclass = True
@@ -46,40 +32,31 @@ class BERTLoRAScorer(BaseScorer):
 
     def __init__(
         self,
-        model_config: EmbedderConfig | str | dict[str, Any] | None = None,
+        model_config: HFModelConfig | str | dict[str, Any] | None = None,
         num_train_epochs: int = 3,
         batch_size: int = 8,
         learning_rate: float = 5e-5,
         seed: int = 0,
-        tokenizer_config: TokenizerConfig | None = None,
-        lora_rank: int = 16,
-        lora_alpha: int = 32,
-        lora_dropout: float = 0.1,
+        **lora_kwargs: Any,
     ) -> None:
-        self.model_config = EmbedderConfig.from_search_config(model_config)
+        self.model_config = HFModelConfig.from_search_config(model_config)
         self.num_train_epochs = num_train_epochs
         self.batch_size = batch_size
         self.learning_rate = learning_rate
         self.seed = seed
         self._multilabel = False
-        self.tokenizer_config = tokenizer_config or TokenizerConfig()
-        self.lora_rank = lora_rank
-        self.lora_alpha = lora_alpha
-        self.lora_dropout = lora_dropout
+        self._lora_config = LoraConfig(**lora_kwargs)
 
     @classmethod
     def from_context(
         cls,
         context: Context,
-        model_config: EmbedderConfig | str | dict[str, Any] | None = None,
+        model_config: HFModelConfig | str | dict[str, Any] | None = None,
         num_train_epochs: int = 10,
         batch_size: int = 8,
         learning_rate: float = 5e-5,
         seed: int = 0,
-        tokenizer_config: TokenizerConfig | None = None,
-        lora_rank: int = 8,
-        lora_alpha: int = 32,
-        lora_dropout: float = 0.1,
+        **lora_kwargs: Any,
     ) -> "BERTLoRAScorer":
         if model_config is None:
             model_config = context.resolve_embedder()
@@ -89,10 +66,7 @@ def from_context(
             batch_size=batch_size,
             learning_rate=learning_rate,
             seed=seed,
-            tokenizer_config=tokenizer_config,
-            lora_rank=lora_rank,
-            lora_alpha=lora_alpha,
-            lora_dropout=lora_dropout,
+            **lora_kwargs,
         )
 
     def get_embedder_config(self) -> dict[str, Any]:
@@ -123,26 +97,14 @@ def fit(
         self._tokenizer = AutoTokenizer.from_pretrained(model_name)
         self._model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
 
-        # Configure LoRA
-        lora_config = LoraConfig(
-            r=self.lora_rank,  # Rank of the low-rank matrices
-            lora_alpha=self.lora_alpha,  # Scaling factor
-            target_modules=["query", "value"],  # Target modules to apply LoRA
-            lora_dropout=self.lora_dropout,  # Dropout rate for LoRA layers
-            bias="none",  # Whether to add bias to LoRA layers
-        )
-
         # Apply LoRA to the model
-        self._model = get_peft_model(self._model, lora_config)
+        self._model = get_peft_model(self._model, self._lora_config)
 
         use_cpu = hasattr(self.model_config, "device") and self.model_config.device == "cpu"
 
         def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]:
-            return self._tokenizer(
-                examples["text"],
-                padding=self.tokenizer_config.padding,
-                truncation=self.tokenizer_config.truncation,
-                max_length=self.tokenizer_config.max_length,
+            return self._tokenizer(  # type: ignore[no-any-return]
+                examples["text"], return_tensors="pt", **self.model_config.tokenizer_config.model_dump()
             )
 
         dataset = Dataset.from_dict({"text": utterances, "labels": labels})
@@ -180,7 +142,7 @@ def predict(self, utterances: list[str]) -> npt.NDArray[Any]:
             raise RuntimeError(msg)
 
         inputs = self._tokenizer(
-            utterances, padding=True, truncation=True, max_length=self.tokenizer_config.max_length, return_tensors="pt"
+            utterances, padding=True, truncation=True, max_length=self.model_config.tokenizer_config.max_length, return_tensors="pt"
         )
 
         with torch.no_grad():
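
After this change, tokenizer settings come from the model config's tokenizer_config, and any LoRA hyperparameters are passed through **lora_kwargs straight to peft.LoraConfig. A minimal usage sketch under those assumptions; the import path of BERTLoRAScorer, the model id, and the example data are illustrative rather than taken from this diff, while r, lora_alpha, lora_dropout, target_modules, and bias are standard peft.LoraConfig parameters:

# Usage sketch, assuming BERTLoRAScorer is exposed by autointent's scoring modules
# (exact import path not shown in this diff).
scorer = BERTLoRAScorer(
    model_config="bert-base-uncased",  # resolved via HFModelConfig.from_search_config
    num_train_epochs=3,
    batch_size=8,
    # Everything below is forwarded unchanged to peft.LoraConfig:
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value"],
    bias="none",
)

# Illustrative training data; fit() tokenizes with the model config's tokenizer settings.
utterances = ["book me a flight to berlin", "play some jazz", "what is the weather today"]
labels = [0, 1, 2]
scorer.fit(utterances, labels)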