
Commit 72cc7dc

SeBorgey, voorhs, and github-actions[bot] authored
Rnn scorer (#190)
* first code for rnn scorer
* config fix
* last ruff fix
* tests
* typing
* device
* dump load test
* parameters
* upgrade dumper for rnn, upgrade tests for new config
* mypy fix
* fix tests except dumpload
* dumpload test fix
* codestyle
* refactor working with vocab
* refactor cnn and rnn
* fix typing and codestyle
* refactor init arguments of rnn and cnn scorers
* add strict mode to `Dumper`
* codestyle
* Update optimizer_config.schema.json
* bug fix
* add TODO comments
* try to implement early stopping
* add logging messages

---------

Co-authored-by: voorhs <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent ac1b732 commit 72cc7dc

22 files changed: +972, −347 lines


autointent/_dump_tools.py

Lines changed: 46 additions & 9 deletions
@@ -11,7 +11,6 @@
 from peft import PeftModel
 from pydantic import BaseModel
 from sklearn.base import BaseEstimator
-from torch import nn
 from transformers import (  # type: ignore[attr-defined]
     AutoModelForSequenceClassification,
     AutoTokenizer,
@@ -21,15 +20,22 @@
 )
 
 from autointent import Embedder, Ranker, VectorIndex
-from autointent._wrappers import BaseTorchModule
+from autointent._wrappers import BaseTorchModuleWithVocab
 from autointent.configs import CrossEncoderConfig, EmbedderConfig
 from autointent.context.optimization_info import Artifact
 from autointent.schemas import TagsList
 
 ModuleSimpleAttributes = None | str | int | float | bool | list  # type: ignore[type-arg]
 
 ModuleAttributes: TypeAlias = (
-    ModuleSimpleAttributes | TagsList | np.ndarray | Embedder | VectorIndex | BaseEstimator | Ranker | nn.Module  # type: ignore[type-arg]
+    ModuleSimpleAttributes
+    | TagsList
+    | np.ndarray  # type: ignore[type-arg]
+    | Embedder
+    | VectorIndex
+    | BaseEstimator
+    | Ranker
+    | BaseTorchModuleWithVocab
 )
 
 logger = logging.getLogger(__name__)
@@ -75,14 +81,21 @@ def make_subdirectories(path: Path, exists_ok: bool = False) -> None:
             subdir.mkdir(parents=True, exist_ok=exists_ok)
 
     @staticmethod
-    def dump(obj: Any, path: Path, exists_ok: bool = False, exclude: list[type[Any]] | None = None) -> None:  # noqa: ANN401, C901, PLR0912, PLR0915
+    def dump(  # noqa: C901, PLR0912, PLR0915
+        obj: Any,  # noqa: ANN401
+        path: Path,
+        exists_ok: bool = False,
+        exclude: list[type[Any]] | None = None,
+        raise_errors: bool = False,
+    ) -> None:
         """Dump modules attributes to filestystem.
 
         Args:
             obj: Object to dump
             path: Path to dump to
             exists_ok: If True, do not raise an error if the directory already exists
             exclude: List of types to exclude from dumping
+            raise_errors: whether to raise dumping errors or just log
         """
         attrs: dict[str, ModuleAttributes] = vars(obj)
         simple_attrs = {}
@@ -119,25 +132,29 @@ def dump(obj: Any, path: Path, exists_ok: bool = False, exclude: list[type[Any]]
                 except Exception as e:
                     msg = f"Error dumping pydantic model {key}: {e}"
                     logging.exception(msg)
+                    if raise_errors:
+                        raise
             elif isinstance(val, PeftModel):
                 # dumping peft models is a nightmare...
                 # this might break with new versions of peft
                 try:
                     if val._is_prompt_learning:  # noqa: SLF001
                         # strategy to save prompt learning models: save prompt encoder and bert classifier separately
                         model_path = path / Dumper.ptuning_models / key
-                        model_path.mkdir(parents=True, exist_ok=True)
+                        model_path.mkdir(parents=True, exist_ok=exists_ok)
                         val.save_pretrained(str(model_path / "peft"))
                         val.base_model.save_pretrained(model_path / "base_model")  # type: ignore[attr-defined]
                     else:
                         # strategy to save lora models: merge adapters and save as usual hugging face model
                         model_path = path / Dumper.hf_models / key
-                        model_path.mkdir(parents=True, exist_ok=True)
+                        model_path.mkdir(parents=True, exist_ok=exists_ok)
                         merged_model: PreTrainedModel = val.merge_and_unload()
                         merged_model.save_pretrained(model_path)  # type: ignore[attr-defined]
                 except Exception as e:
                     msg = f"Error dumping PeftModel {key}: {e}"
                     logger.exception(msg)
+                    if raise_errors:
+                        raise
             elif isinstance(val, PreTrainedModel):
                 model_path = path / Dumper.hf_models / key
                 model_path.mkdir(parents=True, exist_ok=True)
@@ -146,7 +163,9 @@ def dump(obj: Any, path: Path, exists_ok: bool = False, exclude: list[type[Any]]
                 except Exception as e:
                     msg = f"Error dumping HF model {key}: {e}"
                     logger.exception(msg)
-            elif isinstance(val, BaseTorchModule):
+                    if raise_errors:
+                        raise
+            elif isinstance(val, BaseTorchModuleWithVocab):
                 model_path = path / Dumper.torch_models / key
                 model_path.mkdir(parents=True, exist_ok=True)
                 try:
@@ -160,6 +179,8 @@ def dump(obj: Any, path: Path, exists_ok: bool = False, exclude: list[type[Any]]
                 except Exception as e:
                     msg = f"Error dumping torch model {key}: {e}"
                     logger.exception(msg)
+                    if raise_errors:
+                        raise
             elif isinstance(val, PreTrainedTokenizer | PreTrainedTokenizerFast):
                 tokenizer_path = path / Dumper.hf_tokenizers / key
                 tokenizer_path.mkdir(parents=True, exist_ok=True)
@@ -168,11 +189,15 @@ def dump(obj: Any, path: Path, exists_ok: bool = False, exclude: list[type[Any]]
                 except Exception as e:
                     msg = f"Error dumping HF tokenizer {key}: {e}"
                     logger.exception(msg)
+                    if raise_errors:
+                        raise
             elif isinstance(val, CatBoostClassifier):
                 val.save_model(str(path / Dumper.catboost_models / key), format="cbm")
             else:
                 msg = f"Attribute {key} of type {type(val)} cannot be dumped to file system."
                 logger.error(msg)
+                if raise_errors:
+                    raise TypeError(msg)
 
         with (path / Dumper.simple_attrs).open("w", encoding="utf-8") as file:
             json.dump(simple_attrs, file, ensure_ascii=False, indent=4)
@@ -185,6 +210,7 @@ def load(  # noqa: C901, PLR0912, PLR0915
         path: Path,
         embedder_config: EmbedderConfig | None = None,
         cross_encoder_config: CrossEncoderConfig | None = None,
+        raise_errors: bool = False,
     ) -> None:
         """Load attributes from file system."""
         tags: dict[str, Any] = {}
@@ -250,7 +276,8 @@ def load(  # noqa: C901, PLR0912, PLR0915
                 except Exception as e:
                     msg = f"Error loading Pydantic model from {model_dir}: {e}"
                     logger.exception(msg)
-                    continue
+                    if raise_errors:
+                        raise
            elif child.name == Dumper.ptuning_models:
                 for model_dir in child.iterdir():
                     try:
@@ -259,20 +286,26 @@ def load(  # noqa: C901, PLR0912, PLR0915
                     except Exception as e:  # noqa: PERF203
                         msg = f"Error loading PeftModel {model_dir.name}: {e}"
                         logger.exception(msg)
+                        if raise_errors:
+                            raise
             elif child.name == Dumper.hf_models:
                 for model_dir in child.iterdir():
                     try:
                         hf_models[model_dir.name] = AutoModelForSequenceClassification.from_pretrained(model_dir)  # type: ignore[no-untyped-call]
                     except Exception as e:  # noqa: PERF203
                         msg = f"Error loading HF model {model_dir.name}: {e}"
                         logger.exception(msg)
+                        if raise_errors:
+                            raise
             elif child.name == Dumper.hf_tokenizers:
                 for tokenizer_dir in child.iterdir():
                     try:
                         hf_tokenizers[tokenizer_dir.name] = AutoTokenizer.from_pretrained(tokenizer_dir)
                     except Exception as e:  # noqa: PERF203
                         msg = f"Error loading HF tokenizer {tokenizer_dir.name}: {e}"
                         logger.exception(msg)
+                        if raise_errors:
+                            raise
             elif child.name == Dumper.catboost_models:
                 for model_file in child.iterdir():
                     try:
@@ -288,15 +321,19 @@ def load(  # noqa: C901, PLR0912, PLR0915
                         with (model_dir / "class_info.json").open("r") as f:
                             class_info = json.load(f)
                         module = importlib.import_module(class_info["module"])
-                        model_class: BaseTorchModule = getattr(module, class_info["name"])
+                        model_class: BaseTorchModuleWithVocab = getattr(module, class_info["name"])
                         model = model_class.load(model_dir)
                         torch_models[model_dir.name] = model
                     except Exception as e:
                         msg = f"Error loading torch model {model_dir.name}: {e}"
                         logger.exception(msg)
+                        if raise_errors:
+                            raise
             else:
                 msg = f"Found unexpected child {child}"
                 logger.error(msg)
+                if raise_errors:
+                    raise ValueError(msg)
 
         obj.__dict__.update(
             tags
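
The new `raise_errors` flag gives `Dumper` a strict mode: by default, a failure on one attribute is only logged and the remaining attributes are still processed, while in strict mode the first failure propagates. A minimal sketch of the difference (the `scorer` object is hypothetical; any module instance whose attributes `Dumper` can serialize would do):

from pathlib import Path

from autointent._dump_tools import Dumper

scorer = ...  # hypothetical: some module object whose attributes should be persisted

# Lenient (default): errors for individual attributes are logged and skipped.
Dumper.dump(scorer, Path("dump_dir"), exists_ok=True)

# Strict mode added in this commit: the first dumping error is re-raised.
Dumper.dump(scorer, Path("dump_dir"), exists_ok=True, raise_errors=True)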

autointent/_wrappers/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 from .ranker import Ranker
 from .embedder import Embedder
 from .vector_index import VectorIndex
-from .base_torch_module import BaseTorchModule
+from .base_torch_module import BaseTorchModuleWithVocab
 
-__all__ = ["BaseTorchModule", "Embedder", "Ranker", "VectorIndex"]
+__all__ = ["BaseTorchModuleWithVocab", "Embedder", "Ranker", "VectorIndex"]

autointent/_wrappers/base_torch_module.py

Lines changed: 86 additions & 1 deletion
@@ -1,12 +1,97 @@
+"""Torch model for text classification."""
+
+import re
 from abc import ABC, abstractmethod
+from collections import Counter
 from pathlib import Path
+from typing import Any
 
 import torch
 from torch import nn
 from typing_extensions import Self
 
+from autointent.configs import VocabConfig
+
+
+class BaseTorchModuleWithVocab(nn.Module, ABC):
+    def __init__(
+        self,
+        embed_dim: int,
+        vocab_config: VocabConfig | None = None,
+    ) -> None:
+        super().__init__()
+
+        self.embed_dim = embed_dim
+        self.vocab_config = VocabConfig.from_search_config(vocab_config)
+
+        # Vocabulary management
+        self._unk_token = "<UNK>"  # noqa: S105
+        self._pad_token = "<PAD>"  # noqa: S105
+        self._unk_idx = 1
+
+        if self.vocab_config.vocab is not None:
+            self.set_vocab(self.vocab_config.vocab)
+
+    def set_vocab(self, vocab: dict[str, Any]) -> None:
+        """Save vocabulary into module's attributes and initialize embeddings matrix."""
+        self.vocab_config.vocab = vocab
+        self.embedding = nn.Embedding(
+            num_embeddings=len(self.vocab_config.vocab),
+            embedding_dim=self.embed_dim,
+            padding_idx=self.vocab_config.padding_idx,
+        )
+
+    def build_vocab(self, utterances: list[str]) -> None:
+        """Build vocabulary from training utterances."""
+        if self.vocab_config.vocab is not None:
+            msg = "Vocab is already built."
+            raise RuntimeError(msg)
+
+        word_counts: Counter[str] = Counter()
+        for utterance in utterances:
+            words = re.findall(r"\w+", utterance.lower())
+            word_counts.update(words)
+
+        # Create vocabulary with special tokens
+        vocab = {self._pad_token: self.vocab_config.padding_idx, self._unk_token: self._unk_idx}
+
+        # Convert Counter to list of (word, count) tuples sorted by frequency
+        sorted_words = word_counts.most_common(self.vocab_config.max_vocab_size)
+        for word, _ in sorted_words:
+            if word not in vocab:
+                vocab[word] = len(vocab)
+
+        self.set_vocab(vocab)
+
+    def text_to_indices(self, utterances: list[str]) -> list[list[int]]:
+        """Convert utterances to padded sequences of word indices."""
+        if self.vocab_config.vocab is None:
+            msg = "Vocab is not built."
+            raise RuntimeError(msg)
+
+        sequences: list[list[int]] = []
+        for utterance in utterances:
+            words = re.findall(r"\w+", utterance.lower())
+            # Convert words to indices, using UNK for unknown words
+            seq = [self.vocab_config.vocab.get(word, self._unk_idx) for word in words]
+            # Truncate if too long
+            seq = seq[: self.vocab_config.max_seq_length]
+            # Pad if too short
+            seq = seq + [self.vocab_config.padding_idx] * (self.vocab_config.max_seq_length - len(seq))
+            sequences.append(seq)
+        return sequences
+
+    @abstractmethod
+    def forward(self, text: torch.Tensor) -> torch.Tensor:
+        """Compute sentence embeddings for given text.
+
+        Args:
+            text: torch tensor of shape (B, T), token ids
+
+        Returns:
+            embeddings of shape (B, H)
+        """
 
-class BaseTorchModule(nn.Module, ABC):
     @abstractmethod
     def dump(self, path: Path) -> None:
         """Dump torch module to disk.

autointent/configs/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -2,6 +2,7 @@
 
 from ._inference_node import InferenceNodeConfig
 from ._optimization import DataConfig, HPOConfig, LoggingConfig
+from ._torch import TorchTrainingConfig, VocabConfig
 from ._transformers import (
     CrossEncoderConfig,
     EarlyStoppingConfig,
@@ -19,8 +20,9 @@
     "HFModelConfig",
     "HPOConfig",
     "InferenceNodeConfig",
-    "InferenceNodeConfig",
     "LoggingConfig",
     "TaskTypeEnum",
     "TokenizerConfig",
+    "TorchTrainingConfig",
+    "VocabConfig",
 ]

autointent/configs/_torch.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict, Field
+from typing_extensions import Self
+
+from autointent._callbacks import REPORTERS_NAMES
+from autointent._utils import detect_device
+
+
+class FromDictMixin:
+    @classmethod
+    def from_search_config(cls, values: dict[str, Any] | BaseModel | None) -> Self:
+        """Validate the model configuration.
+
+        This classmethod is used to parse dictionaries that occur in search space configurations.
+
+        Args:
+            values: Model configuration values.
+
+        Returns:
+            Model configuration.
+        """
+        if values is None:
+            return cls()
+        if isinstance(values, BaseModel):
+            return values  # type: ignore[return-value]
+        return cls(**values)
+
+
+class VocabConfig(BaseModel, FromDictMixin):
+    model_config = ConfigDict(extra="forbid")
+    padding_idx: int = 0
+    max_seq_length: int = 50
+    vocab: dict[str, int] | None = None
+    max_vocab_size: int | None = None
+
+
+class TorchTrainingConfig(BaseModel, FromDictMixin):
+    model_config = ConfigDict(extra="forbid")
+    num_train_epochs: int = 3
+    batch_size: int = 8
+    learning_rate: float = 5e-5
+    seed: int = 42
+    report_to: REPORTERS_NAMES | None = None  # type: ignore  # noqa: PGH003
+    device: str = Field(default_factory=detect_device)
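
The two new configs are plain pydantic models, so they can be constructed directly or, via the `FromDictMixin` shown above, from the dictionaries that appear in search-space configurations. A short illustrative sketch (the values are arbitrary):

from autointent.configs import TorchTrainingConfig, VocabConfig

# Direct construction; extra="forbid" rejects unknown keys.
vocab_config = VocabConfig(max_seq_length=30, max_vocab_size=5000)

# from_search_config() accepts None (defaults), an already-built model, or a plain dict,
# which is how values coming from a search space are parsed.
training_config = TorchTrainingConfig.from_search_config({"num_train_epochs": 5, "batch_size": 16})
default_training_config = TorchTrainingConfig.from_search_config(None)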
