
Commit ff7077e

Merge branch 'dev' into cnn
2 parents 8e34160 + a04587b commit ff7077e


74 files changed: +1398 -943 lines changed

.github/workflows/test-nodes.yaml

Lines changed: 0 additions & 13 deletions
This file was deleted.

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -179,4 +179,6 @@ tests_logs
 tests/logs
 runs/
 vector_db*
+*.db
+*.sqlite
 /wandb

autointent/_callbacks/wandb.py

Lines changed: 23 additions & 7 deletions
@@ -1,9 +1,12 @@
+import logging
 import os
 from pathlib import Path
 from typing import Any

 from autointent._callbacks.base import OptimizerCallback

+logger = logging.getLogger(__name__)
+

 class WandbCallback(OptimizerCallback):
     """Wandb callback for logging the optimization process to Weights & Biases (W&B).
@@ -94,13 +97,26 @@ def log_final_metrics(self, metrics: dict[str, Any]) -> None:
         Args:
             metrics: A dictionary of final performance metrics.
         """
-        self.wandb.init(
-            project=self.project_name,
-            group=self.group,
-            name="final_metrics",
-            config=metrics,
-            settings=self.wandb.Settings(x_stats_sampling_interval=self.log_interval_time),
-        )
+        wandb_run_init_args = {
+            "project": self.project_name,
+            "group": self.group,
+            "name": "final_metrics",
+            "settings": self.wandb.Settings(x_stats_sampling_interval=self.log_interval_time),
+        }
+
+        try:
+            self.wandb.init(config=metrics, **wandb_run_init_args)
+        except Exception as e:
+            if "run config cannot exceed" not in str(e):
+                # https://github.com/deeppavlov/AutoIntent/issues/202
+                raise
+            logger.warning("W&B run config is too large, skipping logging modules configs")
+            logger.warning("'final_metrics' will be logged to W&B with pipeline_metrics only")
+            logger.warning("If you want to access modules configs in future, address to the individual modules runs")
+            self.wandb.init(
+                config={},
+                **wandb_run_init_args,
+            )

         self.wandb.log(metrics.get("pipeline_metrics", {}))
         self.wandb.finish()
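
The change above replaces a single wandb.init call with a guarded retry when the run config is too large. A minimal standalone sketch of the same pattern, assuming a wandb-like client object; the helper name safe_init is illustrative and not part of AutoIntent:

    import logging

    logger = logging.getLogger(__name__)


    def safe_init(wandb_module, config, **run_args):
        """Start a run with the full config, falling back to an empty config
        if the backend rejects it for being too large."""
        try:
            return wandb_module.init(config=config, **run_args)
        except Exception as e:
            if "run config cannot exceed" not in str(e):
                raise
            logger.warning("Run config too large; starting run without module configs")
            return wandb_module.init(config={}, **run_args)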

autointent/_dump_tools.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 from peft import PeftModel
 from pydantic import BaseModel
 from sklearn.base import BaseEstimator
-from torch import nn
+
 from transformers import ( # type: ignore[attr-defined]
     AutoModelForSequenceClassification,
     AutoTokenizer,

autointent/_embedder.py

Lines changed: 55 additions & 25 deletions
@@ -7,20 +7,25 @@
 import json
 import logging
 import shutil
+from functools import lru_cache
 from pathlib import Path
 from typing import TypedDict

+import huggingface_hub
 import numpy as np
 import numpy.typing as npt
 import torch
 from appdirs import user_cache_dir
 from sentence_transformers import SentenceTransformer
+from sentence_transformers.similarity_functions import SimilarityFunction

 from ._hash import Hasher
 from .configs import EmbedderConfig, TaskTypeEnum

+logger = logging.getLogger(__name__)

-def get_embeddings_path(filename: str) -> Path:
+
+def _get_embeddings_path(filename: str) -> Path:
     """Get the path to the embeddings file.

     This function constructs the full path to an embeddings file stored
@@ -37,6 +42,23 @@ def get_embeddings_path(filename: str) -> Path:
     return Path(user_cache_dir("autointent")) / "embeddings" / f"{filename}.npy"


+@lru_cache(maxsize=128)
+def _get_latest_commit_hash(model_name: str) -> str:
+    """Get the latest commit hash for a given Hugging Face model.
+
+    Args:
+        model_name: The name of the model to get the latest commit hash for.
+
+    Returns:
+        The latest commit hash for the given model name or the model name if the commit hash is not found.
+    """
+    commit_hash = huggingface_hub.model_info(model_name, revision="main").sha
+    if commit_hash is None:
+        logger.warning("No commit hash found for model %s", model_name)
+        return model_name
+    return commit_hash
+
+
 class EmbedderDumpMetadata(TypedDict):
     """Metadata for saving and loading an Embedder instance."""

@@ -63,7 +85,6 @@ class Embedder:

     _metadata_dict_name: str = "metadata.json"
     _dump_dir: Path | None = None
-    config: EmbedderConfig
     embedding_model: SentenceTransformer

     def __init__(self, embedder_config: EmbedderConfig) -> None:
@@ -74,34 +95,41 @@ def __init__(self, embedder_config: EmbedderConfig) -> None:
         """
         self.config = embedder_config

-        self.embedding_model = SentenceTransformer(
-            self.config.model_name,
-            device=self.config.device,
-            prompts=embedder_config.get_prompt_config(),
-            similarity_fn_name=self.config.similarity_fn_name,
-            trust_remote_code=self.config.trust_remote_code,
-        )
-
-        self._logger = logging.getLogger(__name__)
-
     def __hash__(self) -> int:
         """Compute a hash value for the Embedder.

         Returns:
             The hash value of the Embedder.
         """
         hasher = Hasher()
-        for parameter in self.embedding_model.parameters():
-            hasher.update(parameter.detach().cpu().numpy())
+        if self.config.freeze:
+            commit_hash = _get_latest_commit_hash(self.config.model_name)
+            hasher.update(commit_hash)
+        else:
+            self._load_model()
+            for parameter in self.embedding_model.parameters():
+                hasher.update(parameter.detach().cpu().numpy())
         hasher.update(self.config.tokenizer_config.max_length)
         return hasher.intdigest()

+    def _load_model(self) -> None:
+        """Load sentence transformers model to device."""
+        if not hasattr(self, "embedding_model"):
+            self.embedding_model = SentenceTransformer(
+                self.config.model_name,
+                device=self.config.device,
+                prompts=self.config.get_prompt_config(),
+                similarity_fn_name=self.config.similarity_fn_name,
+                trust_remote_code=self.config.trust_remote_code,
+            )
+
     def clear_ram(self) -> None:
         """Move the embedding model to CPU and delete it from memory."""
-        self._logger.debug("Clearing embedder %s from memory", self.config.model_name)
-        self.embedding_model.cpu()
-        del self.embedding_model
-        torch.cuda.empty_cache()
+        if hasattr(self, "embedding_model"):
+            logger.debug("Clearing embedder %s from memory", self.config.model_name)
+            self.embedding_model.cpu()
+            del self.embedding_model
+            torch.cuda.empty_cache()

     def delete(self) -> None:
         """Delete the embedding model and its associated directory."""
@@ -165,11 +193,13 @@ def embed(self, utterances: list[str], task_type: TaskTypeEnum | None = None) ->
         hasher.update(self)
         hasher.update(utterances)

-        embeddings_path = get_embeddings_path(hasher.hexdigest())
+        embeddings_path = _get_embeddings_path(hasher.hexdigest())
         if embeddings_path.exists():
             return np.load(embeddings_path)  # type: ignore[no-any-return]

-        self._logger.debug(
+        self._load_model()
+
+        logger.debug(
             "Calculating embeddings with model %s, batch_size=%d, max_seq_length=%s, embedder_device=%s",
             self.config.model_name,
             self.config.batch_size,
@@ -200,11 +230,11 @@ def similarity(
         """Calculate similarity between two sets of embeddings.

         Args:
-            embeddings1: First set of embeddings.
-            embeddings2: Second set of embeddings.
+            embeddings1: First set of embeddings (size n).
+            embeddings2: Second set of embeddings (size m).

         Returns:
-            A numpy array of similarities.
+            A numpy array of similarities (size n x m).
         """
-        result = self.embedding_model.similarity(embeddings1, embeddings2)
-        return result.detach().cpu().numpy().astype(np.float32)
+        similarity_fn = SimilarityFunction.to_similarity_fn(self.config.similarity_fn_name)
+        return similarity_fn(embeddings1, embeddings2).detach().cpu().numpy().astype(np.float32)
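
As a reading aid, here is a compact sketch of the two ideas this diff introduces: the SentenceTransformer is only constructed when embeddings are actually needed, and a frozen model is identified by its Hub commit hash rather than by hashing its weights. The class and function names below are illustrative, not AutoIntent's API.

    from functools import lru_cache

    import huggingface_hub
    from sentence_transformers import SentenceTransformer


    @lru_cache(maxsize=128)
    def latest_revision(model_name: str) -> str:
        # Fall back to the model name when the Hub reports no commit hash.
        sha = huggingface_hub.model_info(model_name, revision="main").sha
        return sha or model_name


    class LazyEmbedder:
        def __init__(self, model_name: str, freeze: bool = True) -> None:
            self.model_name = model_name
            self.freeze = freeze

        def _load(self) -> SentenceTransformer:
            # Instantiate the heavy model only on first use.
            if not hasattr(self, "_model"):
                self._model = SentenceTransformer(self.model_name)
            return self._model

        def cache_key(self) -> str:
            # Frozen weights never change, so the pinned Hub revision identifies
            # them without downloading or loading the model.
            if self.freeze:
                return latest_revision(self.model_name)
            # Otherwise the weights may have been fine-tuned, so derive the key
            # from the parameters themselves (simplified here).
            model = self._load()
            return str(hash(tuple(p.detach().cpu().numpy().tobytes() for p in model.parameters())))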

autointent/_optimization_config.py

Lines changed: 3 additions & 1 deletion
@@ -2,7 +2,7 @@

 from pydantic import BaseModel, PositiveInt

-from .configs import CrossEncoderConfig, DataConfig, EmbedderConfig, LoggingConfig
+from .configs import CrossEncoderConfig, DataConfig, EmbedderConfig, HFModelConfig, LoggingConfig
 from .custom_types import SamplerType


@@ -25,6 +25,8 @@ class OptimizationConfig(BaseModel):

     cross_encoder_config: CrossEncoderConfig = CrossEncoderConfig()

+    transformer_config: HFModelConfig = HFModelConfig()
+
     sampler: SamplerType = "brute"
     """See tutorial on optuna and presets."""
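
For orientation, a tiny illustrative pydantic snippet of the pattern used here: a nested config object with a default instance, so callers only override the transformer settings they care about. The class names are placeholders, not the project's.

    from pydantic import BaseModel


    class HFModelSettings(BaseModel):
        model_name: str = "bert-base-uncased"
        batch_size: int = 32


    class OptimizationSettings(BaseModel):
        transformer_config: HFModelSettings = HFModelSettings()
        sampler: str = "brute"


    # Only the overridden field changes; everything else keeps its default.
    settings = OptimizationSettings(transformer_config=HFModelSettings(batch_size=8))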

autointent/_pipeline/_pipeline.py

Lines changed: 17 additions & 1 deletion
@@ -14,6 +14,7 @@
     CrossEncoderConfig,
     DataConfig,
     EmbedderConfig,
+    HFModelConfig,
     InferenceNodeConfig,
     LoggingConfig,
 )
@@ -67,10 +68,13 @@ def __init__(
             self.embedder_config = EmbedderConfig()
             self.cross_encoder_config = CrossEncoderConfig()
             self.data_config = DataConfig()
+            self.transformer_config = HFModelConfig()
         elif not isinstance(nodes[0], InferenceNode):
             assert_never(nodes)

-    def set_config(self, config: LoggingConfig | EmbedderConfig | CrossEncoderConfig | DataConfig) -> None:
+    def set_config(
+        self, config: LoggingConfig | EmbedderConfig | CrossEncoderConfig | DataConfig | HFModelConfig
+    ) -> None:
         """Set the configuration for the pipeline.

         Args:
@@ -84,6 +88,8 @@ def set_config(self, config: LoggingConfig | EmbedderConfig | CrossEncoderConfig
             self.cross_encoder_config = config
         elif isinstance(config, DataConfig):
             self.data_config = config
+        elif isinstance(config, HFModelConfig):
+            self.transformer_config = config
         else:
             assert_never(config)

@@ -133,6 +139,7 @@ def from_optimization_config(cls, config: dict[str, Any] | Path | str | Optimiza
         pipeline.set_config(optimization_config.data_config)
         pipeline.set_config(optimization_config.embedder_config)
         pipeline.set_config(optimization_config.cross_encoder_config)
+        pipeline.set_config(optimization_config.transformer_config)
         return pipeline

     def _fit(self, context: Context, sampler: SamplerType) -> None:
@@ -144,6 +151,14 @@ def _fit(self, context: Context, sampler: SamplerType) -> None:
         """
         self.context = context
         self._logger.info("starting pipeline optimization...")
+
+        if not context.logging_config.dump_modules:
+            self._logger.warning(
+                "Memory storage is not compatible with resuming optimization. "
+                "Modules from previous runs won't be available. "
+                "Set dump_modules=True in LoggingConfig to enable proper resuming."
+            )
+
         self.context.callback_handler.start_run(
             run_name=self.context.logging_config.get_run_name(),
             dirpath=self.context.logging_config.dirpath,
@@ -190,6 +205,7 @@ def fit(
         context.configure_logging(self.logging_config)
         context.configure_transformer(self.embedder_config)
         context.configure_transformer(self.cross_encoder_config)
+        context.configure_transformer(self.transformer_config)

         self.validate_modules(dataset, mode=incompatible_search_space)
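
The set_config extension above follows an exhaustive isinstance dispatch over a closed union, with assert_never catching any member that gains no branch. A self-contained sketch of the pattern with placeholder config types (not the pipeline's real classes):

    from dataclasses import dataclass
    from typing import assert_never  # Python 3.11+; typing_extensions on older versions


    @dataclass
    class LoggingCfg:
        dirpath: str = "runs"


    @dataclass
    class TransformerCfg:
        model_name: str = "bert-base-uncased"


    def set_config(store: dict, config: LoggingCfg | TransformerCfg) -> None:
        if isinstance(config, LoggingCfg):
            store["logging"] = config
        elif isinstance(config, TransformerCfg):
            store["transformer"] = config
        else:
            # Static type checkers flag this branch if the union grows
            # without a matching isinstance check.
            assert_never(config)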

autointent/_ranker.py

Lines changed: 9 additions & 3 deletions
@@ -10,7 +10,7 @@
 import logging
 from pathlib import Path
 from random import shuffle
-from typing import Any, TypedDict
+from typing import Any, Literal, TypedDict

 import joblib
 import numpy as np
@@ -101,12 +101,14 @@ def __init__(
         self,
         cross_encoder_config: CrossEncoderConfig | str | dict[str, Any],
         classifier_head: LogisticRegressionCV | None = None,
+        output_range: Literal["sigmoid", "tanh"] = "sigmoid",
     ) -> None:
         """Initialize the Ranker.

         Args:
             cross_encoder_config: Configuration for the cross-encoder model
             classifier_head: Optional pre-trained classifier head
+            output_range: Range of the output probabilities ([0, 1] for sigmoid, [-1, 1] for tanh)
         """
         self.config = CrossEncoderConfig.from_search_config(cross_encoder_config)
         self.cross_encoder = st.CrossEncoder(
@@ -117,6 +119,7 @@ def __init__(
         )
         self._train_head = False
         self._clf = classifier_head
+        self.output_range = output_range

         if classifier_head is not None or self.config.train_head:
             self._train_head = True
@@ -148,7 +151,7 @@ def _get_features_or_predictions(self, pairs: list[tuple[str, str]]) -> npt.NDAr
             self.cross_encoder.predict(
                 pairs,
                 batch_size=self.config.batch_size,
-                activation_fct=nn.Sigmoid(),
+                activation_fct=nn.Sigmoid() if self.output_range == "sigmoid" else nn.Tanh(),
             )
         )

@@ -210,7 +213,10 @@ def predict(self, pairs: list[tuple[str, str]]) -> npt.NDArray[Any]:
         features = self._get_features_or_predictions(pairs)

         if self._clf is not None:
-            return np.array(self._clf.predict_proba(features)[:, 1])
+            probs = np.array(self._clf.predict_proba(features)[:, 1])
+            if self.output_range == "tanh":
+                probs = probs * 2 - 1
+            return probs
         return features

     def rank(
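
A hedged usage sketch of the new output_range switch, assuming Ranker is importable as below and the default cross-encoder checkpoint is available; treat it as illustrative rather than a tested recipe:

    from autointent._ranker import Ranker

    # Raw cross-encoder scores pass through nn.Tanh() instead of nn.Sigmoid(),
    # so predictions land in [-1, 1].
    ranker = Ranker("cross-encoder/ms-marco-MiniLM-L6-v2", output_range="tanh")

    # When a classifier head is attached, its probabilities in [0, 1] are
    # remapped to [-1, 1] via p * 2 - 1, keeping both paths on the same scale.
    scores = ranker.predict([("how do I reset my password", "password reset help")])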

autointent/configs/_transformers.py

Lines changed: 8 additions & 5 deletions
@@ -61,9 +61,11 @@ class EmbedderConfig(HFModelConfig):
     sts_prompt: str | None = Field(None, description="Prompt for finding most similar sentences.")
     query_prompt: str | None = Field(None, description="Prompt for query.")
     passage_prompt: str | None = Field(None, description="Prompt for passage.")
-    similarity_fn_name: str | None = Field(
-        "cosine", description="Name of the similarity function to use (cosine, dot, euclidean, manhattan)."
+    similarity_fn_name: Literal["cosine", "dot", "euclidean", "manhattan"] = Field(
+        "cosine", description="Name of the similarity function to use."
     )
+    use_cache: bool = Field(True, description="Whether to use embeddings caching.")
+    freeze: bool = Field(True, description="Whether to freeze the model parameters.")

     def get_prompt_config(self) -> dict[str, str] | None:
         """Get the prompt config for the given prompt type.
@@ -111,11 +113,12 @@ def get_prompt_type(self, prompt_type: TaskTypeEnum | None) -> str | None: # no
             return self.default_prompt
         assert_never(prompt_type)

-    use_cache: bool = Field(False, description="Whether to use embeddings caching.")
-

 class CrossEncoderConfig(HFModelConfig):
-    model_name: str = Field("cross-encoder/ms-marco-MiniLM-L-6-v2", description="Name of the hugging face model.")
+    model_name: str = Field("cross-encoder/ms-marco-MiniLM-L6-v2", description="Name of the hugging face model.")
     train_head: bool = Field(
         False, description="Whether to train the head of the model. If False, LogReg will be trained."
     )
+    tokenizer_config: TokenizerConfig = Field(
+        default_factory=lambda: TokenizerConfig(max_length=512)
+    )  # this is because sentence-transformers doesn't allow you to customize tokenizer settings properly
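
To illustrate what the Literal annotation above buys over str | None, a small self-contained pydantic example (EmbedderSettings is a placeholder name): invalid similarity function names are now rejected at validation time instead of surfacing later inside sentence-transformers.

    from typing import Literal

    from pydantic import BaseModel, Field, ValidationError


    class EmbedderSettings(BaseModel):
        similarity_fn_name: Literal["cosine", "dot", "euclidean", "manhattan"] = Field(
            "cosine", description="Name of the similarity function to use."
        )


    EmbedderSettings(similarity_fn_name="dot")  # accepted
    try:
        EmbedderSettings(similarity_fn_name="chebyshev")  # rejected with a clear error
    except ValidationError as error:
        print(error)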
