Refactor/embedding caching (#195)

voorhs · github-actions[bot] · web-flow · commit 389eb40c1de4 · 2025-05-03T14:01:21.000+03:00
* implement new hashing strategy

* fix codestyle

* Update optimizer_config.schema.json

* minor bug fix

* fix typing error

* refactor similarity calculation

* Update optimizer_config.schema.json

* upd callback test

* solve 429 error

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
diff --git a/autointent/_embedder.py b/autointent/_embedder.py
@@ -7,20 +7,25 @@
 import json
 import logging
 import shutil
+from functools import lru_cache
 from pathlib import Path
 from typing import TypedDict
 
+import huggingface_hub
 import numpy as np
 import numpy.typing as npt
 import torch
 from appdirs import user_cache_dir
 from sentence_transformers import SentenceTransformer
+from sentence_transformers.similarity_functions import SimilarityFunction
 
 from ._hash import Hasher
 from .configs import EmbedderConfig, TaskTypeEnum
 
+logger = logging.getLogger(__name__)
 
-def get_embeddings_path(filename: str) -> Path:
+
+def _get_embeddings_path(filename: str) -> Path:
     """Get the path to the embeddings file.
 
     This function constructs the full path to an embeddings file stored
@@ -37,6 +42,23 @@ def get_embeddings_path(filename: str) -> Path:
     return Path(user_cache_dir("autointent")) / "embeddings" / f"{filename}.npy"
 
 
+@lru_cache(maxsize=128)
+def _get_latest_commit_hash(model_name: str) -> str:
+    """Get the latest commit hash for a given Hugging Face model.
+
+    Args:
+        model_name: The name of the model to get the latest commit hash for.
+
+    Returns:
+        The latest commit hash for the given model name or the model name if the commit hash is not found.
+    """
+    commit_hash = huggingface_hub.model_info(model_name, revision="main").sha
+    if commit_hash is None:
+        logger.warning("No commit hash found for model %s", model_name)
+        return model_name
+    return commit_hash
+
+
 class EmbedderDumpMetadata(TypedDict):
     """Metadata for saving and loading an Embedder instance."""
 
@@ -63,7 +85,6 @@ class Embedder:
 
     _metadata_dict_name: str = "metadata.json"
     _dump_dir: Path | None = None
-    config: EmbedderConfig
     embedding_model: SentenceTransformer
 
     def __init__(self, embedder_config: EmbedderConfig) -> None:
@@ -74,34 +95,41 @@ def __init__(self, embedder_config: EmbedderConfig) -> None:
         """
         self.config = embedder_config
 
-        self.embedding_model = SentenceTransformer(
-            self.config.model_name,
-            device=self.config.device,
-            prompts=embedder_config.get_prompt_config(),
-            similarity_fn_name=self.config.similarity_fn_name,
-            trust_remote_code=self.config.trust_remote_code,
-        )
-
-        self._logger = logging.getLogger(__name__)
-
     def __hash__(self) -> int:
         """Compute a hash value for the Embedder.
 
         Returns:
             The hash value of the Embedder.
         """
         hasher = Hasher()
-        for parameter in self.embedding_model.parameters():
-            hasher.update(parameter.detach().cpu().numpy())
+        if self.config.freeze:
+            commit_hash = _get_latest_commit_hash(self.config.model_name)
+            hasher.update(commit_hash)
+        else:
+            self._load_model()
+            for parameter in self.embedding_model.parameters():
+                hasher.update(parameter.detach().cpu().numpy())
         hasher.update(self.config.tokenizer_config.max_length)
         return hasher.intdigest()
 
+    def _load_model(self) -> None:
+        """Load sentence transformers model to device."""
+        if not hasattr(self, "embedding_model"):
+            self.embedding_model = SentenceTransformer(
+                self.config.model_name,
+                device=self.config.device,
+                prompts=self.config.get_prompt_config(),
+                similarity_fn_name=self.config.similarity_fn_name,
+                trust_remote_code=self.config.trust_remote_code,
+            )
+
     def clear_ram(self) -> None:
         """Move the embedding model to CPU and delete it from memory."""
-        self._logger.debug("Clearing embedder %s from memory", self.config.model_name)
-        self.embedding_model.cpu()
-        del self.embedding_model
-        torch.cuda.empty_cache()
+        if hasattr(self, "embedding_model"):
+            logger.debug("Clearing embedder %s from memory", self.config.model_name)
+            self.embedding_model.cpu()
+            del self.embedding_model
+            torch.cuda.empty_cache()
 
     def delete(self) -> None:
         """Delete the embedding model and its associated directory."""
@@ -165,11 +193,13 @@ def embed(self, utterances: list[str], task_type: TaskTypeEnum | None = None) ->
             hasher.update(self)
             hasher.update(utterances)
 
-            embeddings_path = get_embeddings_path(hasher.hexdigest())
+            embeddings_path = _get_embeddings_path(hasher.hexdigest())
             if embeddings_path.exists():
                 return np.load(embeddings_path)  # type: ignore[no-any-return]
 
-        self._logger.debug(
+        self._load_model()
+
+        logger.debug(
             "Calculating embeddings with model %s, batch_size=%d, max_seq_length=%s, embedder_device=%s",
             self.config.model_name,
             self.config.batch_size,
@@ -200,11 +230,11 @@ def similarity(
         """Calculate similarity between two sets of embeddings.
 
         Args:
-            embeddings1: First set of embeddings.
-            embeddings2: Second set of embeddings.
+            embeddings1: First set of embeddings (size n).
+            embeddings2: Second set of embeddings (size m).
 
         Returns:
-            A numpy array of similarities.
+            A numpy array of similarities (size n x m).
         """
-        result = self.embedding_model.similarity(embeddings1, embeddings2)
-        return result.detach().cpu().numpy().astype(np.float32)
+        similarity_fn = SimilarityFunction.to_similarity_fn(self.config.similarity_fn_name)
+        return similarity_fn(embeddings1, embeddings2).detach().cpu().numpy().astype(np.float32)
diff --git a/autointent/configs/_transformers.py b/autointent/configs/_transformers.py
@@ -61,9 +61,11 @@ class EmbedderConfig(HFModelConfig):
     sts_prompt: str | None = Field(None, description="Prompt for finding most similar sentences.")
     query_prompt: str | None = Field(None, description="Prompt for query.")
     passage_prompt: str | None = Field(None, description="Prompt for passage.")
-    similarity_fn_name: str | None = Field(
-        "cosine", description="Name of the similarity function to use (cosine, dot, euclidean, manhattan)."
+    similarity_fn_name: Literal["cosine", "dot", "euclidean", "manhattan"] = Field(
+        "cosine", description="Name of the similarity function to use."
     )
+    use_cache: bool = Field(True, description="Whether to use embeddings caching.")
+    freeze: bool = Field(True, description="Whether to freeze the model parameters.")
 
     def get_prompt_config(self) -> dict[str, str] | None:
         """Get the prompt config for the given prompt type.
@@ -111,8 +113,6 @@ def get_prompt_type(self, prompt_type: TaskTypeEnum | None) -> str | None:  # no
             return self.default_prompt
         assert_never(prompt_type)
 
-    use_cache: bool = Field(False, description="Whether to use embeddings caching.")
-
 
 class CrossEncoderConfig(HFModelConfig):
     model_name: str = Field("cross-encoder/ms-marco-MiniLM-L6-v2", description="Name of the hugging face model.")
diff --git a/docs/optimizer_config.schema.json b/docs/optimizer_config.schema.json
@@ -226,23 +226,28 @@
                     "title": "Passage Prompt"
                 },
                 "similarity_fn_name": {
-                    "anyOf": [
-                        {
-                            "type": "string"
-                        },
-                        {
-                            "type": "null"
-                        }
-                    ],
                     "default": "cosine",
-                    "description": "Name of the similarity function to use (cosine, dot, euclidean, manhattan).",
-                    "title": "Similarity Fn Name"
+                    "description": "Name of the similarity function to use.",
+                    "enum": [
+                        "cosine",
+                        "dot",
+                        "euclidean",
+                        "manhattan"
+                    ],
+                    "title": "Similarity Fn Name",
+                    "type": "string"
                 },
                 "use_cache": {
-                    "default": false,
+                    "default": true,
                     "description": "Whether to use embeddings caching.",
                     "title": "Use Cache",
                     "type": "boolean"
+                },
+                "freeze": {
+                    "default": true,
+                    "description": "Whether to freeze the model parameters.",
+                    "title": "Freeze",
+                    "type": "boolean"
                 }
             },
             "title": "EmbedderConfig",
@@ -418,7 +423,8 @@
                 "query_prompt": null,
                 "passage_prompt": null,
                 "similarity_fn_name": "cosine",
-                "use_cache": false
+                "use_cache": true,
+                "freeze": true
             }
         },
         "cross_encoder_config": {
diff --git a/tests/callback/test_callback.py b/tests/callback/test_callback.py
@@ -140,12 +140,13 @@ def test_pipeline_callbacks(dataset):
                         "cluster_prompt": None,
                         "default_prompt": None,
                         "device": None,
+                        "freeze": True,
                         "tokenizer_config": {"max_length": None, "truncation": True, "padding": True},
                         "model_name": "sergeyzh/rubert-tiny-turbo",
                         "passage_prompt": None,
                         "query_prompt": None,
                         "sts_prompt": None,
-                        "use_cache": False,
+                        "use_cache": True,
                         "similarity_fn_name": "cosine",
                         "trust_remote_code": False,
                     },
@@ -176,12 +177,13 @@ def test_pipeline_callbacks(dataset):
                         "cluster_prompt": None,
                         "default_prompt": None,
                         "device": None,
+                        "freeze": True,
                         "tokenizer_config": {"max_length": None, "truncation": True, "padding": True},
                         "model_name": "sergeyzh/rubert-tiny-turbo",
                         "passage_prompt": None,
                         "query_prompt": None,
                         "sts_prompt": None,
-                        "use_cache": False,
+                        "use_cache": True,
                         "similarity_fn_name": "cosine",
                         "trust_remote_code": False,
                     },
@@ -212,12 +214,13 @@ def test_pipeline_callbacks(dataset):
                         "cluster_prompt": None,
                         "default_prompt": None,
                         "device": None,
+                        "freeze": True,
                         "tokenizer_config": {"max_length": None, "truncation": True, "padding": True},
                         "model_name": "sergeyzh/rubert-tiny-turbo",
                         "passage_prompt": None,
                         "query_prompt": None,
                         "sts_prompt": None,
-                        "use_cache": False,
+                        "use_cache": True,
                         "similarity_fn_name": "cosine",
                         "trust_remote_code": False,
                     },
diff --git a/user_guides/advanced/02_automl.py b/user_guides/advanced/02_automl.py