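Update after merge: switch SklearnScorer to the Embedder `model_name_or_path` argument and drop its vector-index (precomputed embeddings) path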

Samoed · Samoed · commit cf325f1697e7 · 2025-01-19T15:41:31.000+03:00
diff --git a/autointent/_embedder.py b/autointent/_embedder.py
@@ -70,7 +70,7 @@ def __init__(
         """
         Initialize the Embedder.
 
-        :param model_name: Path to a local model directory or a Hugging Face model name.
+        :param model_name_or_path: Path to a local model directory or a Hugging Face model name.
         :param device: Device to run the model on (e.g., "cpu", "cuda").
         :param batch_size: Batch size for embedding calculations.
         :param max_length: Maximum sequence length for the embedding model.
diff --git a/autointent/modules/__init__.py b/autointent/modules/__init__.py
@@ -25,11 +25,23 @@ def _create_modules_dict(modules: list[type[T]]) -> dict[str, type[T]]:
 RETRIEVAL_MODULES_MULTILABEL = RETRIEVAL_MODULES_MULTICLASS
 
 SCORING_MODULES_MULTICLASS: dict[str, type[ScoringModule]] = _create_modules_dict(
-    [DNNCScorer, KNNScorer, LinearScorer, DescriptionScorer, RerankScorer, SklearnScorer,]
+    [
+        DNNCScorer,
+        KNNScorer,
+        LinearScorer,
+        DescriptionScorer,
+        RerankScorer,
+        SklearnScorer,
+    ]
 )
 
 SCORING_MODULES_MULTILABEL: dict[str, type[ScoringModule]] = _create_modules_dict(
-    [MLKnnScorer, LinearScorer, DescriptionScorer, SklearnScorer, ],
+    [
+        MLKnnScorer,
+        LinearScorer,
+        DescriptionScorer,
+        SklearnScorer,
+    ],
 )
 
 PREDICTION_MODULES_MULTICLASS: dict[str, type[DecisionModule]] = _create_modules_dict(
@@ -40,29 +52,4 @@ def _create_modules_dict(modules: list[type[T]]) -> dict[str, type[T]]:
     [AdaptiveDecision, ThresholdDecision, TunableDecision],
 )
 
-__all__ = [
-    "PREDICTION_MODULES_MULTICLASS",
-    "PREDICTION_MODULES_MULTILABEL",
-    "RETRIEVAL_MODULES_MULTICLASS",
-    "RETRIEVAL_MODULES_MULTILABEL",
-    "SCORING_MODULES_MULTICLASS",
-    "SCORING_MODULES_MULTILABEL",
-    "AdaptivePredictor",
-    "ArgmaxPredictor",
-    "DNNCScorer",
-    "DescriptionScorer",
-    "JinoosPredictor",
-    "KNNScorer",
-    "LinearScorer",
-    "MLKnnScorer",
-    "Module",
-    "PredictionModule",
-    "RegExp",
-    "RerankScorer",
-    "RetrievalModule",
-    "ScoringModule",
-    "ThresholdPredictor",
-    "TunablePredictor",
-    "VectorDBModule",
-    "SklearnScorer",
-]
+__all__ = []  # type: ignore[var-annotated]
diff --git a/autointent/modules/scoring/_sklearn/scorer.py b/autointent/modules/scoring/_sklearn/scorer.py
@@ -5,14 +5,14 @@
 import joblib
 import numpy as np
 import numpy.typing as npt
+from sklearn.linear_model import LogisticRegression
 from sklearn.multioutput import MultiOutputClassifier
 from sklearn.utils import all_estimators
 from typing_extensions import Self
 
 from autointent import Context, Embedder
-from autointent.context.vector_index_client import VectorIndexClient
 from autointent.custom_types import BaseMetadataDict, LabelType
-from autointent.modules.scoring._base import ScoringModule
+from autointent.modules.abc import ScoringModule
 
 AVAILIABLE_CLASSIFIERS = {name: class_ for name, class_ in all_estimators() if hasattr(class_, "predict_proba")}
 
@@ -90,7 +90,7 @@ def __init__(
     def from_context(
         cls,
         context: Context,
-        clf_name: str,
+        clf_name: str = LogisticRegression.__name__,
         clf_args: dict[str, Any] | None = None,
         embedder_name: str | None = None,
     ) -> Self:
@@ -105,10 +105,8 @@ def from_context(
         """
         if embedder_name is None:
             embedder_name = context.optimization_info.get_best_embedder()
-            precomputed_embeddings = True
-        else:
-            precomputed_embeddings = context.vector_index_client.exists(embedder_name)
-        instance = cls(
+
+        return cls(
             embedder_name=embedder_name,
             device=context.get_device(),
             seed=context.seed,
@@ -117,9 +115,6 @@ def from_context(
             clf_name=clf_name,
             clf_args=clf_args,
         )
-        instance.precomputed_embeddings = precomputed_embeddings
-        instance.db_dir = str(context.get_db_dir())
-        return instance
 
     def fit(
         self,
@@ -135,23 +130,13 @@ def fit(
         """
         self._multilabel = isinstance(labels[0], list)
 
-        if self.precomputed_embeddings:
-            # this happens only when SklearnScorer is within Pipeline opimization after RetrievalNode optimization
-            vector_index_client = VectorIndexClient(self.device, self.db_dir, self.batch_size, self.max_length)
-            vector_index = vector_index_client.get_index(self.embedder_name)
-            features = vector_index.get_all_embeddings()
-            if len(features) != len(utterances):
-                msg = "Vector index mismatches provided utterances"
-                raise ValueError(msg)
-            embedder = vector_index.embedder
-        else:
-            embedder = Embedder(
-                device=self.device,
-                model_name=self.embedder_name,
-                batch_size=self.batch_size,
-                max_length=self.max_length,
-            )
-            features = embedder.embed(utterances)
+        embedder = Embedder(
+            device=self.device,
+            model_name_or_path=self.embedder_name,
+            batch_size=self.batch_size,
+            max_length=self.max_length,
+        )
+        features = embedder.embed(utterances)
         self.clf_args = {} if self.clf_args is None else self.clf_args
         if AVAILIABLE_CLASSIFIERS.get(self.clf_name):
             base_clf = AVAILIABLE_CLASSIFIERS[self.clf_name](**self.clf_args)
@@ -229,7 +214,7 @@ def load(self, path: str) -> None:
         embedder_dir = dump_dir / self.embedding_model_subdir
         self._embedder = Embedder(
             device=self.device,
-            model_name=embedder_dir,
+            model_name_or_path=embedder_dir,
             batch_size=metadata["batch_size"],
             max_length=metadata["max_length"],
         )