Skip to content

Commit 0c3b3bc

Browse files
committed
add test
1 parent cf325f1 commit 0c3b3bc

File tree

4 files changed

+82
-8
lines changed

4 files changed

+82
-8
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
from .scorer import SklearnScorer
1+
from .sklearn_scorer import SklearnScorer
22

33
__all__ = ["SklearnScorer"]

autointent/modules/scoring/_sklearn/scorer.py renamed to autointent/modules/scoring/_sklearn/sklearn_scorer.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
import logging
23
from pathlib import Path
34
from typing import Any
45

@@ -14,7 +15,19 @@
1415
from autointent.custom_types import BaseMetadataDict, LabelType
1516
from autointent.modules.abc import ScoringModule
1617

17-
AVAILIABLE_CLASSIFIERS = {name: class_ for name, class_ in all_estimators() if hasattr(class_, "predict_proba")}
18+
logger = logging.getLogger(__name__)
19+
AVAILIABLE_CLASSIFIERS = {
20+
name: class_
21+
for name, class_ in all_estimators(
22+
type_filter=[
23+
# remove transformer (e.g. TfidfTransformer) from the list of available classifiers
24+
"classifier",
25+
"regressor",
26+
"cluster",
27+
]
28+
)
29+
if hasattr(class_, "predict_proba")
30+
}
1831

1932

2033
class SklearnScorerDumpDict(BaseMetadataDict):
@@ -64,7 +77,7 @@ def __init__(
6477
max_length: int | None = None,
6578
) -> None:
6679
"""
67-
Initialize the LinearScorer.
80+
Initialize the SklearnScorer.
6881
6982
:param embedder_name: Name of the embedder model.
7083
:param clf_name: Name of the sklearn classifier to use.
@@ -84,7 +97,7 @@ def __init__(
8497
self.batch_size = batch_size
8598
self.max_length = max_length
8699
self.clf_name = clf_name
87-
self.clf_args = clf_args
100+
self.clf_args = clf_args or {}
88101

89102
@classmethod
90103
def from_context(
@@ -122,7 +135,7 @@ def fit(
122135
labels: list[LabelType],
123136
) -> None:
124137
"""
125-
Train the chosen skearn classifier.
138+
Train the chosen sklearn classifier.
126139
127140
:param utterances: List of training utterances.
128141
:param labels: List of labels corresponding to the utterances.
@@ -137,11 +150,11 @@ def fit(
137150
max_length=self.max_length,
138151
)
139152
features = embedder.embed(utterances)
140-
self.clf_args = {} if self.clf_args is None else self.clf_args
141153
if AVAILIABLE_CLASSIFIERS.get(self.clf_name):
142154
base_clf = AVAILIABLE_CLASSIFIERS[self.clf_name](**self.clf_args)
143155
else:
144156
msg = f"Class {self.clf_name} does not exist in sklearn or does not have predict_proba method"
157+
logger.error(msg)
145158
raise ValueError(msg)
146159

147160
clf = MultiOutputClassifier(base_clf) if self._multilabel else base_clf
@@ -170,7 +183,7 @@ def clear_cache(self) -> None:
170183

171184
def dump(self, path: str) -> None:
172185
"""
173-
Save the LinearScorer's metadata, classifier, and embedder to disk.
186+
Save the SklearnScorer's metadata, classifier, and embedder to disk.
174187
175188
:param path: Path to the directory where assets will be dumped.
176189
"""

tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from autointent import Dataset
88

99

10-
def setup_environment() -> tuple[Path, Path, Path]:
10+
def setup_environment() -> tuple[Path, Path]:
1111
logs_dir = ires.files("tests").joinpath("logs") / str(uuid4())
1212
dump_dir = logs_dir / "modules_dump"
1313
return dump_dir, logs_dir
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
import numpy as np

from autointent.context.data_handler import DataHandler
from autointent.modules import SklearnScorer
from tests.conftest import setup_environment


def test_base_linear(dataset):
    """Fit a SklearnScorer (LogisticRegression over embeddings) and check its scores.

    Uses the project-wide ``dataset`` fixture; asserts the predicted class
    probabilities on five held-out utterances to 2 decimal places, then checks
    the metadata-returning prediction path.
    """
    dump_dir, logs_dir = setup_environment()

    data_handler = DataHandler(dataset)

    scorer = SklearnScorer(embedder_name="sergeyzh/rubert-tiny-turbo", clf_name="LogisticRegression")

    scorer.fit(data_handler.train_utterances(0), data_handler.train_labels(0))

    test_data = [
        "why is there a hold on my american saving bank account",
        "i am nost sure why my account is blocked",
        "why is there a hold on my capital one checking account",
        "i think my account is blocked but i do not know the reason",
        "can you tell me why is my bank account frozen",
    ]
    predictions = scorer.predict(test_data)

    # Reference probabilities per utterance (one row each, four classes).
    # decimal=2 keeps the test tolerant to small numerical drift in the
    # embedder/classifier stack.
    expected = np.array(
        [
            [0.23748632, 0.39067508, 0.2393372, 0.13250139],
            [0.23913757, 0.37610976, 0.24952359, 0.13522908],
            [0.25714506, 0.34984371, 0.25495681, 0.13805442],
            [0.2571957, 0.34850898, 0.25346288, 0.14083245],
            [0.23885061, 0.41527567, 0.21830964, 0.12756408],
        ],
    )
    np.testing.assert_almost_equal(expected, predictions, decimal=2)

    # The metadata-bearing variant must score every input and report no metadata.
    predictions, metadata = scorer.predict_with_metadata(test_data)
    assert len(predictions) == len(test_data)
    assert metadata is None

0 commit comments

Comments
 (0)