Commit e72bde5

use our metrics
Parent: fc567ee

2 files changed (+9, -14 lines)

autointent/configs/_transformers.py

Lines changed: 3 additions & 2 deletions
@@ -4,6 +4,8 @@
 from pydantic import BaseModel, ConfigDict, Field, PositiveInt
 from typing_extensions import Self, assert_never

+from autointent.metrics import SCORING_METRICS_MULTICLASS, SCORING_METRICS_MULTILABEL
+

 class TokenizerConfig(BaseModel):
     padding: bool | Literal["longest", "max_length", "do_not_pad"] = True

@@ -128,5 +130,4 @@ class EarlyStoppingConfig(BaseModel):
     val_fraction: float = 0.2
     patience: int = 1
     threshold: float = 0.0
-    metric: Literal["f1", "accuracy", "recall", "precision"] | None = "f1"
-    averaging: Literal["macro", "micro"] = "macro"  # doesnt affect `accuracy`
+    metric: Literal[tuple((SCORING_METRICS_MULTILABEL | SCORING_METRICS_MULTICLASS).keys())] | None = "scoring_f1"  # type: ignore[valid-type]
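The metric field of EarlyStoppingConfig now takes its allowed values from the keys of the library's own scoring-metric registries instead of a hard-coded list, and the separate averaging field is dropped, presumably because averaging is handled inside the registered metrics themselves. Below is a minimal sketch of the dynamic-Literal pattern this relies on, written against a hypothetical registry: the real SCORING_METRICS_MULTICLASS / SCORING_METRICS_MULTILABEL dicts live in autointent.metrics, and only the "scoring_f1" key is confirmed by the new default value.

# Minimal sketch of the dynamic-Literal pattern, against a hypothetical registry.
# The real registries live in autointent.metrics; only "scoring_f1" is confirmed
# here (it is the new default).
from typing import Literal

from pydantic import BaseModel

HYPOTHETICAL_SCORING_METRICS: dict[str, object] = {
    "scoring_f1": object(),        # stand-ins for the real metric callables
    "scoring_accuracy": object(),
}


class EarlyStoppingSketch(BaseModel):
    # The Literal is built at import time from the registry keys, so the set of
    # accepted metric names stays in sync with whatever the library registers.
    metric: Literal[tuple(HYPOTHETICAL_SCORING_METRICS.keys())] | None = "scoring_f1"  # type: ignore[valid-type]


EarlyStoppingSketch(metric="scoring_accuracy")   # validates
# EarlyStoppingSketch(metric="f1")               # would now fail validation

Because the annotation is computed at runtime, static type checkers cannot see the literal values, hence the type: ignore[valid-type] in both the sketch and the real code.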

autointent/modules/scoring/_bert.py

Lines changed: 6 additions & 12 deletions
@@ -4,7 +4,6 @@
 from collections.abc import Callable
 from typing import Any

-import evaluate
 import numpy as np
 import numpy.typing as npt
 import torch

@@ -25,6 +24,7 @@
 from autointent._callbacks import REPORTERS_NAMES
 from autointent.configs import EarlyStoppingConfig, HFModelConfig
 from autointent.custom_types import ListOfLabels
+from autointent.metrics import SCORING_METRICS_MULTICLASS, SCORING_METRICS_MULTILABEL
 from autointent.modules.base import BaseScorer

@@ -191,19 +191,13 @@ def _get_compute_metrics(self) -> Callable[[EvalPrediction], dict[str, float]] | None:
         if self.early_stopping_config.metric is None:
             return None

-        metric_fn = evaluate.load(self.early_stopping_config.metric)
-
-        compute_kwargs = {}
-
-        if self.early_stopping_config.metric in ["f1", "recall", "precision"]:
-            compute_kwargs["average"] = self.early_stopping_config.averaging
+        metric_name = self.early_stopping_config.metric
+        metric_fn = (SCORING_METRICS_MULTILABEL | SCORING_METRICS_MULTICLASS)[metric_name]

         def compute_metrics(output: EvalPrediction) -> dict[str, float]:
-            return metric_fn.compute(  # type: ignore[no-any-return]
-                predictions=output.predictions.argmax(axis=-1).tolist(),  # type: ignore[union-attr]
-                references=output.label_ids,
-                **compute_kwargs,
-            )
+            return {
+                metric_name: metric_fn(output.label_ids.tolist(), output.predictions.tolist())  # type: ignore[union-attr]
+            }

         return compute_metrics

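In _bert.py the evaluate dependency is removed from _get_compute_metrics: instead of evaluate.load(...) plus an argmax and an optional average kwarg, the metric callable is looked up in the merged internal registries and handed the raw label ids and prediction scores directly. A likely motivation is that early stopping then scores with the same metric implementations as the rest of the library. The sketch below shows that callback shape under the assumption that each registered metric is a plain callable of the form (labels, predictions) -> float; the toy accuracy metric and the registry are hypothetical, and only the lookup-and-closure structure mirrors the diff.

# Sketch of the reworked callback, assuming registry values are plain callables
# taking (labels, predictions) and returning a float. The toy metric below is an
# assumption; the real ones come from autointent.metrics.
from collections.abc import Callable

import numpy as np
from transformers import EvalPrediction

HYPOTHETICAL_METRIC_REGISTRY: dict[str, Callable[..., float]] = {
    "scoring_accuracy": lambda labels, scores: float(
        (np.asarray(scores).argmax(axis=-1) == np.asarray(labels)).mean()
    ),
}


def get_compute_metrics(metric_name: str) -> Callable[[EvalPrediction], dict[str, float]]:
    metric_fn = HYPOTHETICAL_METRIC_REGISTRY[metric_name]

    def compute_metrics(output: EvalPrediction) -> dict[str, float]:
        # Label ids and prediction scores are passed through unchanged; each
        # metric decides how to interpret the scores (argmax, thresholding, ...).
        return {metric_name: metric_fn(output.label_ids.tolist(), output.predictions.tolist())}

    return compute_metrics


# Usage with a Hugging Face Trainer:
#   Trainer(..., compute_metrics=get_compute_metrics("scoring_accuracy"))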