Commit 9f6613b

report multiple scores
1 parent 071b38f commit 9f6613b

16 files changed (+201, -54 lines)


autointent/_callbacks/base.py

Lines changed: 8 additions & 0 deletions

@@ -42,6 +42,14 @@ def log_value(self, **kwargs: dict[str, Any]) -> None:
         :param kwargs: Data to log.
         """

+    @abstractmethod
+    def log_metrics(self, metrics: dict[str, Any]) -> None:
+        """
+        Log metrics during training.
+
+        :param metrics: Metrics to log.
+        """
+
     @abstractmethod
     def end_module(self) -> None:
         """End a module."""

autointent/_callbacks/callback_handler.py

Lines changed: 8 additions & 0 deletions

@@ -44,6 +44,14 @@ def log_value(self, **kwargs: dict[str, Any]) -> None:
         """
         self.call_events("log_value", **kwargs)

+    def log_metrics(self, metrics: dict[str, Any]) -> None:
+        """
+        Log metrics during training.
+
+        :param metrics: Metrics to log.
+        """
+        self.call_events("log_metrics", metrics=metrics)
+
     def end_module(self) -> None:
         """End a module."""
         self.call_events("end_module")

autointent/_callbacks/tensorboard.py

Lines changed: 16 additions & 0 deletions

@@ -73,6 +73,22 @@ def log_value(self, **kwargs: dict[str, Any]) -> None:
             else:
                 self.module_writer.add_text(key, str(value))  # type: ignore[no-untyped-call]

+    def log_metrics(self, metrics: dict[str, Any]) -> None:
+        """
+        Log metrics during training.
+
+        :param metrics: Metrics to log.
+        """
+        if self.module_writer is None:
+            msg = "start_run must be called before log_value."
+            raise RuntimeError(msg)
+
+        for key, value in metrics.items():
+            if isinstance(value, int | float):
+                self.module_writer.add_scalar(key, value)  # type: ignore[no-untyped-call]
+            else:
+                self.module_writer.add_text(key, str(value))  # type: ignore[no-untyped-call]
+
     def log_final_metrics(self, metrics: dict[str, Any]) -> None:
         """
         Log final metrics.
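
The TensorBoard backend routes each entry by type: numbers go to add_scalar, everything else (including error strings from metrics that failed) to add_text. The same routing in a standalone sketch using torch's SummaryWriter (log directory and metric values are made up):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/log_metrics_demo")
metrics = {"scoring_roc_auc": 0.93, "decision_f1": "Only one class present in y_true."}

for key, value in metrics.items():
    if isinstance(value, int | float):
        writer.add_scalar(key, value)     # plotted as a scalar
    else:
        writer.add_text(key, str(value))  # shown on the Text tab

writer.close()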

autointent/_callbacks/wandb.py

Lines changed: 8 additions & 0 deletions

@@ -59,6 +59,14 @@ def log_value(self, **kwargs: dict[str, Any]) -> None:
         """
         self.wandb.log(kwargs)

+    def log_metrics(self, metrics: dict[str, Any]) -> None:
+        """
+        Log metrics during training.
+
+        :param metrics: Metrics to log.
+        """
+        self.wandb.log(metrics)
+
     def log_final_metrics(self, metrics: dict[str, Any]) -> None:
         """
         Log final metrics.
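
The W&B backend hands the whole dictionary to wandb.log unchanged, so each numeric entry becomes its own chart. A minimal usage sketch (the project name is made up; offline mode avoids needing an account):

import wandb

run = wandb.init(project="autointent-demo", mode="offline")
run.log({"scoring_roc_auc": 0.93, "scoring_hit_rate": 0.88})
run.finish()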

autointent/_embedder.py

Lines changed: 1 addition & 1 deletion

@@ -179,4 +179,4 @@ def embed(self, utterances: list[str]) -> npt.NDArray[np.float32]:
         embeddings_path.parent.mkdir(parents=True, exist_ok=True)
         np.save(embeddings_path, embeddings)

-        return embeddings  # type: ignore[return-value]
+        return embeddings

autointent/metrics/decision.py

Lines changed: 6 additions & 6 deletions

@@ -50,7 +50,7 @@ def decision_accuracy(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> f
     :return: Score of the decision accuracy
     """
     y_true_, y_pred_ = transform(y_true, y_pred)
-    return np.mean(y_true_ == y_pred_)  # type: ignore[no-any-return]
+    return float(np.mean(y_true_ == y_pred_))


 def _decision_roc_auc_multiclass(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> float:
@@ -83,7 +83,7 @@ def _decision_roc_auc_multiclass(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE
         binarized_pred = (y_pred_ == k).astype(int)
         roc_auc_scores.append(roc_auc_score(binarized_true, binarized_pred))

-    return np.mean(roc_auc_scores)  # type: ignore[return-value]
+    return float(np.mean(roc_auc_scores))


 def _decision_roc_auc_multilabel(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> float:
@@ -98,7 +98,7 @@ def _decision_roc_auc_multilabel(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE
     :param y_pred: Predicted values of labels
     :return: Score of the decision accuracy
     """
-    return roc_auc_score(y_true, y_pred, average="macro")  # type: ignore[no-any-return]
+    return float(roc_auc_score(y_true, y_pred, average="macro"))


 def decision_roc_auc(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> float:
@@ -135,7 +135,7 @@ def decision_precision(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) ->
     :param y_pred: Predicted values of labels
     :return: Score of the decision precision
     """
-    return precision_score(y_true, y_pred, average="macro")  # type: ignore[no-any-return]
+    return float(precision_score(y_true, y_pred, average="macro"))


 def decision_recall(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> float:
@@ -150,7 +150,7 @@ def decision_recall(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> flo
     :param y_pred: Predicted values of labels
     :return: Score of the decision recall
     """
-    return recall_score(y_true, y_pred, average="macro")  # type: ignore[no-any-return]
+    return float(recall_score(y_true, y_pred, average="macro"))


 def decision_f1(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> float:
@@ -165,4 +165,4 @@ def decision_f1(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> float:
     :param y_pred: Predicted values of labels
     :return: Score of the decision accuracy
     """
-    return f1_score(y_true, y_pred, average="macro")  # type: ignore[no-any-return]
+    return float(f1_score(y_true, y_pred, average="macro"))
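
This file and the retrieval.py and scoring.py hunks below make the same mechanical change: NumPy and scikit-learn reducers return NumPy scalar types rather than plain Python floats, so these functions previously needed type-ignore pragmas to satisfy their declared -> float return type. Wrapping the result in float() removes the need for them. A quick illustration:

import numpy as np

raw = np.mean([1, 0, 1])
print(type(raw))         # <class 'numpy.float64'> -- a NumPy scalar
print(type(float(raw)))  # <class 'float'> -- the type the metric signatures promise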

autointent/metrics/retrieval.py

Lines changed: 1 addition & 1 deletion

@@ -630,7 +630,7 @@ def retrieval_ndcg(query_labels: LABELS_VALUE_TYPE, candidates_labels: CANDIDATE
         cur_idcg = _idcg(rel_scores, k)
         ndcg_scores.append(0.0 if cur_idcg == 0 else cur_dcg / cur_idcg)

-    return np.mean(ndcg_scores)  # type: ignore[return-value]
+    return float(np.mean(ndcg_scores))


 def retrieval_ndcg_intersecting(

autointent/metrics/scoring.py

Lines changed: 6 additions & 6 deletions

@@ -71,7 +71,7 @@ def scoring_log_likelihood(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE,
     log_likelihood = labels_array * np.log(scores_array) + (1 - labels_array) * np.log(1 - scores_array)
     clipped_one = log_likelihood.clip(min=-100, max=100)
     res = clipped_one.mean()
-    return res  # type: ignore[no-any-return]
+    return float(res)


 def scoring_roc_auc(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> float:
@@ -96,7 +96,7 @@ def scoring_roc_auc(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> flo
     if labels_.ndim == 1:
         labels_ = (labels_[:, None] == np.arange(n_classes)[None, :]).astype(int)

-    return roc_auc_score(labels_, scores_, average="macro")  # type: ignore[no-any-return]
+    return float(roc_auc_score(labels_, scores_, average="macro"))


 def _calculate_decision_metric(func: DecisionMetricFn, labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> float:
@@ -206,7 +206,7 @@ def scoring_hit_rate(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> fl
     top_ranked_labels = np.argmax(scores_, axis=1)
     is_in = labels_[np.arange(len(labels)), top_ranked_labels]

-    return np.mean(is_in)  # type: ignore[no-any-return]
+    return float(np.mean(is_in))


 def scoring_neg_coverage(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> float:
@@ -242,7 +242,7 @@ def scoring_neg_coverage(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -
     labels_, scores_ = transform(labels, scores)

     n_classes = scores_.shape[1]
-    return 1 - (coverage_error(labels, scores) - 1) / (n_classes - 1)  # type: ignore[no-any-return]
+    return float(1 - (coverage_error(labels, scores) - 1) / (n_classes - 1))


 def scoring_neg_ranking_loss(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> float:
@@ -258,7 +258,7 @@ def scoring_neg_ranking_loss(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYP
     :param scores: for each utterance, this list contains scores for each of `n_classes` classes
     :return: Score of the scoring metric
     """
-    return -label_ranking_loss(labels, scores)  # type: ignore[no-any-return]
+    return float(-label_ranking_loss(labels, scores))


 def scoring_map(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> float:
@@ -274,4 +274,4 @@ def scoring_map(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> float:
     :param scores: for each sample, this list contains scores for each of `n_classes` classes
     :return: mean average precision score
     """
-    return label_ranking_average_precision_score(labels, scores)  # type: ignore[no-any-return]
+    return float(label_ranking_average_precision_score(labels, scores))

autointent/modules/abc/_base.py

Lines changed: 21 additions & 4 deletions

@@ -8,7 +8,6 @@
 from autointent.context import Context
 from autointent.context.optimization_info import Artifact
 from autointent.custom_types import BaseMetadataDict
-from autointent.metrics import METRIC_FN


 class Module(ABC):
@@ -33,14 +32,15 @@ def score(
         self,
         context: Context,
         split: Literal["validation", "test"],
-        metric_fn: METRIC_FN,
-    ) -> float:
+        main_metric: str,
+    ) -> dict[str, float | str]:
         """
         Calculate metric on test set and return metric value.

         :param context: Context to score
         :param split: Split to score on
-        :param metric_fn: Metric function
+        :param main_metric: Name of main metric for evaluation
+        :return: Computed metrics value for the test set or error code of metrics
         """

     @abstractmethod
@@ -102,3 +102,20 @@ def from_context(cls, context: Context, **kwargs: dict[str, Any]) -> "Module":
     def get_embedder_name(self) -> str | None:
         """Experimental method."""
         return None
+
+    @staticmethod
+    def score_metrics(params: tuple[Any, Any], metrics_dict: dict[str, Any]) -> dict[str, float | str]:
+        """
+        Score metrics on the test set.
+
+        :param params: Params to score
+        :param metrics_dict:
+        :return:
+        """
+        metrics = {}
+        for metric_name, metric_fn in metrics_dict.items():
+            try:
+                metrics[metric_name] = metric_fn(*params)
+            except Exception as e:  # noqa: PERF203, BLE001
+                metrics[metric_name] = str(e)
+        return metrics
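
score_metrics is what makes "report multiple scores" robust: every metric in the dictionary is attempted, and a metric that raises is recorded as its error message instead of aborting the run. The same pattern in isolation, with scikit-learn metrics standing in for autointent's metric functions:

from sklearn.metrics import accuracy_score, roc_auc_score

y_true = [1, 1, 1, 1]  # only one class present
y_pred = [1, 0, 1, 1]

metrics_dict = {"decision_accuracy": accuracy_score, "decision_roc_auc": roc_auc_score}

metrics: dict[str, float | str] = {}
for metric_name, metric_fn in metrics_dict.items():
    try:
        metrics[metric_name] = metric_fn(y_true, y_pred)
    except Exception as e:  # noqa: BLE001
        metrics[metric_name] = str(e)

# decision_accuracy computes to 0.75; decision_roc_auc is stored as an error
# string because ROC AUC is undefined when y_true contains a single class.
print(metrics)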

autointent/modules/abc/_decision.py

Lines changed: 6 additions & 6 deletions

@@ -9,7 +9,7 @@
 from autointent import Context
 from autointent.context.optimization_info import DecisionArtifact
 from autointent.custom_types import LabelType
-from autointent.metrics import DecisionMetricFn
+from autointent.metrics import PREDICTION_METRICS_MULTICLASS
 from autointent.modules.abc import Module
 from autointent.schemas import Tag

@@ -44,19 +44,19 @@ def score(
         self,
         context: Context,
         split: Literal["validation", "test"],
-        metric_fn: DecisionMetricFn,
-    ) -> float:
+        main_metric: str,
+    ) -> dict[str, float | str]:
         """
         Calculate metric on test set and return metric value.

         :param context: Context to score
         :param split: Target split
-        :param metric_fn: Metric function
-        :return: Score
+        :param main_metric: Name of main metric for evaluation
+        :return: Computed metrics value for the test set or error code of metrics
         """
         labels, scores = get_decision_evaluation_data(context, split)
         self._decisions = self.predict(scores)
-        return metric_fn(labels, self._decisions)
+        return self.score_metrics((labels, self._decisions), PREDICTION_METRICS_MULTICLASS)

     def get_assets(self) -> DecisionArtifact:
         """Return useful assets that represent intermediate data into context."""
