
Commit 0b507ae

report multiple scores (#91)
* report multiple scores
* try to fix
* fix score test
* remove main metric
* remove main metric
* fix
* fix
1 parent 6231290 commit 0b507ae

File tree

15 files changed, +196 -58 lines changed


autointent/_callbacks/base.py

Lines changed: 8 additions & 0 deletions
@@ -42,6 +42,14 @@ def log_value(self, **kwargs: dict[str, Any]) -> None:
         :param kwargs: Data to log.
         """
 
+    @abstractmethod
+    def log_metrics(self, metrics: dict[str, Any]) -> None:
+        """
+        Log metrics during training.
+
+        :param metrics: Metrics to log.
+        """
+
     @abstractmethod
     def end_module(self) -> None:
         """End a module."""

autointent/_callbacks/callback_handler.py

Lines changed: 8 additions & 0 deletions
@@ -44,6 +44,14 @@ def log_value(self, **kwargs: dict[str, Any]) -> None:
         """
         self.call_events("log_value", **kwargs)
 
+    def log_metrics(self, metrics: dict[str, Any]) -> None:
+        """
+        Log metrics during training.
+
+        :param metrics: Metrics to log.
+        """
+        self.call_events("log_metrics", metrics=metrics)
+
     def end_module(self) -> None:
         """End a module."""
         self.call_events("end_module")

autointent/_callbacks/tensorboard.py

Lines changed: 16 additions & 0 deletions
@@ -73,6 +73,22 @@ def log_value(self, **kwargs: dict[str, Any]) -> None:
             else:
                 self.module_writer.add_text(key, str(value))  # type: ignore[no-untyped-call]
 
+    def log_metrics(self, metrics: dict[str, Any]) -> None:
+        """
+        Log metrics during training.
+
+        :param metrics: Metrics to log.
+        """
+        if self.module_writer is None:
+            msg = "start_run must be called before log_value."
+            raise RuntimeError(msg)
+
+        for key, value in metrics.items():
+            if isinstance(value, int | float):
+                self.module_writer.add_scalar(key, value)  # type: ignore[no-untyped-call]
+            else:
+                self.module_writer.add_text(key, str(value))  # type: ignore[no-untyped-call]
+
     def log_final_metrics(self, metrics: dict[str, Any]) -> None:
         """
         Log final metrics.
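
The TensorBoard handler reuses the scalar-vs-text dispatch already used by log_value: numeric values become add_scalar calls, everything else (including error strings) becomes add_text. A small self-contained sketch of that dispatch, with plain dicts standing in for SummaryWriter; note that isinstance(value, int | float) requires Python 3.10+:

from typing import Any


def split_metrics(metrics: dict[str, Any]) -> tuple[dict[str, float], dict[str, str]]:
    """Separate numeric metric values (scalars) from everything else (logged as text)."""
    scalars: dict[str, float] = {}
    texts: dict[str, str] = {}
    for key, value in metrics.items():
        if isinstance(value, int | float):  # PEP 604 union in isinstance, Python >= 3.10
            scalars[key] = float(value)
        else:
            texts[key] = str(value)  # e.g. an error message produced by Module.score_metrics
    return scalars, texts


print(split_metrics({"scoring_accuracy": 0.93, "scoring_roc_auc": "metric failed"}))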

autointent/_callbacks/wandb.py

Lines changed: 8 additions & 0 deletions
@@ -59,6 +59,14 @@ def log_value(self, **kwargs: dict[str, Any]) -> None:
         """
         self.wandb.log(kwargs)
 
+    def log_metrics(self, metrics: dict[str, Any]) -> None:
+        """
+        Log metrics during training.
+
+        :param metrics: Metrics to log.
+        """
+        self.wandb.log(metrics)
+
     def log_final_metrics(self, metrics: dict[str, Any]) -> None:
         """
         Log final metrics.

autointent/metrics/decision.py

Lines changed: 6 additions & 6 deletions
@@ -50,7 +50,7 @@ def decision_accuracy(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> f
     :return: Score of the decision accuracy
     """
     y_true_, y_pred_ = transform(y_true, y_pred)
-    return np.mean(y_true_ == y_pred_)  # type: ignore[no-any-return]
+    return float(np.mean(y_true_ == y_pred_))
 
 
 def _decision_roc_auc_multiclass(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> float:
@@ -83,7 +83,7 @@ def _decision_roc_auc_multiclass(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE
         binarized_pred = (y_pred_ == k).astype(int)
         roc_auc_scores.append(roc_auc_score(binarized_true, binarized_pred))
 
-    return np.mean(roc_auc_scores)  # type: ignore[return-value]
+    return float(np.mean(roc_auc_scores))
 
 
 def _decision_roc_auc_multilabel(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> float:
@@ -98,7 +98,7 @@ def _decision_roc_auc_multilabel(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE
     :param y_pred: Predicted values of labels
     :return: Score of the decision accuracy
     """
-    return roc_auc_score(y_true, y_pred, average="macro")  # type: ignore[no-any-return]
+    return float(roc_auc_score(y_true, y_pred, average="macro"))
 
 
 def decision_roc_auc(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> float:
@@ -135,7 +135,7 @@ def decision_precision(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) ->
     :param y_pred: Predicted values of labels
     :return: Score of the decision precision
     """
-    return precision_score(y_true, y_pred, average="macro")  # type: ignore[no-any-return]
+    return float(precision_score(y_true, y_pred, average="macro"))
 
 
 def decision_recall(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> float:
@@ -150,7 +150,7 @@ def decision_recall(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> flo
     :param y_pred: Predicted values of labels
     :return: Score of the decision recall
     """
-    return recall_score(y_true, y_pred, average="macro")  # type: ignore[no-any-return]
+    return float(recall_score(y_true, y_pred, average="macro"))
 
 
 def decision_f1(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> float:
@@ -165,4 +165,4 @@ def decision_f1(y_true: LABELS_VALUE_TYPE, y_pred: LABELS_VALUE_TYPE) -> float:
     :param y_pred: Predicted values of labels
     :return: Score of the decision accuracy
     """
-    return f1_score(y_true, y_pred, average="macro")  # type: ignore[no-any-return]
+    return float(f1_score(y_true, y_pred, average="macro"))
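
The change in this file is mechanical but worth spelling out: NumPy reductions and scikit-learn scorers return NumPy scalar types such as numpy.float64, while the declared -> float annotation (and JSON-friendly reporting) expects a built-in float, hence the float(...) wrapping in place of the previous type: ignore comments. A standalone illustration, not part of the commit:

import numpy as np

raw = np.mean([1.0, 0.0, 1.0])             # numpy.float64
wrapped = float(np.mean([1.0, 0.0, 1.0]))  # built-in float

print(type(raw).__name__, type(wrapped).__name__)  # float64 float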

autointent/metrics/retrieval.py

Lines changed: 1 addition & 1 deletion
@@ -630,7 +630,7 @@ def retrieval_ndcg(query_labels: LABELS_VALUE_TYPE, candidates_labels: CANDIDATE
         cur_idcg = _idcg(rel_scores, k)
         ndcg_scores.append(0.0 if cur_idcg == 0 else cur_dcg / cur_idcg)
 
-    return np.mean(ndcg_scores)  # type: ignore[return-value]
+    return float(np.mean(ndcg_scores))
 
 
 def retrieval_ndcg_intersecting(

autointent/metrics/scoring.py

Lines changed: 7 additions & 6 deletions
@@ -71,7 +71,8 @@ def scoring_log_likelihood(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE,
     log_likelihood = labels_array * np.log(scores_array) + (1 - labels_array) * np.log(1 - scores_array)
     clipped_one = log_likelihood.clip(min=-100, max=100)
     res = clipped_one.mean()
-    return res  # type: ignore[no-any-return]
+    # test produces different output
+    return round(float(res), 6)
 
 
 def scoring_roc_auc(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> float:
@@ -96,7 +97,7 @@ def scoring_roc_auc(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> flo
     if labels_.ndim == 1:
         labels_ = (labels_[:, None] == np.arange(n_classes)[None, :]).astype(int)
 
-    return roc_auc_score(labels_, scores_, average="macro")  # type: ignore[no-any-return]
+    return float(roc_auc_score(labels_, scores_, average="macro"))
 
 
 def _calculate_decision_metric(func: DecisionMetricFn, labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> float:
@@ -206,7 +207,7 @@ def scoring_hit_rate(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> fl
     top_ranked_labels = np.argmax(scores_, axis=1)
     is_in = labels_[np.arange(len(labels)), top_ranked_labels]
 
-    return np.mean(is_in)  # type: ignore[no-any-return]
+    return float(np.mean(is_in))
 
 
 def scoring_neg_coverage(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> float:
@@ -242,7 +243,7 @@ def scoring_neg_coverage(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -
     labels_, scores_ = transform(labels, scores)
 
     n_classes = scores_.shape[1]
-    return 1 - (coverage_error(labels, scores) - 1) / (n_classes - 1)  # type: ignore[no-any-return]
+    return float(1 - (coverage_error(labels, scores) - 1) / (n_classes - 1))
 
 
 def scoring_neg_ranking_loss(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> float:
@@ -258,7 +259,7 @@ def scoring_neg_ranking_loss(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYP
     :param scores: for each utterance, this list contains scores for each of `n_classes` classes
     :return: Score of the scoring metric
     """
-    return -label_ranking_loss(labels, scores)  # type: ignore[no-any-return]
+    return float(-label_ranking_loss(labels, scores))
 
 
 def scoring_map(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> float:
@@ -274,4 +275,4 @@ def scoring_map(labels: LABELS_VALUE_TYPE, scores: SCORES_VALUE_TYPE) -> float:
     :param scores: for each sample, this list contains scores for each of `n_classes` classes
     :return: mean average precision score
     """
-    return label_ranking_average_precision_score(labels, scores)  # type: ignore[no-any-return]
+    return float(label_ranking_average_precision_score(labels, scores))
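
The only non-mechanical change here is in scoring_log_likelihood: the result is now rounded to six decimals, and the accompanying "# test produces different output" comment suggests this is meant to absorb tiny cross-platform floating-point differences in the test suite. As a hedged aside, tolerance-based comparison is the usual alternative; a minimal illustration of both:

import math

a = 0.1 + 0.2  # 0.30000000000000004 due to binary floating point
b = 0.3

print(round(a, 6) == round(b, 6))        # True: rounding hides the tiny difference
print(math.isclose(a, b, rel_tol=1e-9))  # True: tolerance-based comparison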

autointent/modules/abc/_base.py

Lines changed: 19 additions & 4 deletions
@@ -10,7 +10,6 @@
 from autointent.context import Context
 from autointent.context.optimization_info import Artifact
 from autointent.custom_types import BaseMetadataDict
-from autointent.metrics import METRIC_FN
 
 
 class Module(ABC):
@@ -35,14 +34,13 @@ def score(
         self,
         context: Context,
         split: Literal["validation", "test"],
-        metric_fn: METRIC_FN,
-    ) -> float:
+    ) -> dict[str, float | str]:
         """
         Calculate metric on test set and return metric value.
 
         :param context: Context to score
         :param split: Split to score on
-        :param metric_fn: Metric function
+        :return: Computed metrics value for the test set or error code of metrics
         """
 
     @abstractmethod
@@ -104,3 +102,20 @@ def from_context(cls, context: Context, **kwargs: dict[str, Any]) -> "Module":
     def get_embedder_name(self) -> str | None:
         """Experimental method."""
         return None
+
+    @staticmethod
+    def score_metrics(params: tuple[Any, Any], metrics_dict: dict[str, Any]) -> dict[str, float | str]:
+        """
+        Score metrics on the test set.
+
+        :param params: Params to score
+        :param metrics_dict:
+        :return:
+        """
+        metrics = {}
+        for metric_name, metric_fn in metrics_dict.items():
+            try:
+                metrics[metric_name] = metric_fn(*params)
+            except Exception as e:  # noqa: PERF203, BLE001
+                metrics[metric_name] = str(e)
+        return metrics
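
To make the new contract concrete, here is a small standalone sketch of what score_metrics does: every metric in the supplied registry is evaluated on the same argument tuple, and a metric that raises contributes its error message instead of a number, which is why the return type is dict[str, float | str]. The toy metrics and registry below are invented for illustration; the real registries live in autointent.metrics.

from typing import Any, Callable


def score_metrics(params: tuple[Any, Any], metrics_dict: dict[str, Callable[..., float]]) -> dict[str, float | str]:
    # Same logic as the staticmethod added to Module: collect every metric,
    # falling back to the exception text when a metric cannot be computed.
    metrics: dict[str, float | str] = {}
    for metric_name, metric_fn in metrics_dict.items():
        try:
            metrics[metric_name] = metric_fn(*params)
        except Exception as e:  # noqa: BLE001
            metrics[metric_name] = str(e)
    return metrics


def accuracy(y_true: list[int], y_pred: list[int]) -> float:
    return sum(t == p for t, p in zip(y_true, y_pred)) / len(y_true)


def always_fails(y_true: list[int], y_pred: list[int]) -> float:
    raise ValueError("metric not defined for this label layout")


registry = {"accuracy": accuracy, "broken_metric": always_fails}  # hypothetical registry
print(score_metrics(([1, 0, 1], [1, 1, 1]), registry))
# {'accuracy': 0.6666666666666666, 'broken_metric': 'metric not defined for this label layout'}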

autointent/modules/abc/_decision.py

Lines changed: 4 additions & 6 deletions
@@ -9,7 +9,7 @@
 from autointent import Context
 from autointent.context.optimization_info import DecisionArtifact
 from autointent.custom_types import LabelType
-from autointent.metrics import DecisionMetricFn
+from autointent.metrics import PREDICTION_METRICS_MULTICLASS
 from autointent.modules.abc import Module
 from autointent.schemas import Tag
 
@@ -44,19 +44,17 @@ def score(
         self,
         context: Context,
         split: Literal["validation", "test"],
-        metric_fn: DecisionMetricFn,
-    ) -> float:
+    ) -> dict[str, float | str]:
         """
         Calculate metric on test set and return metric value.
 
         :param context: Context to score
         :param split: Target split
-        :param metric_fn: Metric function
-        :return: Score
+        :return: Computed metrics value for the test set or error code of metrics
         """
         labels, scores = get_decision_evaluation_data(context, split)
         self._decisions = self.predict(scores)
-        return metric_fn(labels, self._decisions)
+        return self.score_metrics((labels, self._decisions), PREDICTION_METRICS_MULTICLASS)
 
     def get_assets(self) -> DecisionArtifact:
         """Return useful assets that represent intermediate data into context."""

autointent/modules/abc/_scoring.py

Lines changed: 5 additions & 6 deletions
@@ -8,7 +8,7 @@
 from autointent import Context
 from autointent.context.optimization_info import ScorerArtifact
 from autointent.custom_types import Split
-from autointent.metrics import ScoringMetricFn
+from autointent.metrics import SCORING_METRICS_MULTICLASS, SCORING_METRICS_MULTILABEL
 from autointent.modules.abc import Module
 
 
@@ -24,15 +24,13 @@ def score(
         self,
         context: Context,
         split: Literal["validation", "test"],
-        metric_fn: ScoringMetricFn,
-    ) -> float:
+    ) -> dict[str, float | str]:
         """
         Evaluate the scorer on a test set and compute the specified metric.
 
         :param context: Context containing test set and other data.
         :param split: Target split
-        :param metric_fn: Function to compute the scoring metric.
-        :return: Computed metric value for the test set.
+        :return: Computed metrics value for the test set or error code of metrics
         """
         if split == "validation":
             utterances = context.data_handler.validation_utterances(0)
@@ -58,7 +56,8 @@ def score(
         self._validation_scores = self.predict(context.data_handler.validation_utterances(1))
         self._test_scores = self.predict(context.data_handler.test_utterances())
 
-        return metric_fn(labels, scores)
+        metrics_dict = SCORING_METRICS_MULTILABEL if context.is_multilabel() else SCORING_METRICS_MULTICLASS
+        return self.score_metrics((labels, scores), metrics_dict)
 
     def get_assets(self) -> ScorerArtifact:
         """
