Merged
8 changes: 4 additions & 4 deletions autointent/_datafiles/default-multiclass-config.yaml
@@ -1,14 +1,14 @@
# TODO: make up a better and more versatile config
- node_type: embedding
metric: retrieval_hit_rate
target_metric: retrieval_hit_rate
search_space:
- module_name: retrieval
k: [10]
embedder_name:
- avsolatorio/GIST-small-Embedding-v0
- infgrad/stella-base-en-v2
- node_type: scoring
metric: scoring_roc_auc
target_metric: scoring_roc_auc
search_space:
- module_name: knn
k: [1, 3, 5, 10]
@@ -20,8 +20,8 @@
- cross-encoder/ms-marco-MiniLM-L-6-v2
k: [1, 3, 5, 10]
- node_type: decision
metric: decision_accuracy
target_metric: decision_accuracy
search_space:
- module_name: threshold
thresh: [0.5]
- module_name: argmax
- module_name: argmax
8 changes: 4 additions & 4 deletions autointent/_datafiles/default-multilabel-config.yaml
@@ -1,21 +1,21 @@
# TODO: make up a better and more versatile config
- node_type: embedding
metric: retrieval_hit_rate_intersecting
target_metric: retrieval_hit_rate_intersecting
search_space:
- module_name: retrieval
k: [10]
embedder_name:
- deepvk/USER-bge-m3
- node_type: scoring
metric: scoring_roc_auc
target_metric: scoring_roc_auc
search_space:
- module_name: knn
k: [3]
weights: ["uniform", "distance", "closest"]
- module_name: linear
- node_type: decision
metric: decision_accuracy
target_metric: decision_accuracy
search_space:
- module_name: threshold
thresh: [0.5]
- module_name: adaptive
- module_name: adaptive
8 changes: 2 additions & 6 deletions autointent/modules/abc/_base.py
@@ -34,11 +34,7 @@ def fit(self, *args: tuple[Any], **kwargs: dict[str, Any]) -> None:
"""

@abstractmethod
def score(
self,
context: Context,
split: Literal["validation", "test"],
) -> dict[str, float | str]:
def score(self, context: Context, split: Literal["validation", "test"], metrics: list[str]) -> dict[str, float]:
"""
Calculate metric on test set and return metric value.

@@ -110,7 +106,7 @@ def get_embedder_name(self) -> str | None:
return None

@staticmethod
def score_metrics(params: tuple[Any, Any], metrics_dict: dict[str, Any]) -> dict[str, float | str]:
def score_metrics(params: tuple[Any, Any], metrics_dict: dict[str, Any]) -> dict[str, float]:
"""
Score metrics on the test set.

9 changes: 3 additions & 6 deletions autointent/modules/abc/_decision.py
@@ -40,11 +40,7 @@ def predict(self, scores: npt.NDArray[Any]) -> ListOfGenericLabels:
:param scores: Scores to predict
"""

def score(
self,
context: Context,
split: Literal["validation", "test"],
) -> dict[str, float | str]:
def score(self, context: Context, split: Literal["validation", "test"], metrics: list[str]) -> dict[str, float]:
Member
The docstrings haven't been updated anywhere.

"""
Calculate metric on test set and return metric value.

@@ -54,7 +50,8 @@ def score(
"""
labels, scores = get_decision_evaluation_data(context, split)
self._decisions = self.predict(scores)
return self.score_metrics((labels, self._decisions), PREDICTION_METRICS_MULTICLASS)
chosen_metrics = {name: fn for name, fn in PREDICTION_METRICS_MULTICLASS.items() if name in metrics}
return self.score_metrics((labels, self._decisions), chosen_metrics)

def get_assets(self) -> DecisionArtifact:
"""Return useful assets that represent intermediate data into context."""
9 changes: 3 additions & 6 deletions autointent/modules/abc/_scoring.py
@@ -21,11 +21,7 @@ class ScoringModule(Module, ABC):

supports_oos = False

def score(
self,
context: Context,
split: Literal["validation", "test"],
) -> dict[str, float | str]:
def score(self, context: Context, split: Literal["validation", "test"], metrics: list[str]) -> dict[str, float]:
"""
Evaluate the scorer on a test set and compute the specified metric.

@@ -50,7 +46,8 @@ def score(
self._test_scores = self.predict(context.data_handler.test_utterances())

metrics_dict = SCORING_METRICS_MULTILABEL if context.is_multilabel() else SCORING_METRICS_MULTICLASS
return self.score_metrics((labels, scores), metrics_dict)
chosen_metrics = {name: fn for name, fn in metrics_dict.items() if name in metrics}
return self.score_metrics((labels, scores), chosen_metrics)

def get_assets(self) -> ScorerArtifact:
"""
9 changes: 3 additions & 6 deletions autointent/modules/embedding/_logreg.py
@@ -129,11 +129,7 @@ def fit(self, utterances: list[str], labels: ListOfLabels) -> None:

self._classifier.fit(embeddings, labels)

def score(
self,
context: Context,
split: Literal["validation", "test"],
) -> dict[str, float | str]:
def score(self, context: Context, split: Literal["validation", "test"], metrics: list[str]) -> dict[str, float]:
"""
Evaluate the embedding model using a specified metric function.

@@ -153,7 +149,8 @@ def score(

probas = self.predict(utterances)
metrics_dict = SCORING_METRICS_MULTILABEL if context.is_multilabel() else SCORING_METRICS_MULTICLASS
return self.score_metrics((labels, probas), metrics_dict)
chosen_metrics = {name: fn for name, fn in metrics_dict.items() if name in metrics}
return self.score_metrics((labels, probas), chosen_metrics)

def get_assets(self) -> RetrieverArtifact:
"""
9 changes: 3 additions & 6 deletions autointent/modules/embedding/_retrieval.py
@@ -109,11 +109,7 @@ def fit(self, utterances: list[str], labels: ListOfLabels) -> None:
)
self._vector_index.add(utterances, labels)

def score(
self,
context: Context,
split: Literal["validation", "test"],
) -> dict[str, float | str]:
def score(self, context: Context, split: Literal["validation", "test"], metrics: list[str]) -> dict[str, float]:
"""
Evaluate the embedding model using a specified metric function.

@@ -133,7 +129,8 @@ def score(
predictions, _, _ = self._vector_index.query(utterances, self.k)

metrics_dict = RETRIEVAL_METRICS_MULTILABEL if context.is_multilabel() else RETRIEVAL_METRICS_MULTICLASS
return self.score_metrics((labels, predictions), metrics_dict)
chosen_metrics = {name: fn for name, fn in metrics_dict.items() if name in metrics}
return self.score_metrics((labels, predictions), chosen_metrics)

def get_assets(self) -> RetrieverArtifact:
"""
9 changes: 3 additions & 6 deletions autointent/modules/regexp/_regexp.py
@@ -108,11 +108,7 @@ def _predict_single(self, utterance: str) -> tuple[LabelType, dict[str, list[str
matches["partial_matches"].extend(intent_matches["partial_matches"])
return list(prediction), matches

def score(
self,
context: Context,
split: Literal["validation", "test"],
) -> dict[str, float | str]:
def score(self, context: Context, split: Literal["validation", "test"], metrics: list[str]) -> dict[str, float]:
"""
Calculate metric on test set and return metric value.

@@ -128,7 +124,8 @@ def score(
if assets["test_matches"] is None:
msg = "no matches found"
raise ValueError(msg)
return self.score_metrics((context.data_handler.test_labels(), assets["test_matches"]), REGEXP_METRICS)
chosen_metrics = {name: fn for name, fn in REGEXP_METRICS.items() if name in metrics}
return self.score_metrics((context.data_handler.test_labels(), assets["test_matches"]), chosen_metrics)

def clear_cache(self) -> None:
"""Clear cache."""
27 changes: 17 additions & 10 deletions autointent/nodes/_optimization/_node_optimizer.py
@@ -19,7 +19,13 @@
class NodeOptimizer:
"""Node optimizer class."""

def __init__(self, node_type: NodeType, search_space: list[dict[str, Any]], metric: str) -> None:
def __init__(
self,
node_type: NodeType,
search_space: list[dict[str, Any]],
target_metric: str,
metrics: list[str] | None = None,
) -> None:
"""
Initialize the node optimizer.

@@ -29,7 +35,12 @@ def __init__(self, node_type: NodeType, search_space: list[dict[str, Any]], metr
"""
self.node_type = node_type
self.node_info = NODES_INFO[node_type]
self.metric_name = metric
self.decision_metric_name = target_metric
Member
Why do we still have decision_metric_name here?


self.metrics = metrics if metrics is not None else []
if self.decision_metric_name not in self.metrics:
Member
@Samoed Samoed Jan 31, 2025
And what happens if it isn't in our metrics?

self.metrics.append(self.decision_metric_name)

self.modules_search_spaces = search_space # TODO search space validation
self._logger = logging.getLogger(__name__) # TODO solve duplicate logging messages problem

@@ -61,14 +72,10 @@ def fit(self, context: Context) -> None:
self.module_fit(module, context)

self._logger.debug("scoring %s module...", module_name)
metrics = module.score(context, "validation")
metric_value = metrics[self.metric_name]

# some metrics can produce error. When main metric produces error raise it.
if isinstance(metric_value, str):
raise Exception(metric_value) # noqa: TRY004, TRY002
metrics_score = module.score(context, "validation", self.metrics)
metric_value = metrics_score[self.decision_metric_name]

context.callback_handler.log_metrics(metrics)
context.callback_handler.log_metrics(metrics_score)
context.callback_handler.end_module()

dump_dir = context.get_dump_dir()
@@ -84,7 +91,7 @@
module_name,
module_kwargs,
metric_value,
self.metric_name,
self.decision_metric_name,
module.get_assets(), # retriever name / scores / predictions
module_dump_dir,
module=module if not context.is_ram_to_clear() else None,
6 changes: 3 additions & 3 deletions tests/assets/configs/description.yaml
@@ -1,16 +1,16 @@
- node_type: embedding
metric: retrieval_hit_rate
target_metric: retrieval_hit_rate
search_space:
- module_name: retrieval
k: [10]
embedder_name:
- sentence-transformers/all-MiniLM-L6-v2
- node_type: scoring
metric: scoring_roc_auc
target_metric: scoring_roc_auc
search_space:
- module_name: description
temperature: [1.0, 0.5, 0.1, 0.05]
- node_type: decision
metric: decision_accuracy
target_metric: decision_accuracy
search_space:
- module_name: argmax
8 changes: 4 additions & 4 deletions tests/assets/configs/multiclass.yaml
@@ -1,13 +1,13 @@
- node_type: embedding
metric: retrieval_hit_rate
target_metric: retrieval_hit_rate
search_space:
- module_name: retrieval
k: [10]
embedder_name:
- sentence-transformers/all-MiniLM-L6-v2
- avsolatorio/GIST-small-Embedding-v0
- node_type: scoring
metric: scoring_roc_auc
target_metric: scoring_roc_auc
search_space:
- module_name: knn
k: [5, 10]
@@ -32,10 +32,10 @@
cross_encoder_name:
- cross-encoder/ms-marco-MiniLM-L-6-v2
- node_type: decision
metric: decision_accuracy
target_metric: decision_accuracy
search_space:
- module_name: threshold
thresh: [0.5, [0.5, 0.5, 0.5, 0.5]]
- module_name: tunable
- module_name: argmax
- module_name: jinoos
- module_name: jinoos
6 changes: 3 additions & 3 deletions tests/assets/configs/multilabel.yaml
@@ -1,13 +1,13 @@
- node_type: embedding
metric: scoring_accuracy
target_metric: scoring_accuracy
search_space:
- module_name: logreg
cv: [2]
embedder_name:
- sentence-transformers/all-MiniLM-L6-v2
- avsolatorio/GIST-small-Embedding-v0
- node_type: scoring
metric: scoring_roc_auc
target_metric: scoring_roc_auc
search_space:
- module_name: knn
k: [5, 10]
@@ -28,7 +28,7 @@
cross_encoder_name:
- cross-encoder/ms-marco-MiniLM-L-6-v2
- node_type: decision
metric: decision_accuracy
target_metric: decision_accuracy
search_space:
- module_name: threshold
thresh: [0.5, [0.5, 0.5, 0.5, 0.5]]
21 changes: 18 additions & 3 deletions tests/callback/test_callback.py
@@ -47,7 +47,8 @@ def test_pipeline_callbacks(dataset):
search_space = [
{
"node_type": "embedding",
"metric": "retrieval_hit_rate",
"target_metric": "retrieval_hit_rate",
"metrics": ["retrieval_map", "retrieval_mrr", "retrieval_ndcg", "retrieval_precision"],
"search_space": [
{
"module_name": "retrieval",
@@ -58,15 +59,29 @@
},
{
"node_type": "scoring",
"metric": "scoring_roc_auc",
"target_metric": "scoring_roc_auc",
"metrics": [
"scoring_accuracy",
"scoring_f1",
"scoring_log_likelihood",
"scoring_precision",
"scoring_recall",
],
"search_space": [
{"module_name": "knn", "k": [1], "weights": ["uniform", "distance"]},
{"module_name": "linear"},
],
},
{
"node_type": "decision",
"metric": "decision_accuracy",
"target_metric": "decision_accuracy",
"metrics": [
"decision_accuracy",
"decision_f1",
"decision_precision",
"decision_recall",
"decision_roc_auc",
],
"search_space": [{"module_name": "threshold", "thresh": [0.5]}, {"module_name": "argmax"}],
},
]
6 changes: 3 additions & 3 deletions tests/nodes/conftest.py
@@ -26,7 +26,7 @@ def get_embedding_optimizer(multilabel: bool):
if multilabel:
metric = metric + "_intersecting"
embedding_optimizer_config = {
"metric": metric,
"target_metric": metric,
"node_type": "embedding",
"search_space": [
{
@@ -48,7 +48,7 @@ def scoring_optimizer_multiclass(embedding_optimizer_multiclass):
embedding_optimizer_multiclass.fit(context)

scoring_optimizer_config = {
"metric": "scoring_roc_auc",
"target_metric": "scoring_roc_auc",
"node_type": "scoring",
"search_space": [
{"module_name": "linear"},
@@ -64,7 +64,7 @@ def scoring_optimizer_multilabel(embedding_optimizer_multilabel):
embedding_optimizer_multilabel.fit(context)

scoring_optimizer_config = {
"metric": "scoring_roc_auc",
"target_metric": "scoring_roc_auc",
"node_type": "scoring",
"search_space": [
{"module_name": "linear"},