emdgroup · nihaase · Mar 10, 2026 · Copilot · Mar 10, 2026 · Copilot
diff --git a/examples/wf_multiclass_wine.py b/examples/wf_multiclass_wine.py
@@ -39,7 +39,6 @@
     target_col="target",
     sample_id_col="index",
     stratification_col="target",
-    metrics=["AUCROC_MACRO", "AUCROC_WEIGHTED", "ACCBAL_MC"],
     datasplit_seed_outer=1234,
     ignore_data_health_warning=True,
     outer_parallelization=True,

diff --git a/examples/wf_octo_autogluon.py b/examples/wf_octo_autogluon.py
@@ -71,7 +71,6 @@
     target_col="target",
     sample_id_col="index",
     stratification_col="target",  # Ensure balanced splits
-    metrics=["AUCROC", "ACCBAL", "ACC", "LOGLOSS", "F1"],
     n_folds_outer=5,  # 5-fold outer cross-validation
     ignore_data_health_warning=True,
     outer_parallelization=True,

diff --git a/examples/wf_roc_octo.py b/examples/wf_roc_octo.py
@@ -36,7 +36,6 @@
     target_col="target",
     sample_id_col="index",
     stratification_col="target",
-    metrics=["AUCROC", "ACCBAL", "ACC", "LOGLOSS"],
     datasplit_seed_outer=1234,
     ignore_data_health_warning=True,
     outer_parallelization=True,

diff --git a/octopus/modules/autogluon/core.py b/octopus/modules/autogluon/core.py
@@ -28,6 +28,7 @@
 )
 from octopus.logger import LogGroup, get_logger
 from octopus.manager.ray_parallel import setup_ray_for_external_library
+from octopus.metrics import Metrics
 from octopus.metrics.utils import get_score_from_model
 from octopus.modules.base import FIDataset, FIMethod, ModuleExecution, ModuleResult, ResultType
 from octopus.study.context import StudyContext
@@ -354,7 +355,7 @@ def _get_scores(
 
         # Test scores using Octopus metrics for comparison
         assert study_context.target_metric is not None, "target_metric should be set during fit()"
-        all_metrics = list(dict.fromkeys([*study_context.metrics, study_context.target_metric]))
+        all_metrics = Metrics.get_by_type(study_context.ml_type)
         test_performance_octo = {}
         for metric in all_metrics:
             assert feature_cols is not None, "feature_cols should be set during fit()"

diff --git a/octopus/modules/octo/bag.py b/octopus/modules/octo/bag.py
@@ -387,24 +387,25 @@ def get_predictions(self):
 
         return predictions
 
-    def get_performance(self):
+    def get_performance(self, metric: str | None = None):
         """Get performance using get_performance_from_predictions utility.
 
-        This is a simpler alternative to get_performance() that:
-        1. Gets predictions from bag.get_predictions()
-        2. Calculates performance using get_performance_from_predictions()
-        3. Restructures output to match expected format
+        Args:
+            metric: The metric to evaluate. Defaults to self.target_metric when None.
 
         Returns:
-            dict: Dictionary with performance values in the same format as get_performance()
+            dict: Dictionary with performance values for the evaluated metric.
         """
+        if metric is None:
+            metric = self.target_metric
+
         # Get predictions from the bag
         predictions = self.get_predictions()
 
         # Calculate performance using the utility function
         performance = get_performance_from_predictions(
             predictions=predictions,
-            target_metric=self.target_metric,
+            target_metric=metric,
             target_assignments=self.target_assignments,
             positive_class=self.positive_class,
         )
@@ -448,7 +449,7 @@ def get_performance_df(self, metric: str) -> pd.DataFrame:
         Returns:
             DataFrame with columns: metric, partition, aggregation, fold, value
         """
-        perf = self.get_performance()
+        perf = self.get_performance(metric=metric)
         rows = []
 
         # Per-fold scores

diff --git a/octopus/modules/octo/core.py b/octopus/modules/octo/core.py
@@ -13,6 +13,7 @@
 
 from octopus.datasplit import DataSplit, InnerSplits
 from octopus.logger import LogGroup, get_logger
+from octopus.metrics import Metrics
 from octopus.models import Models
 from octopus.modules.base import ModuleExecution, ModuleResult, ResultType
 from octopus.modules.mrmr.core import _maxrminr, _relevance_fstats
@@ -90,11 +91,16 @@ def fit(
 
         # Build best ModuleResult
         best_bag = results["best"]["_bag"]
+        all_metrics = Metrics.get_by_type(study_context.ml_type)
+        best_scores = pd.concat(
+            [best_bag.get_performance_df(metric=m) for m in all_metrics],
+            ignore_index=True,
+        )
         best_result = ModuleResult(
             result_type=ResultType.BEST,
             module=self.config.module,
             selected_features=best_selected_features,
-            scores=best_bag.get_performance_df(metric=study_context.target_metric),
+            scores=best_scores,
             predictions=best_bag.get_predictions_df(),
             feature_importances=best_bag.get_feature_importances_df(),
             model=best_bag,
@@ -111,11 +117,15 @@ def fit(
             # Always save ensemble result if it was produced
             if "ensel" in results:
                 ensel_bag = results["ensel"]["_bag"]
+                ensel_scores = pd.concat(
+                    [ensel_bag.get_performance_df(metric=m) for m in all_metrics],
+                    ignore_index=True,
+                )
                 ensel_result = ModuleResult(
                     result_type=ResultType.ENSEMBLE_SELECTION,
                     module=self.config.module,
                     selected_features=ensel_selected_features or best_selected_features,
-                    scores=ensel_bag.get_performance_df(metric=study_context.target_metric),
+                    scores=ensel_scores,
                     predictions=ensel_bag.get_predictions_df(),
                     feature_importances=ensel_bag.get_feature_importances_df(),
                     model=ensel_bag,

diff --git a/octopus/study/context.py b/octopus/study/context.py
@@ -20,9 +20,6 @@ class StudyContext:
     target_metric: str
     """Primary metric for model evaluation."""
 
-    metrics: list[str]
-    """All metrics to calculate."""
-
     target_assignments: dict[str, str]
     """Target column assignments (e.g. {'default': 'target'} or {'duration': ..., 'event': ...})."""
 

diff --git a/octopus/study/core.py b/octopus/study/core.py
@@ -106,12 +106,6 @@ def target_metric(self) -> str:
         """Get target metric. Must be implemented in subclasses."""
         ...
 
-    @property
-    @abstractmethod
-    def metrics(self) -> list:
-        """Get metrics list. Must be implemented in subclasses."""
-        ...
-
     @property
     @abstractmethod
     def target_assignments(self) -> dict[str, str]:
@@ -338,7 +332,6 @@ def _create_study_context(
         return StudyContext(
             ml_type=ml_type,
             target_metric=self.target_metric,
-            metrics=self.metrics,
             target_assignments=self.target_assignments,
             positive_class=positive_class,
             stratification_col=self.stratification_col,
@@ -396,18 +389,6 @@ class OctoRegression(OctoStudy):
     )
     """The primary metric used for model evaluation. Defaults to RMSE."""
 
-    metrics: list = field(
-        default=Factory(lambda self: [self.target_metric], takes_self=True),
-        validator=[
-            validators.instance_of(list),
-            validators.deep_iterable(
-                member_validator=validators.in_(Metrics.get_by_type(MLType.REGRESSION)),
-                iterable_validator=validators.instance_of(list),
-            ),
-        ],
-    )
-    """A list of metrics to be calculated. Defaults to target_metric value."""
-
     @property
     def target_assignments(self) -> dict[str, str]:
         """Get target assignments dict."""
@@ -434,18 +415,6 @@ class OctoClassification(OctoStudy):
     )
     """The primary metric used for model evaluation. Defaults to AUCROC."""
 
-    metrics: list = field(
-        default=Factory(lambda self: [self.target_metric], takes_self=True),
-        validator=[
-            validators.instance_of(list),
-            validators.deep_iterable(
-                member_validator=validators.in_(Metrics.get_by_type(MLType.BINARY, MLType.MULTICLASS)),
-                iterable_validator=validators.instance_of(list),
-            ),
-        ],
-    )
-    """A list of metrics to be calculated. Defaults to target_metric value."""
-
     positive_class: int | None = field(default=None, validator=validators.optional(validators.instance_of(int)))
     """The positive class label for binary classification. Defaults to None. Not used for multiclass."""
 
@@ -494,18 +463,6 @@ class OctoTimeToEvent(OctoStudy):
     )
     """The primary metric used for model evaluation. Defaults to CI (Concordance Index)."""
 
-    metrics: list = field(
-        default=Factory(lambda self: [self.target_metric], takes_self=True),
-        validator=[
-            validators.instance_of(list),
-            validators.deep_iterable(
-                member_validator=validators.in_(Metrics.get_by_type(MLType.TIMETOEVENT)),
-                iterable_validator=validators.instance_of(list),
-            ),
-        ],
-    )
-    """A list of metrics to be calculated. Defaults to target_metric value."""
-
     @property
     def target_assignments(self) -> dict[str, str]:
         """Get target assignments dict."""

diff --git a/tests/infrastructure/test_fsspec.py b/tests/infrastructure/test_fsspec.py
@@ -162,7 +162,6 @@ def run_experiment(self, breast_cancer_dataset, root_dir: UPath):
                     target_col="target",
                     sample_id_col="index",
                     stratification_col="target",
-                    metrics=["AUCROC", "ACCBAL", "ACC", "LOGLOSS"],
                     datasplit_seed_outer=1234,
                     n_folds_outer=2,
                     path=root_dir,

diff --git a/tests/manager/test_core.py b/tests/manager/test_core.py
@@ -55,7 +55,6 @@ def study():
     return StudyContext(
         ml_type=MLType.BINARY,
         target_metric="AUCROC",
-        metrics=["AUCROC"],
         target_assignments={"default": "target"},
         positive_class=1,
         stratification_col=None,

diff --git a/tests/predict/test_predict.py b/tests/predict/test_predict.py
@@ -75,7 +75,6 @@ def _run_classification_study() -> str:
         target_col="target",
         sample_id_col="index",
         stratification_col="target",
-        metrics=["AUCROC", "ACCBAL", "ACC"],
         datasplit_seed_outer=1234,
         n_folds_outer=2,
         path=tmp,

diff --git a/tests/study/test_core.py b/tests/study/test_core.py
@@ -117,31 +117,6 @@ def test_default_workflow():
         assert study.workflow[0].task_id == 0
 
 
-@pytest.mark.parametrize(
-    "metrics_input,expected_metrics",
-    [
-        (None, ["AUCROC"]),  # default metrics
-        (["AUCROC", "ACCBAL", "F1"], ["AUCROC", "ACCBAL", "F1"]),  # custom metrics
-    ],
-)
-def test_metrics(metrics_input, expected_metrics):
-    """Test metrics list with default and custom values."""
-    with tempfile.TemporaryDirectory() as temp_dir:
-        kwargs = {
-            "name": "test",
-            "target_metric": "AUCROC",
-            "feature_cols": ["f1"],
-            "target_col": "target",
-            "sample_id_col": "id",
-            "path": temp_dir,
-        }
-        if metrics_input is not None:
-            kwargs["metrics"] = metrics_input
-
-        study = OctoClassification(**kwargs)
-        assert study.metrics == expected_metrics
-
-
 def test_default_values():
     """Test default values are set correctly."""
     with tempfile.TemporaryDirectory() as temp_dir:

diff --git a/tests/workflows/test_ag_workflows.py b/tests/workflows/test_ag_workflows.py
@@ -54,7 +54,6 @@ def test_full_classification_workflow(self):
             target_col="target",
             sample_id_col="index",
             stratification_col="target",
-            metrics=["AUCROC", "ACCBAL", "ACC", "LOGLOSS"],
             datasplit_seed_outer=1234,
             n_folds_outer=5,
             path=self.studies_path,
@@ -120,7 +119,6 @@ def test_full_regression_workflow(self):
             feature_cols=feature_names,
             target_col="target",
             sample_id_col="index",
-            metrics=["MAE", "MSE", "R2"],
             datasplit_seed_outer=1234,
             n_folds_outer=2,
             path=self.studies_path,

diff --git a/tests/workflows/test_octo_classification.py b/tests/workflows/test_octo_classification.py
@@ -195,7 +195,6 @@ def test_octo_intro_classification_actual_execution(self, breast_cancer_dataset)
                 target_col="target",
                 sample_id_col="index",
                 stratification_col="target",
-                metrics=["AUCROC", "ACCBAL", "ACC", "LOGLOSS"],
                 datasplit_seed_outer=1234,
                 n_folds_outer=2,
                 path=temp_dir,

diff --git a/tests/workflows/test_octo_multiclass.py b/tests/workflows/test_octo_multiclass.py
@@ -151,25 +151,6 @@ def test_multiclass_multiple_models_configuration(self):
 
         assert set(octo_task.models) == set(models)
 
-    def test_multiclass_metrics_configuration(self):
-        """Test multiclass-specific metrics configuration."""
-        with tempfile.TemporaryDirectory() as temp_dir:
-            study = OctoClassification(
-                name="test_multiclass_metrics",
-                target_metric="AUCROC_MACRO",
-                feature_cols=["f1"],
-                target_col="target",
-                sample_id_col="index",
-                metrics=["AUCROC_MACRO", "AUCROC_WEIGHTED", "ACCBAL_MC"],
-                path=temp_dir,
-                ignore_data_health_warning=True,
-            )
-
-            assert study.target_metric == "AUCROC_MACRO"
-            assert "AUCROC_MACRO" in study.metrics
-            assert "AUCROC_WEIGHTED" in study.metrics
-            assert "ACCBAL_MC" in study.metrics
-
     def test_feature_importance_configuration(self):
         """Test feature importance method configuration for multiclass."""
         fi_methods = ["permutation"]
@@ -211,7 +192,6 @@ def test_multiclass_workflow_actual_execution(self, wine_dataset):
                 target_col="target",
                 sample_id_col="index",
                 stratification_col="target",
-                metrics=["AUCROC_MACRO", "AUCROC_WEIGHTED", "ACCBAL_MC"],
                 datasplit_seed_outer=1234,
                 n_folds_outer=2,
                 path=temp_dir,
@@ -314,7 +294,6 @@ def test_multiclass_target_metric_options(self):
                     feature_cols=["f1"],
                     target_col="target",
                     sample_id_col="index",
-                    metrics=["AUCROC_MACRO", "AUCROC_WEIGHTED", "ACCBAL_MC"],
                     path=temp_dir,
                     ignore_data_health_warning=True,
                 )

diff --git a/tests/workflows/test_octo_regression.py b/tests/workflows/test_octo_regression.py
@@ -204,7 +204,6 @@ def test_octo_regression_actual_execution(self, diabetes_dataset):
                 feature_cols=features,
                 target_col="target",
                 sample_id_col="index",
-                metrics=["MAE", "MSE", "R2"],
                 datasplit_seed_outer=1234,
                 n_folds_outer=2,
                 path=temp_dir,

diff --git a/tests/workflows/test_octo_t2e.py b/tests/workflows/test_octo_t2e.py
@@ -184,7 +184,6 @@ def test_octo_timetoevent_actual_execution(self, survival_dataset):
                 duration_col="duration",
                 event_col="event",
                 sample_id_col="index",
-                metrics=["CI"],
                 datasplit_seed_outer=1234,
                 n_folds_outer=2,
                 path=temp_dir,

diff --git a/tests/workflows/test_roc_octo_roc_workflow.py b/tests/workflows/test_roc_octo_roc_workflow.py
@@ -281,7 +281,6 @@ def test_roc_octo_roc_workflow_actual_execution(self, sample_classification_data
                 target_col="target",
                 sample_id_col="sample_id_col",
                 stratification_col="target",
-                metrics=["AUCROC", "ACCBAL"],
                 datasplit_seed_outer=1234,
                 n_folds_outer=2,
                 path=temp_dir,