Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion examples/wf_multiclass_wine.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
target_col="target",
sample_id_col="index",
stratification_col="target",
metrics=["AUCROC_MACRO", "AUCROC_WEIGHTED", "ACCBAL_MC"],
datasplit_seed_outer=1234,
ignore_data_health_warning=True,
outer_parallelization=True,
Expand Down
1 change: 0 additions & 1 deletion examples/wf_octo_autogluon.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@
target_col="target",
sample_id_col="index",
stratification_col="target", # Ensure balanced splits
metrics=["AUCROC", "ACCBAL", "ACC", "LOGLOSS", "F1"],
n_folds_outer=5, # 5-fold outer cross-validation
ignore_data_health_warning=True,
outer_parallelization=True,
Expand Down
1 change: 0 additions & 1 deletion examples/wf_roc_octo.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
target_col="target",
sample_id_col="index",
stratification_col="target",
metrics=["AUCROC", "ACCBAL", "ACC", "LOGLOSS"],
datasplit_seed_outer=1234,
ignore_data_health_warning=True,
outer_parallelization=True,
Expand Down
3 changes: 2 additions & 1 deletion octopus/modules/autogluon/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
)
from octopus.logger import LogGroup, get_logger
from octopus.manager.ray_parallel import setup_ray_for_external_library
from octopus.metrics import Metrics
from octopus.metrics.utils import get_score_from_model
from octopus.modules.base import FIDataset, FIMethod, ModuleExecution, ModuleResult, ResultType
from octopus.study.context import StudyContext
Expand Down Expand Up @@ -354,7 +355,7 @@ def _get_scores(

# Test scores using Octopus metrics for comparison
assert study_context.target_metric is not None, "target_metric should be set during fit()"
all_metrics = list(dict.fromkeys([*study_context.metrics, study_context.target_metric]))
all_metrics = Metrics.get_by_type(study_context.ml_type)
Comment on lines 356 to +358
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Switching from the explicit [*study_context.metrics, study_context.target_metric] list to Metrics.get_by_type(study_context.ml_type) will evaluate (and persist) every registered metric for the ML type. This can materially increase runtime and the size/noise of scores.parquet/performance_results.json. It also runs into a downstream aggregation issue: consumers of scores.parquet build keys that omit the metric column, so when multiple metrics are present their values are not preserved. Consider restricting this comparison set to {study_context.target_metric} (or a small curated set) unless/until the rest of the pipeline is updated to handle multi-metric score artifacts.

Copilot uses AI. Check for mistakes.
test_performance_octo = {}
for metric in all_metrics:
assert feature_cols is not None, "feature_cols should be set during fit()"
Expand Down
15 changes: 8 additions & 7 deletions octopus/modules/octo/bag.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,24 +387,25 @@ def get_predictions(self):

return predictions

def get_performance(self):
def get_performance(self, metric: str | None = None):
"""Get performance using get_performance_from_predictions utility.

This is a simpler alternative to get_performance() that:
1. Gets predictions from bag.get_predictions()
2. Calculates performance using get_performance_from_predictions()
3. Restructures output to match expected format
Args:
metric: The metric to evaluate. Defaults to self.target_metric when None.

Returns:
dict: Dictionary with performance values in the same format as get_performance()
"""
Comment on lines +390 to 398
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The updated docstring/return description is self-referential: it says the returned dict matches the format of get_performance(), but this method is get_performance(). Clarify what shape the returned dict has (keys like train_avg, dev_avg, etc.) and/or what legacy method it is intended to be compatible with.

Copilot uses AI. Check for mistakes.
if metric is None:
metric = self.target_metric

# Get predictions from the bag
predictions = self.get_predictions()

# Calculate performance using the utility function
performance = get_performance_from_predictions(
predictions=predictions,
target_metric=self.target_metric,
target_metric=metric,
target_assignments=self.target_assignments,
positive_class=self.positive_class,
)
Expand Down Expand Up @@ -448,7 +449,7 @@ def get_performance_df(self, metric: str) -> pd.DataFrame:
Returns:
DataFrame with columns: metric, partition, aggregation, fold, value
"""
perf = self.get_performance()
perf = self.get_performance(metric=metric)
rows = []

# Per-fold scores
Expand Down
14 changes: 12 additions & 2 deletions octopus/modules/octo/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from octopus.datasplit import DataSplit, InnerSplits
from octopus.logger import LogGroup, get_logger
from octopus.metrics import Metrics
from octopus.models import Models
from octopus.modules.base import ModuleExecution, ModuleResult, ResultType
from octopus.modules.mrmr.core import _maxrminr, _relevance_fstats
Expand Down Expand Up @@ -90,11 +91,16 @@ def fit(

# Build best ModuleResult
best_bag = results["best"]["_bag"]
all_metrics = Metrics.get_by_type(study_context.ml_type)
best_scores = pd.concat(
[best_bag.get_performance_df(metric=m) for m in all_metrics],
ignore_index=True,
)
Comment on lines +94 to +98
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OctoModuleTemplate.fit() now concatenates score DataFrames for all metrics of the ML type. Several downstream consumers (e.g. StudyLoader.build_performance_summary() / show_target_metric_performance) currently assume scores.parquet effectively contains a single metric and build keys that omit the metric column, which will silently overwrite values when multiple metrics are present. Either keep scores limited to study_context.target_metric, or update the downstream aggregation to include the metric name in its keys / grouping so all metrics are preserved.

Copilot uses AI. Check for mistakes.
Comment on lines +94 to +98
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Computing best_scores by calling best_bag.get_performance_df() once per metric likely recomputes predictions/performance repeatedly (each call goes through get_predictions() / get_performance_from_predictions()). If you want to persist multiple metrics, consider computing predictions once and evaluating all metrics from that cached structure (or extending the bag API to return a multi-metric scores DataFrame in one pass), so that predictions are not rebuilt O(#metrics) times per outer split.

Copilot uses AI. Check for mistakes.
best_result = ModuleResult(
result_type=ResultType.BEST,
module=self.config.module,
selected_features=best_selected_features,
scores=best_bag.get_performance_df(metric=study_context.target_metric),
scores=best_scores,
predictions=best_bag.get_predictions_df(),
feature_importances=best_bag.get_feature_importances_df(),
model=best_bag,
Expand All @@ -111,11 +117,15 @@ def fit(
# Always save ensemble result if it was produced
if "ensel" in results:
ensel_bag = results["ensel"]["_bag"]
ensel_scores = pd.concat(
[ensel_bag.get_performance_df(metric=m) for m in all_metrics],
ignore_index=True,
)
ensel_result = ModuleResult(
result_type=ResultType.ENSEMBLE_SELECTION,
module=self.config.module,
selected_features=ensel_selected_features or best_selected_features,
scores=ensel_bag.get_performance_df(metric=study_context.target_metric),
scores=ensel_scores,
predictions=ensel_bag.get_predictions_df(),
feature_importances=ensel_bag.get_feature_importances_df(),
model=ensel_bag,
Expand Down
3 changes: 0 additions & 3 deletions octopus/study/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,6 @@ class StudyContext:
target_metric: str
"""Primary metric for model evaluation."""

metrics: list[str]
"""All metrics to calculate."""

target_assignments: dict[str, str]
"""Target column assignments (e.g. {'default': 'target'} or {'duration': ..., 'event': ...})."""

Expand Down
43 changes: 0 additions & 43 deletions octopus/study/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,6 @@ def target_metric(self) -> str:
"""Get target metric. Must be implemented in subclasses."""
...

@property
@abstractmethod
def metrics(self) -> list:
"""Get metrics list. Must be implemented in subclasses."""
...

@property
@abstractmethod
def target_assignments(self) -> dict[str, str]:
Expand Down Expand Up @@ -338,7 +332,6 @@ def _create_study_context(
return StudyContext(
ml_type=ml_type,
target_metric=self.target_metric,
metrics=self.metrics,
target_assignments=self.target_assignments,
positive_class=positive_class,
stratification_col=self.stratification_col,
Expand Down Expand Up @@ -396,18 +389,6 @@ class OctoRegression(OctoStudy):
)
"""The primary metric used for model evaluation. Defaults to RMSE."""

metrics: list = field(
default=Factory(lambda self: [self.target_metric], takes_self=True),
validator=[
validators.instance_of(list),
validators.deep_iterable(
member_validator=validators.in_(Metrics.get_by_type(MLType.REGRESSION)),
iterable_validator=validators.instance_of(list),
),
],
)
"""A list of metrics to be calculated. Defaults to target_metric value."""

@property
def target_assignments(self) -> dict[str, str]:
"""Get target assignments dict."""
Expand All @@ -434,18 +415,6 @@ class OctoClassification(OctoStudy):
)
"""The primary metric used for model evaluation. Defaults to AUCROC."""

metrics: list = field(
default=Factory(lambda self: [self.target_metric], takes_self=True),
validator=[
validators.instance_of(list),
validators.deep_iterable(
member_validator=validators.in_(Metrics.get_by_type(MLType.BINARY, MLType.MULTICLASS)),
iterable_validator=validators.instance_of(list),
),
],
)
"""A list of metrics to be calculated. Defaults to target_metric value."""

positive_class: int | None = field(default=None, validator=validators.optional(validators.instance_of(int)))
"""The positive class label for binary classification. Defaults to None. Not used for multiclass."""

Expand Down Expand Up @@ -494,18 +463,6 @@ class OctoTimeToEvent(OctoStudy):
)
"""The primary metric used for model evaluation. Defaults to CI (Concordance Index)."""

metrics: list = field(
default=Factory(lambda self: [self.target_metric], takes_self=True),
validator=[
validators.instance_of(list),
validators.deep_iterable(
member_validator=validators.in_(Metrics.get_by_type(MLType.TIMETOEVENT)),
iterable_validator=validators.instance_of(list),
),
],
)
"""A list of metrics to be calculated. Defaults to target_metric value."""

@property
def target_assignments(self) -> dict[str, str]:
"""Get target assignments dict."""
Expand Down
1 change: 0 additions & 1 deletion tests/infrastructure/test_fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,6 @@ def run_experiment(self, breast_cancer_dataset, root_dir: UPath):
target_col="target",
sample_id_col="index",
stratification_col="target",
metrics=["AUCROC", "ACCBAL", "ACC", "LOGLOSS"],
datasplit_seed_outer=1234,
n_folds_outer=2,
path=root_dir,
Expand Down
1 change: 0 additions & 1 deletion tests/manager/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ def study():
return StudyContext(
ml_type=MLType.BINARY,
target_metric="AUCROC",
metrics=["AUCROC"],
target_assignments={"default": "target"},
positive_class=1,
stratification_col=None,
Expand Down
1 change: 0 additions & 1 deletion tests/predict/test_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ def _run_classification_study() -> str:
target_col="target",
sample_id_col="index",
stratification_col="target",
metrics=["AUCROC", "ACCBAL", "ACC"],
datasplit_seed_outer=1234,
n_folds_outer=2,
path=tmp,
Expand Down
25 changes: 0 additions & 25 deletions tests/study/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,31 +117,6 @@ def test_default_workflow():
assert study.workflow[0].task_id == 0


@pytest.mark.parametrize(
"metrics_input,expected_metrics",
[
(None, ["AUCROC"]), # default metrics
(["AUCROC", "ACCBAL", "F1"], ["AUCROC", "ACCBAL", "F1"]), # custom metrics
],
)
def test_metrics(metrics_input, expected_metrics):
"""Test metrics list with default and custom values."""
with tempfile.TemporaryDirectory() as temp_dir:
kwargs = {
"name": "test",
"target_metric": "AUCROC",
"feature_cols": ["f1"],
"target_col": "target",
"sample_id_col": "id",
"path": temp_dir,
}
if metrics_input is not None:
kwargs["metrics"] = metrics_input

study = OctoClassification(**kwargs)
assert study.metrics == expected_metrics


def test_default_values():
"""Test default values are set correctly."""
with tempfile.TemporaryDirectory() as temp_dir:
Expand Down
2 changes: 0 additions & 2 deletions tests/workflows/test_ag_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def test_full_classification_workflow(self):
target_col="target",
sample_id_col="index",
stratification_col="target",
metrics=["AUCROC", "ACCBAL", "ACC", "LOGLOSS"],
datasplit_seed_outer=1234,
n_folds_outer=5,
path=self.studies_path,
Expand Down Expand Up @@ -120,7 +119,6 @@ def test_full_regression_workflow(self):
feature_cols=feature_names,
target_col="target",
sample_id_col="index",
metrics=["MAE", "MSE", "R2"],
datasplit_seed_outer=1234,
n_folds_outer=2,
path=self.studies_path,
Expand Down
1 change: 0 additions & 1 deletion tests/workflows/test_octo_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,6 @@ def test_octo_intro_classification_actual_execution(self, breast_cancer_dataset)
target_col="target",
sample_id_col="index",
stratification_col="target",
metrics=["AUCROC", "ACCBAL", "ACC", "LOGLOSS"],
datasplit_seed_outer=1234,
n_folds_outer=2,
path=temp_dir,
Expand Down
21 changes: 0 additions & 21 deletions tests/workflows/test_octo_multiclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,25 +151,6 @@ def test_multiclass_multiple_models_configuration(self):

assert set(octo_task.models) == set(models)

def test_multiclass_metrics_configuration(self):
"""Test multiclass-specific metrics configuration."""
with tempfile.TemporaryDirectory() as temp_dir:
study = OctoClassification(
name="test_multiclass_metrics",
target_metric="AUCROC_MACRO",
feature_cols=["f1"],
target_col="target",
sample_id_col="index",
metrics=["AUCROC_MACRO", "AUCROC_WEIGHTED", "ACCBAL_MC"],
path=temp_dir,
ignore_data_health_warning=True,
)

assert study.target_metric == "AUCROC_MACRO"
assert "AUCROC_MACRO" in study.metrics
assert "AUCROC_WEIGHTED" in study.metrics
assert "ACCBAL_MC" in study.metrics

def test_feature_importance_configuration(self):
"""Test feature importance method configuration for multiclass."""
fi_methods = ["permutation"]
Expand Down Expand Up @@ -211,7 +192,6 @@ def test_multiclass_workflow_actual_execution(self, wine_dataset):
target_col="target",
sample_id_col="index",
stratification_col="target",
metrics=["AUCROC_MACRO", "AUCROC_WEIGHTED", "ACCBAL_MC"],
datasplit_seed_outer=1234,
n_folds_outer=2,
path=temp_dir,
Expand Down Expand Up @@ -314,7 +294,6 @@ def test_multiclass_target_metric_options(self):
feature_cols=["f1"],
target_col="target",
sample_id_col="index",
metrics=["AUCROC_MACRO", "AUCROC_WEIGHTED", "ACCBAL_MC"],
path=temp_dir,
ignore_data_health_warning=True,
)
Expand Down
1 change: 0 additions & 1 deletion tests/workflows/test_octo_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,6 @@ def test_octo_regression_actual_execution(self, diabetes_dataset):
feature_cols=features,
target_col="target",
sample_id_col="index",
metrics=["MAE", "MSE", "R2"],
datasplit_seed_outer=1234,
n_folds_outer=2,
path=temp_dir,
Expand Down
1 change: 0 additions & 1 deletion tests/workflows/test_octo_t2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,6 @@ def test_octo_timetoevent_actual_execution(self, survival_dataset):
duration_col="duration",
event_col="event",
sample_id_col="index",
metrics=["CI"],
datasplit_seed_outer=1234,
n_folds_outer=2,
path=temp_dir,
Expand Down
1 change: 0 additions & 1 deletion tests/workflows/test_roc_octo_roc_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,6 @@ def test_roc_octo_roc_workflow_actual_execution(self, sample_classification_data
target_col="target",
sample_id_col="sample_id_col",
stratification_col="target",
metrics=["AUCROC", "ACCBAL"],
datasplit_seed_outer=1234,
n_folds_outer=2,
path=temp_dir,
Expand Down