-
Notifications
You must be signed in to change notification settings - Fork 0
remove metrics input #348
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
remove metrics input #348
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -387,24 +387,25 @@ def get_predictions(self): | |
|
|
||
| return predictions | ||
|
|
||
| def get_performance(self): | ||
| def get_performance(self, metric: str | None = None): | ||
| """Get performance using get_performance_from_predictions utility. | ||
|
|
||
| This is a simpler alternative to get_performance() that: | ||
| 1. Gets predictions from bag.get_predictions() | ||
| 2. Calculates performance using get_performance_from_predictions() | ||
| 3. Restructures output to match expected format | ||
| Args: | ||
| metric: The metric to evaluate. Defaults to self.target_metric when None. | ||
|
|
||
| Returns: | ||
| dict: Dictionary with performance values in the same format as get_performance() | ||
| """ | ||
|
Comment on lines +390 to 398
|
||
| if metric is None: | ||
| metric = self.target_metric | ||
|
|
||
| # Get predictions from the bag | ||
| predictions = self.get_predictions() | ||
|
|
||
| # Calculate performance using the utility function | ||
| performance = get_performance_from_predictions( | ||
| predictions=predictions, | ||
| target_metric=self.target_metric, | ||
| target_metric=metric, | ||
| target_assignments=self.target_assignments, | ||
| positive_class=self.positive_class, | ||
| ) | ||
|
|
@@ -448,7 +449,7 @@ def get_performance_df(self, metric: str) -> pd.DataFrame: | |
| Returns: | ||
| DataFrame with columns: metric, partition, aggregation, fold, value | ||
| """ | ||
| perf = self.get_performance() | ||
| perf = self.get_performance(metric=metric) | ||
| rows = [] | ||
|
|
||
| # Per-fold scores | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |
|
|
||
| from octopus.datasplit import DataSplit, InnerSplits | ||
| from octopus.logger import LogGroup, get_logger | ||
| from octopus.metrics import Metrics | ||
| from octopus.models import Models | ||
| from octopus.modules.base import ModuleExecution, ModuleResult, ResultType | ||
| from octopus.modules.mrmr.core import _maxrminr, _relevance_fstats | ||
|
|
@@ -90,11 +91,16 @@ def fit( | |
|
|
||
| # Build best ModuleResult | ||
| best_bag = results["best"]["_bag"] | ||
| all_metrics = Metrics.get_by_type(study_context.ml_type) | ||
| best_scores = pd.concat( | ||
| [best_bag.get_performance_df(metric=m) for m in all_metrics], | ||
| ignore_index=True, | ||
| ) | ||
|
Comment on lines +94 to +98
|
||
| best_result = ModuleResult( | ||
| result_type=ResultType.BEST, | ||
| module=self.config.module, | ||
| selected_features=best_selected_features, | ||
| scores=best_bag.get_performance_df(metric=study_context.target_metric), | ||
| scores=best_scores, | ||
| predictions=best_bag.get_predictions_df(), | ||
| feature_importances=best_bag.get_feature_importances_df(), | ||
| model=best_bag, | ||
|
|
@@ -111,11 +117,15 @@ def fit( | |
| # Always save ensemble result if it was produced | ||
| if "ensel" in results: | ||
| ensel_bag = results["ensel"]["_bag"] | ||
| ensel_scores = pd.concat( | ||
| [ensel_bag.get_performance_df(metric=m) for m in all_metrics], | ||
| ignore_index=True, | ||
| ) | ||
| ensel_result = ModuleResult( | ||
| result_type=ResultType.ENSEMBLE_SELECTION, | ||
| module=self.config.module, | ||
| selected_features=ensel_selected_features or best_selected_features, | ||
| scores=ensel_bag.get_performance_df(metric=study_context.target_metric), | ||
| scores=ensel_scores, | ||
| predictions=ensel_bag.get_predictions_df(), | ||
| feature_importances=ensel_bag.get_feature_importances_df(), | ||
| model=ensel_bag, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Switching from the explicit `[...study_context.metrics, study_context.target_metric]` list to `Metrics.get_by_type(study_context.ml_type)` will evaluate (and persist) every registered metric for the ML type. This can materially increase runtime and the size/noise of `scores.parquet` / `performance_results.json`, and it also hits the same downstream aggregation issue where multiple metrics in `scores.parquet` are not preserved. Consider restricting this comparison set to `{study_context.target_metric}` (or a small curated set) unless/until the rest of the pipeline is updated to handle multi-metric score artifacts.