emdgroup · dasmy · Mar 12, 2026 · Mar 12, 2026 · Mar 12, 2026 · anwurl
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -12,7 +12,7 @@ on:
   workflow_dispatch:
 
 concurrency:
-  group: ${{ github.workflow }}
+  group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
 jobs:

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -51,7 +51,7 @@ repos:
     rev: "v1.19.1"
     hooks:
       - id: mypy
-        exclude: ^datasets/|examples/|tests/|studies/
+        exclude: ^datasets/|studies/
         additional_dependencies:
           - types-networkx
           - pandas-stubs

diff --git a/examples/basic_classification.py b/examples/basic_classification.py
@@ -8,18 +8,18 @@
 ### Necessary imports for this example
 import os
 
-from sklearn.datasets import load_breast_cancer
-
+from octopus.example_data import load_breast_cancer_data
 from octopus.modules import Octo
 from octopus.study import OctoClassification
 
 ### Load and Preprocess Data
-breast_cancer = load_breast_cancer(as_frame=True)
+df, features, targets = load_breast_cancer_data()
 
-df = breast_cancer["frame"].reset_index()
-df.columns = df.columns.str.replace(" ", "_")
-features = list(breast_cancer["feature_names"])
-features = [feature.replace(" ", "_") for feature in features]
+print("Dataset info:")
+print(f"  Features: {len(features)} - {features}")
+print(f"  Samples: {df.shape[0]}")
+print(f"  Classes: {len(targets)} - {targets}")
+print(f"  Target distribution: {df['target'].value_counts().sort_index().to_dict()}")
 
 ### Create and run OctoClassification
 study = OctoClassification(

diff --git a/examples/basic_regression.py b/examples/basic_regression.py
@@ -8,23 +8,28 @@
 ### Necessary imports for this example
 import os
 
-from sklearn.datasets import load_diabetes
-
+from octopus.example_data import load_diabetes_data
 from octopus.study import OctoRegression
 
 ### Load the diabetes dataset
-diabetes = load_diabetes(as_frame=True)
+df, features, targets = load_diabetes_data()
+
+print("Dataset info:")
+print(f"  Features: {len(features)} - {features}")
+print(f"  Samples: {df.shape[0]}")
+print(f"  Target column: {targets[0]}")  # regression: there are no classes, only one target column
+print(f"  Target summary: {df['target'].describe().round(2).to_dict()}")  # continuous target -> summary stats
 
 ### Create and run OctoRegression
 study = OctoRegression(
     name="basic_regression",
     path=os.environ.get("STUDIES_PATH", "./studies"),
     target_metric="MAE",
-    feature_cols=diabetes["feature_names"],
+    feature_cols=features,
     target_col="target",
     sample_id_col="index",
 )
 
-study.fit(data=diabetes["frame"].reset_index())
+study.fit(data=df)
 
 print("Workflow completed")
diff --git a/examples/multi_workflow.py b/examples/multi_workflow.py
@@ -6,20 +6,25 @@
 ### Necessary imports for this example
 import os
 
-from sklearn.datasets import load_diabetes
-
+from octopus.example_data import load_diabetes_data
 from octopus.modules import Mrmr, Octo
 from octopus.study import OctoRegression
 
 ### Load the diabetes dataset
-diabetes = load_diabetes(as_frame=True)
+df, features, targets = load_diabetes_data()
+
+print("Dataset info:")
+print(f"  Features: {len(features)} - {features}")
+print(f"  Samples: {df.shape[0]}")
+print(f"  Target column: {targets[0]}")  # regression: there are no classes, only one target column
+print(f"  Target summary: {df['target'].describe().round(2).to_dict()}")  # continuous target -> summary stats
 
 ### Create and run OctoRegression with multi-step workflow
 study = OctoRegression(
     name="example_multiworkflow",
     path=os.environ.get("STUDIES_PATH", "./studies"),
     target_metric="R2",
-    feature_cols=diabetes["feature_names"],
+    feature_cols=features,
     target_col="target",
     sample_id_col="index",
     ignore_data_health_warning=True,
@@ -52,6 +57,6 @@
     ],
 )
 
-study.fit(data=diabetes["frame"].reset_index())
+study.fit(data=df)
 
 print("Multi-workflow completed")
diff --git a/examples/use_own_hyperparameters.py b/examples/use_own_hyperparameters.py
@@ -1,28 +1,37 @@
 """Example for using custom hyperparameters in Octopus regression."""
 
-# This example demonstrates how to use Octopus with custom hyperparameters.
+# This example demonstrates how to use custom hyperparameters with Octopus.
+# The key difference from the basic example is the use of the `hyperparameters` parameter
+# in the Octo configuration, where you can define custom hyperparameter ranges
+# for each model using the Hyperparameter class.
+
 # Instead of letting Optuna automatically search the hyperparameter space,
 # you can define your own hyperparameter ranges for the models.
 # We will use the diabetes dataset for this purpose.
 
 ### Necessary imports for this example
 import os
 
-from sklearn.datasets import load_diabetes
-
+from octopus.example_data import load_diabetes_data
 from octopus.models.hyperparameter import IntHyperparameter
 from octopus.modules import Octo
 from octopus.study import OctoRegression
 
 ### Load the diabetes dataset
-diabetes = load_diabetes(as_frame=True)
+df, features, targets = load_diabetes_data()
+
+print("Dataset info:")
+print(f"  Features: {len(features)} - {features}")
+print(f"  Samples: {df.shape[0]}")
+print(f"  Target column: {targets[0]}")  # regression: there are no classes, only one target column
+print(f"  Target summary: {df['target'].describe().round(2).to_dict()}")  # continuous target -> summary stats
 
 ### Create and run OctoRegression with custom hyperparameters
 study = OctoRegression(
     name="use_own_hyperparameters_example",
     path=os.environ.get("STUDIES_PATH", "./studies"),
     target_metric="MAE",
-    feature_cols=diabetes["feature_names"],
+    feature_cols=features,
     target_col="target",
     sample_id_col="index",
     ignore_data_health_warning=True,
@@ -43,11 +52,6 @@
     ],
 )
 
-study.fit(data=diabetes["frame"].reset_index())
+study.fit(data=df)
 
 print("Workflow completed")
-
-# This example demonstrates how to use custom hyperparameters with Octopus.
-# The key difference from the basic example is the use of the `hyperparameters` parameter
-# in the Octo configuration, where you can define custom hyperparameter ranges
-# for each model using the Hyperparameter class.
diff --git a/examples/wf_multiclass_wine.py b/examples/wf_multiclass_wine.py
@@ -10,23 +10,17 @@
 
 import os
 
-from sklearn.datasets import load_wine
-
+from octopus.example_data import load_wine_data
 from octopus.modules import Octo
 from octopus.study import OctoClassification
 
 ### Load and Preprocess Data
-wine = load_wine(as_frame=True)
-
-df = wine["frame"].reset_index()
-df.columns = df.columns.str.replace(" ", "_")
-features = list(wine["feature_names"])
-features = [feature.replace(" ", "_") for feature in features]
+df, features, targets = load_wine_data()
 
 print("Dataset info:")
-print(f"  Features: {len(features)}")
+print(f"  Features: {len(features)} - {features}")
 print(f"  Samples: {df.shape[0]}")
-print(f"  Classes: {len(wine.target_names)} - {wine.target_names}")
+print(f"  Classes: {len(targets)} - {targets}")
 print(f"  Target distribution: {df['target'].value_counts().sort_index().to_dict()}")
 
 ### Create and run OctoClassification for multiclass classification

diff --git a/examples/wf_roc_octo.py b/examples/wf_roc_octo.py
@@ -8,8 +8,7 @@
 
 import os
 
-from sklearn.datasets import load_breast_cancer
-
+from octopus.example_data import load_breast_cancer_data
 from octopus.modules import Octo, Roc
 from octopus.study import OctoClassification
 
@@ -19,12 +18,13 @@
 # This is a binary classification dataset with 30 features
 # Target: 0 = malignant, 1 = benign
 
-breast_cancer = load_breast_cancer(as_frame=True)
+df, features, targets = load_breast_cancer_data()
 
-df = breast_cancer["frame"].reset_index()
-df.columns = df.columns.str.replace(" ", "_")
-features = list(breast_cancer["feature_names"])
-features = [feature.replace(" ", "_") for feature in features]
+print("Dataset info:")
+print(f"  Features: {len(features)} - {features}")
+print(f"  Samples: {df.shape[0]}")
+print(f"  Classes: {len(targets)} - {targets}")
+print(f"  Target distribution: {df['target'].value_counts().sort_index().to_dict()}")
 
 ### Create and run OctoClassification with ROC + Octo workflow
 

diff --git a/octopus/example_data.py b/octopus/example_data.py
@@ -0,0 +1,40 @@
+"""Example data sets for use in Octopus examples."""
+
+import pandas as pd
+from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
+from sklearn.utils import Bunch
+
+
+def load_breast_cancer_data() -> tuple[pd.DataFrame, list[str], list[str]]:
+    """Return the breast cancer data as ``(frame, feature names, class names)``.
+
+    Spaces in column and feature names become underscores; ``reset_index`` turns the row index into a column.
+    """
+    bunch: Bunch = load_breast_cancer(as_frame=True)  # type: ignore[assignment]
+    frame = bunch["frame"].reset_index()
+    frame.columns = frame.columns.str.replace(" ", "_")
+    feature_names = [name.replace(" ", "_") for name in bunch["feature_names"]]
+    return frame, feature_names, list(map(str, bunch["target_names"]))
+
+
+def load_diabetes_data() -> tuple[pd.DataFrame, list[str], list[str]]:
+    """Return the diabetes data as ``(frame, feature names, target column names)``.
+
+    ``reset_index`` turns the row index into a column; the third item is always ``["target"]``.
+    """
+    bunch: Bunch = load_diabetes(as_frame=True)  # type: ignore[assignment]
+    frame = bunch["frame"].reset_index()
+    feature_names = list(map(str, bunch["feature_names"]))
+    return frame, feature_names, ["target"]
+
+
+def load_wine_data() -> tuple[pd.DataFrame, list[str], list[str]]:
+    """Return the wine data as ``(frame, feature names, class names)``.
+
+    Spaces in column and feature names become underscores; ``reset_index`` turns the row index into a column.
+    """
+    bunch: Bunch = load_wine(as_frame=True)  # type: ignore[assignment]
+    frame = bunch["frame"].reset_index()
+    frame.columns = frame.columns.str.replace(" ", "_")
+    feature_names = [name.replace(" ", "_") for name in bunch["feature_names"]]
+    return frame, feature_names, list(map(str, bunch["target_names"]))
diff --git a/octopus/manager/core.py b/octopus/manager/core.py
@@ -2,6 +2,7 @@
 
 import math
 import os
+from collections.abc import Sequence
 
 from attrs import define, field, validators
 
@@ -133,8 +134,8 @@ class OctoManager:
     study_context: StudyContext = field(validator=[validators.instance_of(StudyContext)])
     """Frozen runtime context containing study configuration."""
 
-    workflow: list[Task] = field(validator=[validators.instance_of(list)])
-    """List of workflow tasks to execute."""
+    workflow: Sequence[Task] = field(validator=[validators.instance_of(list)])
+    """Workflow tasks to execute; annotated as ``Sequence``, but the validator accepts only ``list`` instances."""
 
     outer_parallelization: bool = field(validator=[validators.instance_of(bool)])
     """Whether to run outersplits in parallel."""

diff --git a/octopus/manager/workflow_runner.py b/octopus/manager/workflow_runner.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import json
+from typing import TYPE_CHECKING
 
 import pandas as pd
 import ray
@@ -14,6 +15,9 @@
 from octopus.modules import ModuleResult, ResultType, StudyContext, Task
 from octopus.utils import calculate_feature_groups, parquet_save
 
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
 logger = get_logger()
 
 
@@ -33,7 +37,7 @@ class WorkflowTaskRunner:
     """
 
     study_context: StudyContext = field(validator=[validators.instance_of(StudyContext)])
-    workflow: list[Task] = field(validator=[validators.instance_of(list)])
+    workflow: Sequence[Task] = field(validator=[validators.instance_of(list)])
     cpus_per_outersplit: int = field(validator=[validators.instance_of(int)])
 
     def run(self, outersplit_id: int, outersplit: OuterSplit) -> None:

diff --git a/octopus/models/core.py b/octopus/models/core.py
@@ -45,6 +45,11 @@ def decorator(factory: Callable[[], ModelConfig]) -> Callable[[], ModelConfig]:
 
         return decorator
 
+    @classmethod
+    def get_registered_models(cls) -> list[ModelName]:
+        """Return a ``ModelName`` for every entry currently held in ``_config_factories``."""
+        return list(map(ModelName, cls._config_factories))
+
     @classmethod
     def get_config(cls, name: ModelName) -> ModelConfig:
         """Get model configuration by name.
@@ -185,6 +190,5 @@ def validate_model_compatibility(cls, model_name: ModelName, ml_type: MLType) ->
         config = cls.get_config(model_name)
         if not config.supports_ml_type(ml_type):
             raise ValueError(
-                f"Model '{model_name}' does not support ml_type '{ml_type.value}'. "
-                f"Supported types: {', '.join(t.value for t in config.ml_types)}"
+                f"Model '{model_name}' does not support ml_type '{ml_type.value}'. Supported types: {', '.join(t.value for t in config.ml_types)}"
             )
diff --git a/octopus/modules/octo/enssel.py b/octopus/modules/octo/enssel.py
@@ -1,6 +1,6 @@
 """Ensemble selection."""
 
-# TOBEDONE
+# TODO
 # - issue: ACC and BALACC need integer pooling values!
 # - potential issue: check start_n, +1 or not
 # - get FI and counts

diff --git a/octopus/modules/octo/training.py b/octopus/modules/octo/training.py
@@ -24,15 +24,15 @@
 from octopus.models import ModelName, Models
 from octopus.types import MLType
 
-# # TOBEDONE pipeline
+# # TODO pipeline
 # - implement cat encoding on module level
 # - how to provide categorical info to catboost and other models?
 
 
 logger = get_logger()
 
 
-class TrainingConfig(TypedDict):
+class TrainingConfig(TypedDict, total=False):
     """Training configuration type."""
 
     outl_reduction: int

diff --git a/octopus/study/core.py b/octopus/study/core.py
@@ -5,6 +5,7 @@
 import os
 import platform
 from abc import ABC, abstractmethod
+from collections.abc import Sequence
 from datetime import UTC
 
 import pandas as pd
@@ -77,7 +78,7 @@ class OctoStudy(ABC):
     run_single_outersplit_num: int = field(default=Factory(lambda: -1), validator=[validators.instance_of(int)])
     """Select a single outersplit to execute. Defaults to -1 to run all outersplits"""
 
-    workflow: list[Task] = field(
+    workflow: Sequence[Task] = field(
         default=Factory(lambda: [Octo(task_id=0)]),
         validator=[validators.instance_of(list), validate_workflow],
     )