Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ repos:
rev: "v1.19.1"
hooks:
- id: mypy
exclude: ^datasets/|examples/|tests/|studies/
exclude: ^datasets/|studies/
additional_dependencies:
- types-networkx
- pandas-stubs
Expand Down
14 changes: 7 additions & 7 deletions examples/basic_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,18 @@
### Necessary imports for this example
import os

from sklearn.datasets import load_breast_cancer

from octopus.example_data import load_breast_cancer_data
from octopus.modules import Octo
from octopus.study import OctoClassification

### Load and Preprocess Data
breast_cancer = load_breast_cancer(as_frame=True)
df, features, targets = load_breast_cancer_data()

df = breast_cancer["frame"].reset_index()
df.columns = df.columns.str.replace(" ", "_")
features = list(breast_cancer["feature_names"])
features = [feature.replace(" ", "_") for feature in features]
print("Dataset info:")
print(f" Features: {len(features)} - {features}")
print(f" Samples: {df.shape[0]}")
print(f" Classes: {len(targets)} - {targets}")
print(f" Target distribution: {df['target'].value_counts().sort_index().to_dict()}")

### Create and run OctoClassification
study = OctoClassification(
Expand Down
15 changes: 10 additions & 5 deletions examples/basic_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,28 @@
### Necessary imports for this example
import os

from sklearn.datasets import load_diabetes

from octopus.example_data import load_diabetes_data
from octopus.study import OctoRegression

### Load the diabetes dataset
diabetes = load_diabetes(as_frame=True)
df, features, targets = load_diabetes_data()

print("Dataset info:")
print(f" Features: {len(features)} - {features}")
print(f" Samples: {df.shape[0]}")
print(f" Classes: {len(targets)} - {targets}")
print(f" Target distribution: {df['target'].value_counts().sort_index().to_dict()}")

### Create and run OctoRegression
study = OctoRegression(
name="basic_regression",
path=os.environ.get("STUDIES_PATH", "./studies"),
target_metric="MAE",
feature_cols=diabetes["feature_names"],
feature_cols=features,
target_col="target",
sample_id_col="index",
)

study.fit(data=diabetes["frame"].reset_index())
study.fit(data=df)

print("Workflow completed")
15 changes: 10 additions & 5 deletions examples/multi_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,25 @@
### Necessary imports for this example
import os

from sklearn.datasets import load_diabetes

from octopus.example_data import load_diabetes_data
from octopus.modules import Mrmr, Octo
from octopus.study import OctoRegression

### Load the diabetes dataset
diabetes = load_diabetes(as_frame=True)
df, features, targets = load_diabetes_data()

print("Dataset info:")
print(f" Features: {len(features)} - {features}")
print(f" Samples: {df.shape[0]}")
print(f" Classes: {len(targets)} - {targets}")
print(f" Target distribution: {df['target'].value_counts().sort_index().to_dict()}")

### Create and run OctoRegression with multi-step workflow
study = OctoRegression(
name="example_multiworkflow",
path=os.environ.get("STUDIES_PATH", "./studies"),
target_metric="R2",
feature_cols=diabetes["feature_names"],
feature_cols=features,
target_col="target",
sample_id_col="index",
ignore_data_health_warning=True,
Expand Down Expand Up @@ -52,6 +57,6 @@
],
)

study.fit(data=diabetes["frame"].reset_index())
study.fit(data=df)

print("Multi-workflow completed")
26 changes: 15 additions & 11 deletions examples/use_own_hyperparameters.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,37 @@
"""Example for using custom hyperparameters in Octopus regression."""

# This example demonstrates how to use Octopus with custom hyperparameters.
# This example demonstrates how to use custom hyperparameters with Octopus.
# The key difference from the basic example is the use of the `hyperparameters` parameter
# in the Octo configuration, where you can define custom hyperparameter ranges
# for each model using the Hyperparameter class.

# Instead of letting Optuna automatically search the hyperparameter space,
# you can define your own hyperparameter ranges for the models.
# We will use the diabetes dataset for this purpose.

### Necessary imports for this example
import os

from sklearn.datasets import load_diabetes

from octopus.example_data import load_diabetes_data
from octopus.models.hyperparameter import IntHyperparameter
from octopus.modules import Octo
from octopus.study import OctoRegression

### Load the diabetes dataset
diabetes = load_diabetes(as_frame=True)
df, features, targets = load_diabetes_data()

print("Dataset info:")
print(f" Features: {len(features)} - {features}")
print(f" Samples: {df.shape[0]}")
print(f" Classes: {len(targets)} - {targets}")
print(f" Target distribution: {df['target'].value_counts().sort_index().to_dict()}")

### Create and run OctoRegression with custom hyperparameters
study = OctoRegression(
name="use_own_hyperparameters_example",
path=os.environ.get("STUDIES_PATH", "./studies"),
target_metric="MAE",
feature_cols=diabetes["feature_names"],
feature_cols=features,
target_col="target",
sample_id_col="index",
ignore_data_health_warning=True,
Expand All @@ -43,11 +52,6 @@
],
)

study.fit(data=diabetes["frame"].reset_index())
study.fit(data=df)

print("Workflow completed")

# This example demonstrates how to use custom hyperparameters with Octopus.
# The key difference from the basic example is the use of the `hyperparameters` parameter
# in the Octo configuration, where you can define custom hyperparameter ranges
# for each model using the Hyperparameter class.
14 changes: 4 additions & 10 deletions examples/wf_multiclass_wine.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,17 @@

import os

from sklearn.datasets import load_wine

from octopus.example_data import load_wine_data
from octopus.modules import Octo
from octopus.study import OctoClassification

### Load and Preprocess Data
wine = load_wine(as_frame=True)

df = wine["frame"].reset_index()
df.columns = df.columns.str.replace(" ", "_")
features = list(wine["feature_names"])
features = [feature.replace(" ", "_") for feature in features]
df, features, targets = load_wine_data()

print("Dataset info:")
print(f" Features: {len(features)}")
print(f" Features: {len(features)} - {features}")
print(f" Samples: {df.shape[0]}")
print(f" Classes: {len(wine.target_names)} - {wine.target_names}")
print(f" Classes: {len(targets)} - {targets}")
print(f" Target distribution: {df['target'].value_counts().sort_index().to_dict()}")

### Create and run OctoClassification for multiclass classification
Expand Down
14 changes: 7 additions & 7 deletions examples/wf_roc_octo.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@

import os

from sklearn.datasets import load_breast_cancer

from octopus.example_data import load_breast_cancer_data
from octopus.modules import Octo, Roc
from octopus.study import OctoClassification

Expand All @@ -19,12 +18,13 @@
# This is a binary classification dataset with 30 features
# Target: 0 = malignant, 1 = benign

breast_cancer = load_breast_cancer(as_frame=True)
df, features, targets = load_breast_cancer_data()

df = breast_cancer["frame"].reset_index()
df.columns = df.columns.str.replace(" ", "_")
features = list(breast_cancer["feature_names"])
features = [feature.replace(" ", "_") for feature in features]
print("Dataset info:")
print(f" Features: {len(features)} - {features}")
print(f" Samples: {df.shape[0]}")
print(f" Classes: {len(targets)} - {targets}")
print(f" Target distribution: {df['target'].value_counts().sort_index().to_dict()}")

### Create and run OctoClassification with ROC + Octo workflow

Expand Down
40 changes: 40 additions & 0 deletions octopus/example_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Example data sets for use in Octopus examples."""

import pandas as pd
from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.utils import Bunch


def load_breast_cancer_data() -> tuple[pd.DataFrame, list[str], list[str]]:
    """Return the breast cancer example data as ``(df, features, targets)``.

    The data comes from scikit-learn's ``load_breast_cancer(as_frame=True)``.

    Returns:
        A tuple of three items:

        * the data frame after ``reset_index()``, with every space in a
          column name replaced by an underscore,
        * the feature names, with spaces replaced by underscores in the same way,
        * the dataset's target names, each converted to ``str``.
    """
    bunch: Bunch = load_breast_cancer(as_frame=True)  # type: ignore[assignment]

    frame = bunch["frame"].reset_index()
    frame.columns = frame.columns.str.replace(" ", "_")

    # Feature names get the same space-to-underscore replacement as the columns.
    feature_names = [name.replace(" ", "_") for name in bunch["feature_names"]]
    target_names = [str(name) for name in bunch["target_names"]]

    return frame, feature_names, target_names
Comment on lines +8 to +17
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we put data code in a separate folder?



def load_diabetes_data() -> tuple[pd.DataFrame, list[str], list[str]]:
    """Return the diabetes example data as ``(df, features, targets)``.

    The data comes from scikit-learn's ``load_diabetes(as_frame=True)``.

    Returns:
        A tuple of three items:

        * the data frame after ``reset_index()``,
        * the feature names, each converted to ``str``,
        * the fixed one-element list ``["target"]``; unlike the
          classification loaders, no class names are read from the dataset.
    """
    bunch: Bunch = load_diabetes(as_frame=True)  # type: ignore[assignment]

    frame = bunch["frame"].reset_index()
    feature_names = list(map(str, bunch["feature_names"]))

    return frame, feature_names, ["target"]


def load_wine_data() -> tuple[pd.DataFrame, list[str], list[str]]:
    """Return the wine example data as ``(df, features, targets)``.

    The data comes from scikit-learn's ``load_wine(as_frame=True)``.

    Returns:
        A tuple of three items:

        * the data frame after ``reset_index()``, with every space in a
          column name replaced by an underscore,
        * the feature names, with spaces replaced by underscores in the same way,
        * the dataset's target names, each converted to ``str``.
    """
    bunch: Bunch = load_wine(as_frame=True)  # type: ignore[assignment]

    frame = bunch["frame"].reset_index()
    frame.columns = frame.columns.str.replace(" ", "_")

    # Feature names get the same space-to-underscore replacement as the columns.
    feature_names = [name.replace(" ", "_") for name in bunch["feature_names"]]
    target_names = [str(name) for name in bunch["target_names"]]

    return frame, feature_names, target_names
5 changes: 3 additions & 2 deletions octopus/manager/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import math
import os
from collections.abc import Sequence

from attrs import define, field, validators

Expand Down Expand Up @@ -133,8 +134,8 @@ class OctoManager:
study_context: StudyContext = field(validator=[validators.instance_of(StudyContext)])
"""Frozen runtime context containing study configuration."""

workflow: list[Task] = field(validator=[validators.instance_of(list)])
"""List of workflow tasks to execute."""
workflow: Sequence[Task] = field(validator=[validators.instance_of(list)])
"""Workflow tasks to execute."""

outer_parallelization: bool = field(validator=[validators.instance_of(bool)])
"""Whether to run outersplits in parallel."""
Expand Down
6 changes: 5 additions & 1 deletion octopus/manager/workflow_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import json
from typing import TYPE_CHECKING

import pandas as pd
import ray
Expand All @@ -14,6 +15,9 @@
from octopus.modules import ModuleResult, ResultType, StudyContext, Task
from octopus.utils import calculate_feature_groups, parquet_save

if TYPE_CHECKING:
from collections.abc import Sequence

logger = get_logger()


Expand All @@ -33,7 +37,7 @@ class WorkflowTaskRunner:
"""

study_context: StudyContext = field(validator=[validators.instance_of(StudyContext)])
workflow: list[Task] = field(validator=[validators.instance_of(list)])
workflow: Sequence[Task] = field(validator=[validators.instance_of(list)])
Comment on lines -36 to +40
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why we have this mismatch: the attribute is annotated as `Sequence[Task]`, but the validator still requires a `list`.

cpus_per_outersplit: int = field(validator=[validators.instance_of(int)])

def run(self, outersplit_id: int, outersplit: OuterSplit) -> None:
Expand Down
8 changes: 6 additions & 2 deletions octopus/models/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ def decorator(factory: Callable[[], ModelConfig]) -> Callable[[], ModelConfig]:

return decorator

@classmethod
def get_registered_models(cls) -> list[ModelName]:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want this function? In the code you normally only want the models that match your ml_type, so this could become a source of errors if it is used somewhere.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we make it private by adding a leading underscore, @nihaase ?

"""Get a list of all registered model names."""
return [ModelName(name) for name in cls._config_factories]

@classmethod
def get_config(cls, name: ModelName) -> ModelConfig:
"""Get model configuration by name.
Expand Down Expand Up @@ -185,6 +190,5 @@ def validate_model_compatibility(cls, model_name: ModelName, ml_type: MLType) ->
config = cls.get_config(model_name)
if not config.supports_ml_type(ml_type):
raise ValueError(
f"Model '{model_name}' does not support ml_type '{ml_type.value}'. "
f"Supported types: {', '.join(t.value for t in config.ml_types)}"
f"Model '{model_name}' does not support ml_type '{ml_type.value}'. Supported types: {', '.join(t.value for t in config.ml_types)}"
)
2 changes: 1 addition & 1 deletion octopus/modules/octo/enssel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Ensemble selection."""

# TOBEDONE
# TODO
# - issue: ACC and BALACC need integer pooling values!
# - potential issue: check start_n, +1 or not
# - get FI and counts
Expand Down
4 changes: 2 additions & 2 deletions octopus/modules/octo/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,15 @@
from octopus.models import ModelName, Models
from octopus.types import MLType

# # TOBEDONE pipeline
# # TODO pipeline
# - implement cat encoding on module level
# - how to provide categorical info to catboost and other models?


logger = get_logger()


class TrainingConfig(TypedDict):
class TrainingConfig(TypedDict, total=False):
"""Training configuration type."""

outl_reduction: int
Expand Down
3 changes: 2 additions & 1 deletion octopus/study/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import platform
from abc import ABC, abstractmethod
from collections.abc import Sequence
from datetime import UTC

import pandas as pd
Expand Down Expand Up @@ -77,7 +78,7 @@ class OctoStudy(ABC):
run_single_outersplit_num: int = field(default=Factory(lambda: -1), validator=[validators.instance_of(int)])
"""Select a single outersplit to execute. Defaults to -1 to run all outersplits"""

workflow: list[Task] = field(
workflow: Sequence[Task] = field(
default=Factory(lambda: [Octo(task_id=0)]),
validator=[validators.instance_of(list), validate_workflow],
)
Expand Down
Loading
Loading