# octopus/examples/basic_classification.py at 972d17d545f734762105dc51698296f8e87259ee · emdgroup/octopus · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""Basic example for using Octopus Classification."""

# This example demonstrates how to use Octopus to create a machine learning classification model.
# We will use the breast cancer dataset for this purpose.
# Please ensure your dataset is clean, with no missing values (`NaN`),
# and that all features are numeric.

### Necessary imports for this example
import os

from octopus.example_data import load_breast_cancer_data
from octopus.modules import Octo
from octopus.study import OctoClassification

### Load and Preprocess Data
# The loader returns three values, unpacked here as the data table (`df`),
# `features` and `targets`. `features` is later passed to the study as
# `feature_cols`.
# NOTE(review): `df` is used like a pandas DataFrame below (`.shape`,
# `df['target'].value_counts()`) — confirm against octopus.example_data.
df, features, targets = load_breast_cancer_data()

# Print a short summary of what was loaded before any training starts.
print("Dataset info:")
print(f"  Features: {len(features)} - {features}")
# shape[0] is the first dimension of `df`, reported as the sample count.
print(f"  Samples: {df.shape[0]}")
print(f"  Classes: {len(targets)} - {targets}")
# Counts per distinct value of the `target` column, ordered via sort_index().
print(f"  Target distribution: {df['target'].value_counts().sort_index().to_dict()}")

### Configure the OctoClassification study, then run it
# Output location: the STUDIES_PATH environment variable when it is set,
# "./studies" otherwise.
studies_root = os.environ.get("STUDIES_PATH", "./studies")

# The workflow is made of a single Octo task.
octo_step = Octo(
    description="step1_octo_full",
    task_id=0,
    depends_on=None,  # no upstream task: this first step depends on the input
    models=["ExtraTreesClassifier"],
    n_trials=100,  # trials spent on hyperparameter optimization
    n_folds_inner=5,  # number of inner folds
    max_features=30,  # all 30 features
    ensemble_selection=True,  # ensemble selection switched on
)

study = OctoClassification(
    name="basic_classification",
    path=studies_root,
    target_metric="AUCROC",
    feature_cols=features,
    target_col="target",
    sample_id_col="index",
    stratification_col="target",
    workflow=[octo_step],
)

# Fit the study on the loaded data table.
study.fit(data=df)

print("Workflow completed")