Skip to content

Commit 986efb5 — "several fixes"

Browse files · committed · 1 parent: 3bab23c · commit: 986efb5

File tree

3 files changed

+43
-29
lines changed

3 files changed

+43
-29
lines changed

octopus/modules/octo/bag.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -794,7 +794,8 @@ def predict_proba(self, x):
794794
# return mean of weighted predictions
795795
return np.sum(np.array(preds_lst), axis=0) / sum(weights_lst)
796796

797-
def _estimator_type(self):
797+
@property
798+
def _estimator_type(self) -> str:
798799
"""Return the estimator type for sklearn compatibility."""
799800
if self.ml_type in (MLType.BINARY, MLType.MULTICLASS):
800801
return "classifier"
@@ -806,7 +807,8 @@ def _estimator_type(self):
806807
class BagClassifier(BagBase, ClassifierMixin):
807808
"""Bag for classification tasks with sklearn ClassifierMixin."""
808809

809-
def _estimator_type(self): # type: ignore[override]
810+
@property
811+
def _estimator_type(self) -> str: # type: ignore[override]
810812
"""Return the estimator type for sklearn compatibility."""
811813
return "classifier"
812814

@@ -815,7 +817,8 @@ def _estimator_type(self): # type: ignore[override]
815817
class BagRegressor(BagBase, RegressorMixin):
816818
"""Bag for regression tasks with sklearn RegressorMixin."""
817819

818-
def _estimator_type(self): # type: ignore[override]
820+
@property
821+
def _estimator_type(self) -> str: # type: ignore[override]
819822
"""Return the estimator type for sklearn compatibility."""
820823
return "regressor"
821824

octopus/modules/octo/training.py

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import copy
44
import math
55
import statistics
6-
from typing import TypedDict
6+
from typing import Any, TypedDict
77

88
import numpy as np
99
import pandas as pd
@@ -85,7 +85,7 @@ class Training:
8585
training_weight: int = field(default=1, validator=[validators.instance_of(int)])
8686
"""Training weight for ensembling"""
8787

88-
model = field(default=None)
88+
model: Any = field(default=None)
8989
"""Model."""
9090

9191
predictions: dict = field(default=Factory(dict), validator=[validators.instance_of(dict)])
@@ -100,16 +100,16 @@ class Training:
100100
"""Features used."""
101101

102102
outlier_samples: list = field(default=Factory(list), validator=[validators.instance_of(list)])
103-
"""Outlie samples identified."""
103+
"""Outlier samples identified."""
104104

105105
is_fitted: bool = field(default=False, init=False)
106106
"""Flag indicating whether the training has been completed."""
107107

108-
preprocessing_pipeline = field(init=False)
108+
preprocessing_pipeline: ColumnTransformer | Pipeline = field(init=False)
109109
"""Preprocessing pipeline for data scaling, imputation, and categorical encoding."""
110110

111-
x_train_processed = field(default=None, init=False)
112-
"""Training data after pre-processing (outlier, impuation, scaling)."""
111+
x_train_processed: pd.DataFrame | None = field(default=None, init=False)
112+
"""Training data after pre-processing (outlier, imputation, scaling)."""
113113

114114
@property
115115
def outl_reduction(self) -> int:
@@ -169,7 +169,7 @@ def y_dev(self):
169169

170170
@property
171171
def y_test(self):
172-
"""y_dev."""
172+
"""y_test."""
173173
if self.ml_type == MLType.TIMETOEVENT:
174174
duration = self.data_test[self.target_assignments["duration"]]
175175
event = self.data_test[self.target_assignments["event"]]
@@ -186,9 +186,9 @@ def __attrs_post_init__(self):
186186

187187
def _relabel_processed_output(
188188
self,
189-
processed_data: np.ndarray,
189+
processed_data: Any,
190190
index: pd.Index | None = None,
191-
) -> pd.DataFrame | np.ndarray:
191+
) -> pd.DataFrame:
192192
"""Convert pipeline output to a correctly-labeled DataFrame in self.feature_cols order.
193193
194194
Handles the ColumnTransformer column reordering issue: ColumnTransformer outputs columns
@@ -203,8 +203,12 @@ def _relabel_processed_output(
203203
Returns:
204204
DataFrame with columns in self.feature_cols order, correctly labeled.
205205
"""
206+
# Convert sparse matrices to dense arrays
207+
if hasattr(processed_data, "toarray"):
208+
processed_data = processed_data.toarray()
209+
206210
if not (hasattr(processed_data, "shape") and len(processed_data.shape) == 2):
207-
return processed_data
211+
return pd.DataFrame(processed_data)
208212

209213
try:
210214
output_cols = list(self.preprocessing_pipeline.get_feature_names_out())
@@ -233,7 +237,7 @@ def _transform_to_dataframe(
233237
self,
234238
data: pd.DataFrame | np.ndarray,
235239
index: pd.Index | None = None,
236-
) -> pd.DataFrame | np.ndarray:
240+
) -> pd.DataFrame:
237241
"""Transform data through preprocessing pipeline and return correctly-labeled DataFrame.
238242
239243
Args:
@@ -242,7 +246,6 @@ def _transform_to_dataframe(
242246
243247
Returns:
244248
DataFrame with columns in self.feature_cols order, correctly labeled.
245-
Falls back to returning the raw array if it is not 2D.
246249
"""
247250
processed_data = self.preprocessing_pipeline.transform(data)
248251
return self._relabel_processed_output(processed_data, index=index)
@@ -536,7 +539,7 @@ def calculate_fi_group_permutation(self, partition="dev", n_repeats=10):
536539
logger.set_log_group(LogGroup.TRAINING, f"{self.training_id}")
537540

538541
logger.info(f"Calculating permutation feature importances ({partition}). This may take a while...")
539-
np.random.seed(42) # reproducibility
542+
rng = np.random.RandomState(42) # local random state for reproducibility
540543
# fixed confidence level
541544
confidence_level = 0.95
542545
feature_cols = self.feature_cols
@@ -551,6 +554,8 @@ def calculate_fi_group_permutation(self, partition="dev", n_repeats=10):
551554
data = pd.concat([self.x_dev_processed, self.data_dev[target_cols]], axis=1)
552555
elif partition == "test":
553556
data = pd.concat([self.x_test_processed, self.data_test[target_cols]], axis=1)
557+
else:
558+
raise ValueError(f"Invalid partition: '{partition}'. Must be 'dev' or 'test'.")
554559

555560
if not set(feature_cols).issubset(data.columns):
556561
raise ValueError("Features missing in provided dataset.")
@@ -581,7 +586,7 @@ def calculate_fi_group_permutation(self, partition="dev", n_repeats=10):
581586
# replace column with random selection from that column of data_all
582587
# we use data_all as the validation dataset may be small
583588
for feat in feature:
584-
data_pfi[feat] = np.random.choice(data[feat], len(data_pfi), replace=False)
589+
data_pfi[feat] = rng.choice(data[feat], len(data_pfi), replace=False)
585590
pfi_score = get_score_from_model(
586591
model,
587592
data_pfi,
@@ -625,7 +630,6 @@ def calculate_fi_group_permutation(self, partition="dev", n_repeats=10):
625630
def calculate_fi_permutation(self, partition="dev", n_repeats=10):
626631
"""Permutation feature importance."""
627632
logger.info(f"Calculating permutation feature importances ({partition}). This may take a while...")
628-
np.random.seed(42) # reproducibility
629633
if self.ml_type == MLType.TIMETOEVENT:
630634
# sksurv models only provide inbuilt scorer (CI)
631635
# more work needed to support other metrics
@@ -641,6 +645,8 @@ def calculate_fi_permutation(self, partition="dev", n_repeats=10):
641645
elif partition == "test":
642646
x = self.x_test_processed
643647
y = self.y_test
648+
else:
649+
raise ValueError(f"Invalid partition: '{partition}'. Must be 'dev' or 'test'.")
644650

645651
perm_importance = permutation_importance(
646652
self.model,
@@ -659,7 +665,6 @@ def calculate_fi_permutation(self, partition="dev", n_repeats=10):
659665

660666
def calculate_fi_lofo(self):
661667
"""LOFO feature importance."""
662-
np.random.seed(42) # reproducibility
663668
logger.info("Calculating LOFO feature importance. This may take a while...")
664669
# first, dev only
665670
feature_cols = self.feature_cols
@@ -690,6 +695,9 @@ def calculate_fi_lofo(self):
690695
feature_cols_dict = {x: [x] for x in feature_cols}
691696
lofo_features = {**feature_cols_dict, **self.feature_groups}
692697

698+
if self.x_train_processed is None:
699+
raise RuntimeError("x_train_processed is None — model must be fitted before calculating LOFO FI.")
700+
693701
# lofo
694702
fi_dev: list[tuple[str, float]] = []
695703
fi_test: list[tuple[str, float]] = []

tests/modules/octo/test_column_ordering.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212

1313
import numpy as np
1414
import pandas as pd
15-
import pytest
1615

1716
from octopus.models import Models
1817
from octopus.models.hyperparameter import (
@@ -222,9 +221,7 @@ def test_numerical_data_in_numerical_column(self):
222221

223222
# num1 should contain scaled numerical data (floats), not categorical codes
224223
num1_values = training.x_train_processed["num1"].values
225-
assert np.issubdtype(num1_values.dtype, np.floating), (
226-
f"num1 should be float dtype but got {num1_values.dtype}"
227-
)
224+
assert np.issubdtype(num1_values.dtype, np.floating), f"num1 should be float dtype but got {num1_values.dtype}"
228225
# The values should be scaled (StandardScaler) from the original ~N(10,2) distribution
229226
# They should NOT be categorical string values
230227
assert not any(isinstance(v, str) for v in num1_values), "num1 contains string values — column mislabeled!"
@@ -243,9 +240,7 @@ def test_categorical_data_in_categorical_column(self):
243240
unique_vals = set(cat1_values)
244241
# Original cat1 values are {0, 1, 2} — they should NOT be StandardScaler-transformed
245242
# (categorical columns only get imputation, not scaling)
246-
assert unique_vals.issubset({0.0, 1.0, 2.0}), (
247-
f"cat1 should contain only {{0, 1, 2}} but got {unique_vals}"
248-
)
243+
assert unique_vals.issubset({0.0, 1.0, 2.0}), f"cat1 should contain only {{0, 1, 2}} but got {unique_vals}"
249244

250245
def test_internal_fi_labels_correct_with_mixed_types(self):
251246
"""Verify feature importance labels are correct when mixed column types exist.
@@ -327,8 +322,13 @@ def test_predict_classification_with_mixed_types(self):
327322
data_train, data_dev, data_test = _split_data(data)
328323

329324
training = _create_training(
330-
data_train, data_dev, data_test, feature_cols, feature_groups,
331-
ml_type=MLType.BINARY, model_name="ExtraTreesClassifier",
325+
data_train,
326+
data_dev,
327+
data_test,
328+
feature_cols,
329+
feature_groups,
330+
ml_type=MLType.BINARY,
331+
model_name="ExtraTreesClassifier",
332332
)
333333
training.fit()
334334

@@ -355,10 +355,13 @@ def test_relabel_fallback_when_get_feature_names_out_fails(self):
355355
# Delete get_feature_names_out to trigger fallback
356356
class PipelineWithoutNames:
357357
"""Mock pipeline without get_feature_names_out."""
358+
358359
def __init__(self, pipeline):
359360
self._pipeline = pipeline
361+
360362
def transform(self, data):
361363
return self._pipeline.transform(data)
364+
362365
def fit_transform(self, data):
363366
return self._pipeline.fit_transform(data)
364367

@@ -369,4 +372,4 @@ def fit_transform(self, data):
369372
assert list(result.columns) == feature_cols
370373

371374
# Restore pipeline
372-
training.preprocessing_pipeline = original_pipeline
375+
training.preprocessing_pipeline = original_pipeline

0 commit comments

Comments (0)