Skip to content

Commit 986efb5 — "several fixes"

Browse files · committed · 1 parent: 3bab23c · commit: 986efb5

File tree

3 files changed

+43
-29
lines changed

3 files changed

+43
-29
lines changed

octopus/modules/octo/bag.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -794,7 +794,8 @@ def predict_proba(self, x):
794794
# return mean of weighted predictions
795795
return np.sum(np.array(preds_lst), axis=0) / sum(weights_lst)
796796

797-
def _estimator_type(self):
797+
@property
798+
def _estimator_type(self) -> str:
798799
"""Return the estimator type for sklearn compatibility."""
799800
if self.ml_type in (MLType.BINARY, MLType.MULTICLASS):
800801
return "classifier"
@@ -806,7 +807,8 @@ def _estimator_type(self):
806807
class BagClassifier(BagBase, ClassifierMixin):
807808
"""Bag for classification tasks with sklearn ClassifierMixin."""
808809

809-
def _estimator_type(self): # type: ignore[override]
810+
@property
811+
def _estimator_type(self) -> str: # type: ignore[override]
810812
"""Return the estimator type for sklearn compatibility."""
811813
return "classifier"
812814

@@ -815,7 +817,8 @@ def _estimator_type(self): # type: ignore[override]
815817
class BagRegressor(BagBase, RegressorMixin):
816818
"""Bag for regression tasks with sklearn RegressorMixin."""
817819

818-
def _estimator_type(self): # type: ignore[override]
820+
@property
821+
def _estimator_type(self) -> str: # type: ignore[override]
819822
"""Return the estimator type for sklearn compatibility."""
820823
return "regressor"
821824

octopus/modules/octo/training.py

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import copy
44
import math
55
import statistics
6-
from typing import TypedDict
6+
from typing import Any, TypedDict
77

88
import numpy as np
99
import pandas as pd
@@ -85,7 +85,7 @@ class Training:
8585
training_weight: int = field(default=1, validator=[validators.instance_of(int)])
8686
"""Training weight for ensembling"""
8787

88-
model = field(default=None)
88+
model: Any = field(default=None)
8989
"""Model."""
9090

9191
predictions: dict = field(default=Factory(dict), validator=[validators.instance_of(dict)])
@@ -100,16 +100,16 @@ class Training:
100100
"""Features used."""
101101

102102
outlier_samples: list = field(default=Factory(list), validator=[validators.instance_of(list)])
103-
"""Outlie samples identified."""
103+
"""Outlier samples identified."""
104104

105105
is_fitted: bool = field(default=False, init=False)
106106
"""Flag indicating whether the training has been completed."""
107107

108-
preprocessing_pipeline = field(init=False)
108+
preprocessing_pipeline: ColumnTransformer | Pipeline = field(init=False)
109109
"""Preprocessing pipeline for data scaling, imputation, and categorical encoding."""
110110

111-
x_train_processed = field(default=None, init=False)
112-
"""Training data after pre-processing (outlier, impuation, scaling)."""
111+
x_train_processed: pd.DataFrame | None = field(default=None, init=False)
112+
"""Training data after pre-processing (outlier, imputation, scaling)."""
113113

114114
@property
115115
def outl_reduction(self) -> int:
@@ -169,7 +169,7 @@ def y_dev(self):
169169

170170
@property
171171
def y_test(self):
172-
"""y_dev."""
172+
"""y_test."""
173173
if self.ml_type == MLType.TIMETOEVENT:
174174
duration = self.data_test[self.target_assignments["duration"]]
175175
event = self.data_test[self.target_assignments["event"]]
@@ -186,9 +186,9 @@ def __attrs_post_init__(self):
186186

187187
def _relabel_processed_output(
188188
self,
189-
processed_data: np.ndarray,
189+
processed_data: Any,
190190
index: pd.Index | None = None,
191-
) -> pd.DataFrame | np.ndarray:
191+
) -> pd.DataFrame:
192192
"""Convert pipeline output to a correctly-labeled DataFrame in self.feature_cols order.
193193
194194
Handles the ColumnTransformer column reordering issue: ColumnTransformer outputs columns
@@ -203,8 +203,12 @@ def _relabel_processed_output(
203203
Returns:
204204
DataFrame with columns in self.feature_cols order, correctly labeled.
205205
"""
206+
# Convert sparse matrices to dense arrays
207+
if hasattr(processed_data, "toarray"):
208+
processed_data = processed_data.toarray()
209+
206210
if not (hasattr(processed_data, "shape") and len(processed_data.shape) == 2):
207-
return processed_data
211+
return pd.DataFrame(processed_data)
208212

209213
try:
210214
output_cols = list(self.preprocessing_pipeline.get_feature_names_out())
@@ -233,7 +237,7 @@ def _transform_to_dataframe(
233237
self,
234238
data: pd.DataFrame | np.ndarray,
235239
index: pd.Index | None = None,
236-
) -> pd.DataFrame | np.ndarray:
240+
) -> pd.DataFrame:
237241
"""Transform data through preprocessing pipeline and return correctly-labeled DataFrame.
238242
239243
Args:
@@ -242,7 +246,6 @@ def _transform_to_dataframe(
242246
243247
Returns:
244248
DataFrame with columns in self.feature_cols order, correctly labeled.
245-
Falls back to returning the raw array if it is not 2D.
246249
"""
247250
processed_data = self.preprocessing_pipeline.transform(data)
248251
return self._relabel_processed_output(processed_data, index=index)
@@ -536,7 +539,7 @@ def calculate_fi_group_permutation(self, partition="dev", n_repeats=10):
536539
logger.set_log_group(LogGroup.TRAINING, f"{self.training_id}")
537540

538541
logger.info(f"Calculating permutation feature importances ({partition}). This may take a while...")
539-
np.random.seed(42) # reproducibility
542+
rng = np.random.RandomState(42) # local random state for reproducibility
540543
# fixed confidence level
541544
confidence_level = 0.95
542545
feature_cols = self.feature_cols
@@ -551,6 +554,8 @@ def calculate_fi_group_permutation(self, partition="dev", n_repeats=10):
551554
data = pd.concat([self.x_dev_processed, self.data_dev[target_cols]], axis=1)
552555
elif partition == "test":
553556
data = pd.concat([self.x_test_processed, self.data_test[target_cols]], axis=1)
557+
else:
558+
raise ValueError(f"Invalid partition: '{partition}'. Must be 'dev' or 'test'.")
554559

555560
if not set(feature_cols).issubset(data.columns):
556561
raise ValueError("Features missing in provided dataset.")
@@ -581,7 +586,7 @@ def calculate_fi_group_permutation(self, partition="dev", n_repeats=10):
581586
# replace column with random selection from that column of data_all
582587
# we use data_all as the validation dataset may be small
583588
for feat in feature:
584-
data_pfi[feat] = np.random.choice(data[feat], len(data_pfi), replace=False)
589+
data_pfi[feat] = rng.choice(data[feat], len(data_pfi), replace=False)
585590
pfi_score = get_score_from_model(
586591
model,
587592
data_pfi,
@@ -625,7 +630,6 @@ def calculate_fi_group_permutation(self, partition="dev", n_repeats=10):
625630
def calculate_fi_permutation(self, partition="dev", n_repeats=10):
626631
"""Permutation feature importance."""
627632
logger.info(f"Calculating permutation feature importances ({partition}). This may take a while...")
628-
np.random.seed(42) # reproducibility
629633
if self.ml_type == MLType.TIMETOEVENT:
630634
# sksurv models only provide inbuilt scorer (CI)
631635
# more work needed to support other metrics
@@ -641,6 +645,8 @@ def calculate_fi_permutation(self, partition="dev", n_repeats=10):
641645
elif partition == "test":
642646
x = self.x_test_processed
643647
y = self.y_test
648+
else:
649+
raise ValueError(f"Invalid partition: '{partition}'. Must be 'dev' or 'test'.")
644650

645651
perm_importance = permutation_importance(
646652
self.model,
@@ -659,7 +665,6 @@ def calculate_fi_permutation(self, partition="dev", n_repeats=10):
659665

660666
def calculate_fi_lofo(self):
661667
"""LOFO feature importance."""
662-
np.random.seed(42) # reproducibility
663668
logger.info("Calculating LOFO feature importance. This may take a while...")
664669
# first, dev only
665670
feature_cols = self.feature_cols
@@ -690,6 +695,9 @@ def calculate_fi_lofo(self):
690695
feature_cols_dict = {x: [x] for x in feature_cols}
691696
lofo_features = {**feature_cols_dict, **self.feature_groups}
692697

698+
if self.x_train_processed is None:
699+
raise RuntimeError("x_train_processed is None — model must be fitted before calculating LOFO FI.")
700+
693701
# lofo
694702
fi_dev: list[tuple[str, float]] = []
695703
fi_test: list[tuple[str, float]] = []

tests/modules/octo/test_column_ordering.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212

1313
import numpy as np
1414
import pandas as pd
15-
import pytest
1615

1716
from octopus.models import Models
1817
from octopus.models.hyperparameter import (
@@ -222,9 +221,7 @@ def test_numerical_data_in_numerical_column(self):
222221

223222
# num1 should contain scaled numerical data (floats), not categorical codes
224223
num1_values = training.x_train_processed["num1"].values
225-
assert np.issubdtype(num1_values.dtype, np.floating), (
226-
f"num1 should be float dtype but got {num1_values.dtype}"
227-
)
224+
assert np.issubdtype(num1_values.dtype, np.floating), f"num1 should be float dtype but got {num1_values.dtype}"
228225
# The values should be scaled (StandardScaler) from the original ~N(10,2) distribution
229226
# They should NOT be categorical string values
230227
assert not any(isinstance(v, str) for v in num1_values), "num1 contains string values — column mislabeled!"
@@ -243,9 +240,7 @@ def test_categorical_data_in_categorical_column(self):
243240
unique_vals = set(cat1_values)
244241
# Original cat1 values are {0, 1, 2} — they should NOT be StandardScaler-transformed
245242
# (categorical columns only get imputation, not scaling)
246-
assert unique_vals.issubset({0.0, 1.0, 2.0}), (
247-
f"cat1 should contain only {{0, 1, 2}} but got {unique_vals}"
248-
)
243+
assert unique_vals.issubset({0.0, 1.0, 2.0}), f"cat1 should contain only {{0, 1, 2}} but got {unique_vals}"
249244

250245
def test_internal_fi_labels_correct_with_mixed_types(self):
251246
"""Verify feature importance labels are correct when mixed column types exist.
@@ -327,8 +322,13 @@ def test_predict_classification_with_mixed_types(self):
327322
data_train, data_dev, data_test = _split_data(data)
328323

329324
training = _create_training(
330-
data_train, data_dev, data_test, feature_cols, feature_groups,
331-
ml_type=MLType.BINARY, model_name="ExtraTreesClassifier",
325+
data_train,
326+
data_dev,
327+
data_test,
328+
feature_cols,
329+
feature_groups,
330+
ml_type=MLType.BINARY,
331+
model_name="ExtraTreesClassifier",
332332
)
333333
training.fit()
334334

@@ -355,10 +355,13 @@ def test_relabel_fallback_when_get_feature_names_out_fails(self):
355355
# Delete get_feature_names_out to trigger fallback
356356
class PipelineWithoutNames:
357357
"""Mock pipeline without get_feature_names_out."""
358+
358359
def __init__(self, pipeline):
359360
self._pipeline = pipeline
361+
360362
def transform(self, data):
361363
return self._pipeline.transform(data)
364+
362365
def fit_transform(self, data):
363366
return self._pipeline.fit_transform(data)
364367

@@ -369,4 +372,4 @@ def fit_transform(self, data):
369372
assert list(result.columns) == feature_cols
370373

371374
# Restore pipeline
372-
training.preprocessing_pipeline = original_pipeline
375+
training.preprocessing_pipeline = original_pipeline

0 commit comments

Comments (0)