Skip to content

Commit 70e3d45

Browse files
committed
fix: restore files unintentionally reverted from main
The T2E commit was built from an older snapshot of main, which caused it to revert parquet utility functions, pyarrow dependency, FAQ content, docs references, and related usages across the codebase. This commit restores all those files to their current main state, leaving only the actual T2E changes in the PR diff.
1 parent e485b9d commit 70e3d45

File tree

18 files changed

+255
-53
lines changed

18 files changed

+255
-53
lines changed
File renamed without changes.

docs/faq.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,13 @@
11
# FAQ
2+
3+
## When loading `.parquet` files, categorical columns seem to be returned as `int`, losing the information that they were categorical.
4+
5+
This is a known issue with parquet file support in Python.
6+
Both existing libraries, `pyarrow` as well as `fastparquet`, do not exactly reproduce original input data types when it comes to categorical columns.
7+
See e.g. [Issue 29017](https://github.com/apache/arrow/issues/29017) and [Issue 27067](https://github.com/apache/arrow/issues/27067).
8+
9+
To ensure proper data type roundtrip, the module `octopus.utils` provides the functions `parquet_load()` and `parquet_save()` to store and reconstruct precise dtype information in the parquet metadata.
10+
Files written with `parquet_save()` are expected to be readable by any parquet-compatible reader.
11+
Still, proper dtypes are only guaranteed to be reconstructed using `parquet_load()`.
12+
13+
For details on which dtypes are tested and supported, see [tests/infrastructure/test_file_io.py](https://github.com/emdgroup/octopus/blob/main/tests/infrastructure/test_file_io.py).

docs/reference/diagnostics.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
::: octopus.diagnostics

docs/reference/predict.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
::: octopus.predict

mkdocs.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,15 @@ nav:
8787
- "concepts/concepts.md"
8888
- Nested Cross-Validation: "concepts/nested_cv.md"
8989
- Understanding the Results: "concepts/understanding_results.md"
90+
- Feature Importance: "concepts/feature_importance.md"
9091
- API Reference:
9192
- "reference/reference.md"
93+
- octopus.diagnostics: "reference/diagnostics.md"
9294
- octopus.manager: "reference/manager.md"
9395
- octopus.metrics: "reference/metrics.md"
9496
- octopus.models: "reference/models.md"
9597
- octopus.modules: "reference/modules.md"
98+
- octopus.predict: "reference/predict.md"
9699
- octopus.study: "reference/study.md"
97100

98101
- Project Info:

octopus/diagnostics/_data_loader.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212

1313
import pandas as pd
1414

15+
from octopus.utils import parquet_load
16+
1517
if TYPE_CHECKING:
16-
from pathlib import Path
18+
from upath import UPath
1719

1820

1921
def _extract_id_from_dirname(dirname: str, prefix: str) -> int | None:
@@ -32,7 +34,7 @@ def _extract_id_from_dirname(dirname: str, prefix: str) -> int | None:
3234
return None
3335

3436

35-
def load_parquet_glob(study_path: Path, pattern: str) -> pd.DataFrame:
37+
def load_parquet_glob(study_path: UPath, pattern: str) -> pd.DataFrame:
3638
"""Load and concatenate parquet files matching a glob pattern.
3739
3840
Extracts ``outersplit_id`` and ``task_id`` from the directory structure,
@@ -51,7 +53,7 @@ def load_parquet_glob(study_path: Path, pattern: str) -> pd.DataFrame:
5153
dfs: list[pd.DataFrame] = []
5254
for parquet_file in sorted(study_path.glob(pattern)):
5355
try:
54-
df = pd.read_parquet(parquet_file)
56+
df = parquet_load(parquet_file)
5557
except Exception:
5658
continue
5759

@@ -72,7 +74,7 @@ def load_parquet_glob(study_path: Path, pattern: str) -> pd.DataFrame:
7274
return pd.concat(dfs, ignore_index=True)
7375

7476

75-
def load_predictions(study_path: Path) -> pd.DataFrame:
77+
def load_predictions(study_path: UPath) -> pd.DataFrame:
7678
"""Load all predictions parquet files across outersplits and tasks.
7779
7880
Searches in ``outersplit*/task*/results/*/predictions.parquet`` to pick up
@@ -87,7 +89,7 @@ def load_predictions(study_path: Path) -> pd.DataFrame:
8789
return load_parquet_glob(study_path, "outersplit*/task*/results/*/predictions.parquet")
8890

8991

90-
def load_feature_importances(study_path: Path) -> pd.DataFrame:
92+
def load_feature_importances(study_path: UPath) -> pd.DataFrame:
9193
"""Load all feature importance parquet files across outersplits and tasks.
9294
9395
Searches in ``outersplit*/task*/results/*/feature_importances.parquet`` to pick up
@@ -102,7 +104,7 @@ def load_feature_importances(study_path: Path) -> pd.DataFrame:
102104
return load_parquet_glob(study_path, "outersplit*/task*/results/*/feature_importances.parquet")
103105

104106

105-
def load_optuna(study_path: Path) -> pd.DataFrame:
107+
def load_optuna(study_path: UPath) -> pd.DataFrame:
106108
"""Load all Optuna parquet files across outersplits and tasks.
107109
108110
Args:
@@ -114,7 +116,7 @@ def load_optuna(study_path: Path) -> pd.DataFrame:
114116
return load_parquet_glob(study_path, "outersplit*/task*/results/optuna_results.parquet")
115117

116118

117-
def load_scores(study_path: Path) -> pd.DataFrame:
119+
def load_scores(study_path: UPath) -> pd.DataFrame:
118120
"""Load all scores parquet files across outersplits and tasks.
119121
120122
Searches in ``outersplit*/task*/results/*/scores.parquet`` to pick up

octopus/manager/workflow_runner.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from octopus.datasplit import OuterSplit
1313
from octopus.logger import get_logger
1414
from octopus.modules import ModuleResult, ResultType, StudyContext, Task
15-
from octopus.utils import calculate_feature_groups
15+
from octopus.utils import calculate_feature_groups, parquet_save
1616

1717
logger = get_logger()
1818

@@ -55,9 +55,9 @@ def run(self, outersplit_id: int, outersplit: OuterSplit) -> None:
5555
fold_dir = self.study_context.output_path / f"outersplit{outersplit_id}"
5656
fold_dir.mkdir(parents=True, exist_ok=True)
5757
train_path = fold_dir / "data_traindev.parquet"
58-
outersplit.traindev.to_parquet(str(train_path), storage_options=train_path.storage_options, engine="pyarrow")
58+
parquet_save(outersplit.traindev, train_path)
5959
test_path = fold_dir / "data_test.parquet"
60-
outersplit.test.to_parquet(str(test_path), storage_options=test_path.storage_options, engine="pyarrow")
60+
parquet_save(outersplit.test, test_path)
6161

6262
# task_results: dict[task_id -> dict[ResultType, ModuleResult]]
6363
task_results: dict[int, dict[ResultType, ModuleResult]] = {}

octopus/modules/autogluon/core.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from octopus.metrics.utils import get_score_from_model
3232
from octopus.modules import FIDataset, FIMethod, ModuleExecution, ModuleResult, ResultType, StudyContext
3333
from octopus.types import MLType
34+
from octopus.utils import csv_save
3435

3536
if TYPE_CHECKING:
3637
from octopus.modules import (
@@ -401,19 +402,13 @@ def _save_leaderboard_info(
401402
# Save leaderboard
402403
leaderboard = model.leaderboard(ag_test_data, extra_info=True)
403404
leaderboard_path = results_dir / "leaderboard.csv"
404-
leaderboard.to_csv(
405-
str(leaderboard_path),
406-
storage_options=dict(leaderboard_path.storage_options),
407-
)
405+
csv_save(leaderboard, leaderboard_path)
408406

409407
# Save best model results
410408
best_model = leaderboard.iloc[0]
411409
best_result_df = pd.DataFrame(best_model).transpose()
412410
best_result_path = results_dir / "best_model_result.csv"
413-
best_result_df.to_csv(
414-
str(best_result_path),
415-
storage_options=dict(best_result_path.storage_options),
416-
)
411+
csv_save(best_result_df, best_result_path)
417412

418413
# Save model info
419414
model_info = model.info()

octopus/modules/octo/core.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from octopus.models import Models
1717
from octopus.modules import ModuleExecution, ModuleResult, ResultType
1818
from octopus.modules.mrmr.core import _maxrminr, _relevance_fstats
19-
from octopus.utils import joblib_load
19+
from octopus.utils import joblib_load, parquet_save
2020

2121
from .bag import Bag
2222
from .enssel import EnSel
@@ -407,11 +407,7 @@ def logging_callback(study, trial):
407407
}
408408
)
409409
dict_optuna_path = results_dir / "optuna_results.parquet"
410-
pd.DataFrame(dict_optuna).to_parquet(
411-
str(dict_optuna_path),
412-
storage_options=dict_optuna_path.storage_options,
413-
engine="pyarrow",
414-
)
410+
parquet_save(pd.DataFrame(dict_optuna), dict_optuna_path)
415411

416412
# display results
417413
logger.set_log_group(LogGroup.SCORES, f"OUTER {outersplit_id} SQE TBD")

octopus/modules/octo/training.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,13 @@ def calculate_fi_constant(self):
433433

434434
def calculate_fi_internal(self):
435435
"""Sklearn-provided internal feature importance (based on train dataset)."""
436+
# Handle unsupported "timetoevent" case as in your original code
437+
if getattr(self, "ml_type", None) == MLType.TIMETOEVENT:
438+
fi_df = pd.DataFrame(columns=["feature", "importance"])
439+
logger.warning("Internal features importances not available for timetoevent.")
440+
self.feature_importances["internal"] = fi_df
441+
return
442+
436443
# 1) Tree-based models exposing feature_importances_
437444
if hasattr(self.model, "feature_importances_"):
438445
fi = np.asarray(self.model.feature_importances_)

0 commit comments

Comments
 (0)