Skip to content

Commit 70e3d45

Browse files
committed
fix: restore files unintentionally reverted from main
The T2E commit was built from an older snapshot of main, which caused it to revert parquet utility functions, pyarrow dependency, FAQ content, docs references, and related usages across the codebase. This commit restores all those files to their current main state, leaving only the actual T2E changes in the PR diff.
1 parent e485b9d commit 70e3d45

File tree

18 files changed

+255
-53
lines changed

18 files changed

+255
-53
lines changed
File renamed without changes.

docs/faq.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,13 @@
11
# FAQ
2+
3+
## When loading `.parquet` files, categorical columns seem to be returned as `int`, losing the information that they were categorical.
4+
5+
This is a known issue with parquet file support in Python.
6+
Both existing libraries, `pyarrow` as well as `fastparquet`, do not exactly reproduce original input data types when it comes to categorical columns.
7+
See e.g. [Issue 29017](https://github.com/apache/arrow/issues/29017) and [Issue 27067](https://github.com/apache/arrow/issues/27067).
8+
9+
To ensure proper data type roundtrip, the module `octopus.utils` provides the functions `parquet_load()` and `parquet_save()` to store and reconstruct precise dtype information in the parquet metadata.
10+
Files written with `parquet_save()` are expected to be readable by any parquet-compatible reader.
11+
Still, proper dtypes are only guaranteed to be reconstructed using `parquet_load()`.
12+
13+
For details on which dtypes are tested and supported, see [tests/infrastructure/test_file_io.py](https://github.com/emdgroup/octopus/blob/main/tests/infrastructure/test_file_io.py).

docs/reference/diagnostics.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
::: octopus.diagnostics

docs/reference/predict.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
::: octopus.predict

mkdocs.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,15 @@ nav:
8787
- "concepts/concepts.md"
8888
- Nested Cross-Validation: "concepts/nested_cv.md"
8989
- Understanding the Results: "concepts/understanding_results.md"
90+
- Feature Importance: "concepts/feature_importance.md"
9091
- API Reference:
9192
- "reference/reference.md"
93+
- octopus.diagnostics: "reference/diagnostics.md"
9294
- octopus.manager: "reference/manager.md"
9395
- octopus.metrics: "reference/metrics.md"
9496
- octopus.models: "reference/models.md"
9597
- octopus.modules: "reference/modules.md"
98+
- octopus.predict: "reference/predict.md"
9699
- octopus.study: "reference/study.md"
97100

98101
- Project Info:

octopus/diagnostics/_data_loader.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212

1313
import pandas as pd
1414

15+
from octopus.utils import parquet_load
16+
1517
if TYPE_CHECKING:
16-
from pathlib import Path
18+
from upath import UPath
1719

1820

1921
def _extract_id_from_dirname(dirname: str, prefix: str) -> int | None:
@@ -32,7 +34,7 @@ def _extract_id_from_dirname(dirname: str, prefix: str) -> int | None:
3234
return None
3335

3436

35-
def load_parquet_glob(study_path: Path, pattern: str) -> pd.DataFrame:
37+
def load_parquet_glob(study_path: UPath, pattern: str) -> pd.DataFrame:
3638
"""Load and concatenate parquet files matching a glob pattern.
3739
3840
Extracts ``outersplit_id`` and ``task_id`` from the directory structure,
@@ -51,7 +53,7 @@ def load_parquet_glob(study_path: Path, pattern: str) -> pd.DataFrame:
5153
dfs: list[pd.DataFrame] = []
5254
for parquet_file in sorted(study_path.glob(pattern)):
5355
try:
54-
df = pd.read_parquet(parquet_file)
56+
df = parquet_load(parquet_file)
5557
except Exception:
5658
continue
5759

@@ -72,7 +74,7 @@ def load_parquet_glob(study_path: Path, pattern: str) -> pd.DataFrame:
7274
return pd.concat(dfs, ignore_index=True)
7375

7476

75-
def load_predictions(study_path: Path) -> pd.DataFrame:
77+
def load_predictions(study_path: UPath) -> pd.DataFrame:
7678
"""Load all predictions parquet files across outersplits and tasks.
7779
7880
Searches in ``outersplit*/task*/results/*/predictions.parquet`` to pick up
@@ -87,7 +89,7 @@ def load_predictions(study_path: Path) -> pd.DataFrame:
8789
return load_parquet_glob(study_path, "outersplit*/task*/results/*/predictions.parquet")
8890

8991

90-
def load_feature_importances(study_path: Path) -> pd.DataFrame:
92+
def load_feature_importances(study_path: UPath) -> pd.DataFrame:
9193
"""Load all feature importance parquet files across outersplits and tasks.
9294
9395
Searches in ``outersplit*/task*/results/*/feature_importances.parquet`` to pick up
@@ -102,7 +104,7 @@ def load_feature_importances(study_path: Path) -> pd.DataFrame:
102104
return load_parquet_glob(study_path, "outersplit*/task*/results/*/feature_importances.parquet")
103105

104106

105-
def load_optuna(study_path: Path) -> pd.DataFrame:
107+
def load_optuna(study_path: UPath) -> pd.DataFrame:
106108
"""Load all Optuna parquet files across outersplits and tasks.
107109
108110
Args:
@@ -114,7 +116,7 @@ def load_optuna(study_path: Path) -> pd.DataFrame:
114116
return load_parquet_glob(study_path, "outersplit*/task*/results/optuna_results.parquet")
115117

116118

117-
def load_scores(study_path: Path) -> pd.DataFrame:
119+
def load_scores(study_path: UPath) -> pd.DataFrame:
118120
"""Load all scores parquet files across outersplits and tasks.
119121
120122
Searches in ``outersplit*/task*/results/*/scores.parquet`` to pick up

octopus/manager/workflow_runner.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from octopus.datasplit import OuterSplit
1313
from octopus.logger import get_logger
1414
from octopus.modules import ModuleResult, ResultType, StudyContext, Task
15-
from octopus.utils import calculate_feature_groups
15+
from octopus.utils import calculate_feature_groups, parquet_save
1616

1717
logger = get_logger()
1818

@@ -55,9 +55,9 @@ def run(self, outersplit_id: int, outersplit: OuterSplit) -> None:
5555
fold_dir = self.study_context.output_path / f"outersplit{outersplit_id}"
5656
fold_dir.mkdir(parents=True, exist_ok=True)
5757
train_path = fold_dir / "data_traindev.parquet"
58-
outersplit.traindev.to_parquet(str(train_path), storage_options=train_path.storage_options, engine="pyarrow")
58+
parquet_save(outersplit.traindev, train_path)
5959
test_path = fold_dir / "data_test.parquet"
60-
outersplit.test.to_parquet(str(test_path), storage_options=test_path.storage_options, engine="pyarrow")
60+
parquet_save(outersplit.test, test_path)
6161

6262
# task_results: dict[task_id -> dict[ResultType, ModuleResult]]
6363
task_results: dict[int, dict[ResultType, ModuleResult]] = {}

octopus/modules/autogluon/core.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from octopus.metrics.utils import get_score_from_model
3232
from octopus.modules import FIDataset, FIMethod, ModuleExecution, ModuleResult, ResultType, StudyContext
3333
from octopus.types import MLType
34+
from octopus.utils import csv_save
3435

3536
if TYPE_CHECKING:
3637
from octopus.modules import (
@@ -401,19 +402,13 @@ def _save_leaderboard_info(
401402
# Save leaderboard
402403
leaderboard = model.leaderboard(ag_test_data, extra_info=True)
403404
leaderboard_path = results_dir / "leaderboard.csv"
404-
leaderboard.to_csv(
405-
str(leaderboard_path),
406-
storage_options=dict(leaderboard_path.storage_options),
407-
)
405+
csv_save(leaderboard, leaderboard_path)
408406

409407
# Save best model results
410408
best_model = leaderboard.iloc[0]
411409
best_result_df = pd.DataFrame(best_model).transpose()
412410
best_result_path = results_dir / "best_model_result.csv"
413-
best_result_df.to_csv(
414-
str(best_result_path),
415-
storage_options=dict(best_result_path.storage_options),
416-
)
411+
csv_save(best_result_df, best_result_path)
417412

418413
# Save model info
419414
model_info = model.info()

octopus/modules/octo/core.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from octopus.models import Models
1717
from octopus.modules import ModuleExecution, ModuleResult, ResultType
1818
from octopus.modules.mrmr.core import _maxrminr, _relevance_fstats
19-
from octopus.utils import joblib_load
19+
from octopus.utils import joblib_load, parquet_save
2020

2121
from .bag import Bag
2222
from .enssel import EnSel
@@ -407,11 +407,7 @@ def logging_callback(study, trial):
407407
}
408408
)
409409
dict_optuna_path = results_dir / "optuna_results.parquet"
410-
pd.DataFrame(dict_optuna).to_parquet(
411-
str(dict_optuna_path),
412-
storage_options=dict_optuna_path.storage_options,
413-
engine="pyarrow",
414-
)
410+
parquet_save(pd.DataFrame(dict_optuna), dict_optuna_path)
415411

416412
# display results
417413
logger.set_log_group(LogGroup.SCORES, f"OUTER {outersplit_id} SQE TBD")

octopus/modules/octo/training.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,13 @@ def calculate_fi_constant(self):
433433

434434
def calculate_fi_internal(self):
435435
"""Sklearn-provided internal feature importance (based on train dataset)."""
436+
# Handle unsupported "timetoevent" case as in your original code
437+
if getattr(self, "ml_type", None) == MLType.TIMETOEVENT:
438+
fi_df = pd.DataFrame(columns=["feature", "importance"])
439+
logger.warning("Internal features importances not available for timetoevent.")
440+
self.feature_importances["internal"] = fi_df
441+
return
442+
436443
# 1) Tree-based models exposing feature_importances_
437444
if hasattr(self.model, "feature_importances_"):
438445
fi = np.asarray(self.model.feature_importances_)

0 commit comments

Comments
 (0)