Skip to content

Commit c76b547

Browse files
committed
catboost multi-class fix
1 parent 826bb02 commit c76b547

File tree

3 files changed

+104
-87
lines changed

3 files changed

+104
-87
lines changed

octopus/models/classification_models.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,12 @@ def catboost_classifier() -> ModelConfig:
152152
"""CatBoost classification model config."""
153153
return ModelConfig(
154154
model_class=CatBoostClassifier,
155-
ml_types=[MLType.BINARY, MLType.MULTICLASS],
155+
# KNOWN ISSUE: CatBoost multiclass is disabled because shap.Explainer(model, bg)
156+
# segfaults in SHAP <=0.51 when using TreeExplainer's interventional mode with
157+
# CatBoost multiclass models. Re-enable once SHAP fixes this upstream.
158+
# See datasets_local/specifications_refactorfi/03_shap_catboost_segfault_proposal.md
159+
# Original: ml_types=[MLType.BINARY, MLType.MULTICLASS],
160+
ml_types=[MLType.BINARY],
156161
feature_method="internal",
157162
chpo_compatible=True,
158163
scaler=None,

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ keywords = [
2727
dynamic = ['version']
2828
dependencies = [
2929
"attrs>=25.3.0",
30-
"catboost>=1.2.8",
30+
"catboost>=1.2.10",
3131
"duckdb>=1.3",
3232
"lz4>=4.4",
3333
"networkx>=3.5",
@@ -38,7 +38,7 @@ dependencies = [
3838
"ray>=2.44,<2.47",
3939
"rapidfuzz>=3.14",
4040
"scikit-learn>=1.6.0",
41-
"shap>=0.48",
41+
"shap>=0.51",
4242
"llvmlite<0.46", # license change with 0.46
4343
"torch>=2.0",
4444
"xgboost>=3.0",
@@ -126,6 +126,7 @@ test = [
126126
"octopus-automl[examples]",
127127
"pytest>=8.4.2",
128128
"pytest-cov>=6.2.1",
129+
"pytest-forked>=1.6.0",
129130
"pytest-order>=1.3.0",
130131
"octopus-automl[autogluon]",
131132
"octopus-automl[boruta]",

tests/modules/test_training_feature_importances.py

Lines changed: 95 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
"""Feature Importance Test Suite for Training Class.
22
3-
Tests all feature importance methods across all available models using pytest parametrize.
3+
Tests all feature importance methods across all available models.
4+
Each test is marked with ``@pytest.mark.forked`` so it runs in its own
5+
subprocess — this provides complete isolation between tests, preventing:
6+
7+
- CatBoost's C++ destructor segfault when Python's GC finalizes objects
8+
- numba/llvmlite LLVM pass-manager crash from accumulated JIT compilations
9+
- Memory accumulation from session-scoped model caches
10+
11+
See datasets_local/specifications_refactorfi/02_ci_segfault_investigation.md
12+
for details on why this structure was chosen.
413
514
Usage:
615
pytest test_training_feature_importances.py -v
@@ -34,7 +43,7 @@
3443
"calculate_fi_permutation",
3544
"calculate_fi_lofo",
3645
"calculate_fi_featuresused_shap",
37-
# "calculate_fi_shap", # Excluded: kernel SHAP is too slow/memory-heavy for CI (see 02_ci_segfault_investigation.md)
46+
# "calculate_fi_shap", # Excluded: kernel SHAP is too slow/memory-heavy for CI
3847
]
3948

4049
ML_TYPE_CONFIGS = {
@@ -74,21 +83,22 @@ def _get_available_models_by_type():
7483
return models_by_type
7584

7685

77-
def _generate_test_params():
78-
"""Generate (ml_type, model_name, fi_method) param combos for pytest."""
86+
def _generate_model_params():
87+
"""Generate (ml_type, model_name) param combos for pytest.
88+
89+
Each combo gets one test that runs ALL FI methods sequentially.
90+
"""
7991
available_models = _get_available_models_by_type()
8092
params = []
8193
for ml_type, model_names in available_models.items():
8294
for model_name in model_names:
83-
for fi_method in FI_METHODS:
84-
params.append(
85-
pytest.param(
86-
ml_type,
87-
model_name,
88-
fi_method,
89-
id=f"{ml_type.value}-{model_name}-{fi_method}",
90-
)
95+
params.append(
96+
pytest.param(
97+
ml_type,
98+
model_name,
99+
id=f"{ml_type.value}-{model_name}",
91100
)
101+
)
92102
return params
93103

94104

@@ -120,6 +130,49 @@ def _get_default_model_params(model_name: str) -> dict:
120130
return params
121131

122132

133+
def _create_test_data():
    """Create a synthetic train/dev/test dataset with mixed data types.

    The global NumPy RNG is seeded from ``TEST_CONFIG["random_seed"]`` so the
    generated data is reproducible for a given configuration.

    Columns produced:
        - ``num_col1``, ``num_col2``: numeric features; ``num_col1`` has NaN
          injected in roughly 10% of rows.
        - ``nominal_col``: float-encoded categories {1, 2, 3} with NaN
          injected in roughly 5% of rows.
        - ``row_id``: running row index.
        - ``target_class``, ``target_multiclass``, ``target_reg``: targets.
        - ``duration``, ``event``: time-to-event columns.

    Returns:
        A ``(train, dev, test)`` tuple of DataFrames, each with a fresh
        0-based index. Split sizes come from ``TEST_CONFIG["test_split"]``
        and ``TEST_CONFIG["dev_split"]``; the test partition receives the
        rows left over after train and dev have been cut.
    """
    np.random.seed(TEST_CONFIG["random_seed"])
    n_samples = TEST_CONFIG["n_samples"]

    data = pd.DataFrame(
        {
            "num_col1": np.random.normal(10, 2, n_samples),
            "num_col2": np.random.normal(50, 10, n_samples),
        }
    )

    # Inject some NaN values to test robustness
    nan_mask = np.random.random(n_samples) < 0.1
    data.loc[nan_mask, "num_col1"] = np.nan

    nominal_col = np.random.choice([1, 2, 3], n_samples).astype(float)
    nominal_col[np.random.random(n_samples) < 0.05] = np.nan
    data["nominal_col"] = nominal_col

    data["row_id"] = range(n_samples)

    data["target_class"] = np.random.choice([0, 1], n_samples)
    data["target_multiclass"] = np.random.choice([0, 1, 2], n_samples)
    # The features contain injected NaN; impute with the column mean before
    # building the regression target so the target itself is never NaN.
    # fillna() draws no random numbers, so the RNG sequence is unaffected.
    data["target_reg"] = (
        0.5 * data["num_col1"].fillna(data["num_col1"].mean())
        + 0.3 * data["num_col2"].fillna(data["num_col2"].mean())
        + np.random.normal(0, 1, n_samples)
    )
    data["duration"] = np.random.exponential(10, n_samples)
    data["event"] = np.random.choice([True, False], n_samples, p=[0.7, 0.3])

    n_train = int(n_samples * (1 - TEST_CONFIG["test_split"] - TEST_CONFIG["dev_split"]))
    n_dev = int(n_samples * TEST_CONFIG["dev_split"])

    # Shuffle once, then slice contiguous ranges so the partitions are disjoint.
    indices = np.random.permutation(n_samples)
    train_idx = indices[:n_train]
    dev_idx = indices[n_train : n_train + n_dev]
    test_idx = indices[n_train + n_dev :]

    return (
        data.iloc[train_idx].reset_index(drop=True),
        data.iloc[dev_idx].reset_index(drop=True),
        data.iloc[test_idx].reset_index(drop=True),
    )
174+
175+
123176
def _create_training_instance(
124177
data_train: pd.DataFrame,
125178
data_dev: pd.DataFrame,
@@ -158,13 +211,13 @@ def _create_training_instance(
158211
)
159212

160213

161-
def _run_fi_method(training: Training, method_name: str):
214+
def _run_fi_method(training: Training, method_name: str) -> list[str]:
162215
"""Run a feature importance method and return the expected result key(s)."""
163216
if method_name == "calculate_fi_internal":
164217
training.calculate_fi_internal()
165218
return ["internal"]
166219
elif method_name == "calculate_fi_permutation":
167-
training.calculate_fi_permutation(partition="dev", n_repeats=1) # use_groups=True (default)
220+
training.calculate_fi_permutation(partition="dev", n_repeats=1)
168221
return ["permutation_dev"]
169222
elif method_name == "calculate_fi_lofo":
170223
training.calculate_fi_lofo()
@@ -179,84 +232,42 @@ def _run_fi_method(training: Training, method_name: str):
179232
raise ValueError(f"Unknown method: {method_name}")
180233

181234

182-
# Cache fitted Training instances across parameterized tests so each model is
183-
# only fitted once regardless of how many FI methods are tested against it.
184-
_fitted_training_cache: dict[tuple[MLType, str], Training] = {}
185-
186-
187-
@pytest.fixture(scope="session")
188-
def test_data():
189-
"""Create test dataset with mixed data types."""
190-
np.random.seed(TEST_CONFIG["random_seed"])
191-
n_samples = TEST_CONFIG["n_samples"]
192-
193-
data = pd.DataFrame(
194-
{
195-
"num_col1": np.random.normal(10, 2, n_samples),
196-
"num_col2": np.random.normal(50, 10, n_samples),
197-
}
198-
)
199-
200-
data.loc[::10, "num_col1"] = np.nan
235+
@pytest.mark.forked
236+
@pytest.mark.parametrize("ml_type,model_name", _generate_model_params())
237+
def test_feature_importance(ml_type, model_name):
238+
"""Test all FI methods for a single model in an isolated subprocess.
201239
202-
nominal_col = np.random.choice([1, 2, 3], n_samples).astype(float)
203-
nominal_col[::15] = np.nan
204-
data["nominal_col"] = nominal_col
240+
Each test runs in its own forked process (``@pytest.mark.forked``),
241+
providing complete isolation. This prevents:
205242
206-
data["row_id"] = range(n_samples)
243+
- CatBoost C++ destructor segfaults during garbage collection
244+
- numba/llvmlite LLVM pass-manager crashes from accumulated JIT state
245+
- Memory accumulation across tests
207246
208-
data["target_class"] = np.random.choice([0, 1], n_samples)
209-
data["target_multiclass"] = np.random.choice([0, 1, 2], n_samples)
210-
data["target_reg"] = (
211-
0.5 * data["num_col1"].fillna(data["num_col1"].mean())
212-
+ 0.3 * data["num_col2"].fillna(data["num_col2"].mean())
213-
+ np.random.normal(0, 1, n_samples)
214-
)
215-
data["duration"] = np.random.exponential(10, n_samples)
216-
data["event"] = np.random.choice([True, False], n_samples, p=[0.7, 0.3])
217-
218-
n_train = int(n_samples * (1 - TEST_CONFIG["test_split"] - TEST_CONFIG["dev_split"]))
219-
n_dev = int(n_samples * TEST_CONFIG["dev_split"])
220-
221-
indices = np.random.permutation(n_samples)
222-
train_idx = indices[:n_train]
223-
dev_idx = indices[n_train : n_train + n_dev]
224-
test_idx = indices[n_train + n_dev :]
225-
226-
return (
227-
data.iloc[train_idx].reset_index(drop=True),
228-
data.iloc[dev_idx].reset_index(drop=True),
229-
data.iloc[test_idx].reset_index(drop=True),
230-
)
231-
232-
233-
@pytest.mark.parametrize("ml_type,model_name,fi_method", _generate_test_params())
234-
def test_feature_importance(test_data, ml_type, model_name, fi_method):
235-
"""Test a single feature importance method for a single model."""
247+
The model is fitted once, all FI methods run sequentially, and the
248+
entire process exits cleanly when the test completes.
249+
"""
236250
warnings.filterwarnings("ignore")
237251

238-
data_train, data_dev, data_test = test_data
252+
data_train, data_dev, data_test = _create_test_data()
239253
feature_cols = ["num_col1", "num_col2", "nominal_col"]
240254
feature_groups = {
241255
"numerical_group": ["num_col1", "num_col2"],
242256
"categorical_group": ["nominal_col"],
243257
}
244258

245-
cache_key = (ml_type, model_name)
246-
if cache_key not in _fitted_training_cache:
247-
training = _create_training_instance(
248-
data_train, data_dev, data_test, ml_type, model_name, feature_cols, feature_groups
249-
)
250-
training.fit()
251-
_fitted_training_cache[cache_key] = training
252-
253-
training = _fitted_training_cache[cache_key]
254-
fi_keys = _run_fi_method(training, fi_method)
255-
256-
for key in fi_keys:
257-
fi_data = training.feature_importances.get(key)
258-
assert fi_data is not None, f"Feature importance key '{key}' not found after {fi_method}"
259-
# calculate_fi_internal legitimately returns empty for models without
260-
# built-in feature importances (e.g. GaussianProcess, SVM with non-linear kernel)
261-
if fi_method != "calculate_fi_internal":
262-
assert len(fi_data) > 0, f"Feature importance '{key}' is empty after {fi_method}"
259+
training = _create_training_instance(
260+
data_train, data_dev, data_test, ml_type, model_name, feature_cols, feature_groups
261+
)
262+
training.fit()
263+
264+
for fi_method in FI_METHODS:
265+
fi_keys = _run_fi_method(training, fi_method)
266+
267+
for key in fi_keys:
268+
fi_data = training.feature_importances.get(key)
269+
assert fi_data is not None, f"Feature importance key '{key}' not found after {fi_method}"
270+
# calculate_fi_internal legitimately returns empty for models without
271+
# built-in feature importances (e.g. GaussianProcess, SVM with non-linear kernel)
272+
if fi_method != "calculate_fi_internal":
273+
assert len(fi_data) > 0, f"Feature importance '{key}' is empty after {fi_method}"

0 commit comments

Comments
 (0)