11"""Feature Importance Test Suite for Training Class.
22
3- Tests all feature importance methods across all available models using pytest parametrize.
3+ Tests all feature importance methods across all available models.
4+ Each test is marked with ``@pytest.mark.forked`` so it runs in its own
5+ subprocess — this provides complete isolation between tests, preventing:
6+
7+ - CatBoost's C++ destructor segfault when Python's GC finalizes objects
8+ - numba/llvmlite LLVM pass-manager crash from accumulated JIT compilations
9+ - Memory accumulation from session-scoped model caches
10+
11+ See datasets_local/specifications_refactorfi/02_ci_segfault_investigation.md
12+ for details on why this structure was chosen.
413
514Usage:
615 pytest test_training_feature_importances.py -v
3443 "calculate_fi_permutation" ,
3544 "calculate_fi_lofo" ,
3645 "calculate_fi_featuresused_shap" ,
37- # "calculate_fi_shap", # Excluded: kernel SHAP is too slow/memory-heavy for CI (see 02_ci_segfault_investigation.md)
46+ # "calculate_fi_shap", # Excluded: kernel SHAP is too slow/memory-heavy for CI
3847]
3948
4049ML_TYPE_CONFIGS = {
@@ -74,21 +83,22 @@ def _get_available_models_by_type():
7483 return models_by_type
7584
7685
def _generate_model_params():
    """Generate (ml_type, model_name) param combos for pytest.

    Each (ml_type, model_name) pair becomes a single pytest param; the
    corresponding test runs ALL FI methods sequentially against that
    one fitted model.
    """
    # Flatten the {ml_type: [model_name, ...]} mapping into pytest params,
    # giving each an id like "<ml_type>-<model_name>" for readable test names.
    return [
        pytest.param(
            ml_type,
            model_name,
            id=f"{ml_type.value}-{model_name}",
        )
        for ml_type, model_names in _get_available_models_by_type().items()
        for model_name in model_names
    ]
93103
94104
@@ -120,6 +130,49 @@ def _get_default_model_params(model_name: str) -> dict:
120130 return params
121131
122132
def _create_test_data():
    """Create train/dev/test DataFrames with mixed data types.

    Builds a synthetic dataset with two numeric features (NaNs injected
    into ``num_col1``), one nominal feature (with NaNs), a row id,
    binary / multiclass / regression targets, and survival columns
    (``duration``/``event``), then splits it into train/dev/test
    partitions according to ``TEST_CONFIG``.

    Returns:
        Tuple of (data_train, data_dev, data_test) DataFrames, each with
        a fresh 0-based index.
    """
    np.random.seed(TEST_CONFIG["random_seed"])
    n_samples = TEST_CONFIG["n_samples"]

    data = pd.DataFrame(
        {
            "num_col1": np.random.normal(10, 2, n_samples),
            "num_col2": np.random.normal(50, 10, n_samples),
        }
    )

    # Inject some NaN values to test robustness of the FI methods.
    nan_mask = np.random.random(n_samples) < 0.1
    data.loc[nan_mask, "num_col1"] = np.nan

    nominal_col = np.random.choice([1, 2, 3], n_samples).astype(float)
    nominal_col[np.random.random(n_samples) < 0.05] = np.nan
    data["nominal_col"] = nominal_col

    data["row_id"] = range(n_samples)

    data["target_class"] = np.random.choice([0, 1], n_samples)
    data["target_multiclass"] = np.random.choice([0, 1, 2], n_samples)
    # Compute the regression target from NaN-filled copies so the target
    # itself contains no NaNs (NaN targets break model fitting); the
    # feature columns keep their injected NaNs for robustness testing.
    data["target_reg"] = (
        0.5 * data["num_col1"].fillna(data["num_col1"].mean())
        + 0.3 * data["num_col2"].fillna(data["num_col2"].mean())
        + np.random.normal(0, 1, n_samples)
    )
    data["duration"] = np.random.exponential(10, n_samples)
    data["event"] = np.random.choice([True, False], n_samples, p=[0.7, 0.3])

    # Split sizes: whatever is left after dev/test fractions goes to train.
    n_train = int(n_samples * (1 - TEST_CONFIG["test_split"] - TEST_CONFIG["dev_split"]))
    n_dev = int(n_samples * TEST_CONFIG["dev_split"])

    indices = np.random.permutation(n_samples)
    train_idx = indices[:n_train]
    dev_idx = indices[n_train : n_train + n_dev]
    test_idx = indices[n_train + n_dev :]

    return (
        data.iloc[train_idx].reset_index(drop=True),
        data.iloc[dev_idx].reset_index(drop=True),
        data.iloc[test_idx].reset_index(drop=True),
    )
175+
123176def _create_training_instance (
124177 data_train : pd .DataFrame ,
125178 data_dev : pd .DataFrame ,
@@ -158,13 +211,13 @@ def _create_training_instance(
158211 )
159212
160213
161- def _run_fi_method (training : Training , method_name : str ):
214+ def _run_fi_method (training : Training , method_name : str ) -> list [ str ] :
162215 """Run a feature importance method and return the expected result key(s)."""
163216 if method_name == "calculate_fi_internal" :
164217 training .calculate_fi_internal ()
165218 return ["internal" ]
166219 elif method_name == "calculate_fi_permutation" :
167- training .calculate_fi_permutation (partition = "dev" , n_repeats = 1 ) # use_groups=True (default)
220+ training .calculate_fi_permutation (partition = "dev" , n_repeats = 1 )
168221 return ["permutation_dev" ]
169222 elif method_name == "calculate_fi_lofo" :
170223 training .calculate_fi_lofo ()
@@ -179,84 +232,42 @@ def _run_fi_method(training: Training, method_name: str):
179232 raise ValueError (f"Unknown method: { method_name } " )
180233
181234
@pytest.mark.forked
@pytest.mark.parametrize("ml_type,model_name", _generate_model_params())
def test_feature_importance(ml_type, model_name):
    """Test all FI methods for a single model in an isolated subprocess.

    Each test runs in its own forked process (``@pytest.mark.forked``),
    providing complete isolation. This prevents:

    - CatBoost C++ destructor segfaults during garbage collection
    - numba/llvmlite LLVM pass-manager crashes from accumulated JIT state
    - Memory accumulation across tests

    The model is fitted once, all FI methods run sequentially, and the
    entire process exits cleanly when the test completes.
    """
    warnings.filterwarnings("ignore")

    train_df, dev_df, test_df = _create_test_data()

    # Fixed feature setup shared by every model under test.
    feature_cols = ["num_col1", "num_col2", "nominal_col"]
    feature_groups = {
        "numerical_group": ["num_col1", "num_col2"],
        "categorical_group": ["nominal_col"],
    }

    # Fit the model once; all FI methods below reuse the same instance.
    training = _create_training_instance(
        train_df, dev_df, test_df, ml_type, model_name, feature_cols, feature_groups
    )
    training.fit()

    for fi_method in FI_METHODS:
        result_keys = _run_fi_method(training, fi_method)

        for key in result_keys:
            fi_data = training.feature_importances.get(key)
            assert fi_data is not None, f"Feature importance key '{key}' not found after {fi_method}"
            # calculate_fi_internal legitimately returns empty for models without
            # built-in feature importances (e.g. GaussianProcess, SVM with non-linear kernel)
            if fi_method != "calculate_fi_internal":
                assert len(fi_data) > 0, f"Feature importance '{key}' is empty after {fi_method}"
0 commit comments