1919from sklearn .base import clone
2020from sklearn .externals import six
2121from sklearn .model_selection ._split import check_cv
22+ from sklearn .utils import safe_indexing
2223
2324
2425class StackingCVClassifier (BaseEstimator , ClassifierMixin , TransformerMixin ):
@@ -182,10 +183,19 @@ def fit(self, X, y, groups=None, sample_weight=None):
182183 # Override shuffle parameter in case of self generated
183184 # cross-validation strategy
184185 final_cv .shuffle = self .shuffle
185- skf = list (final_cv .split (X , y , groups ))
186186
187- all_model_predictions = np .array ([]).reshape (len (y ), 0 )
188- for model in self .clfs_ :
187+ folds = list (final_cv .split (X , y , groups ))
188+
189+ # Handle the case of X being a list of lists
190+ # by converting X into a numpy array
191+ if isinstance (X , list ):
192+ X = np .array (X )
193+
194+ meta_features = None
195+ n_folds = final_cv .get_n_splits ()
196+ n_models = len (self .clfs_ )
197+
198+ for n , model in enumerate (self .clfs_ ):
189199
190200 if self .verbose > 0 :
191201 i = self .clfs_ .index (model ) + 1
@@ -200,92 +210,41 @@ def fit(self, X, y, groups=None, sample_weight=None):
200210 if self .verbose > 1 :
201211 print (_name_estimators ((model ,))[0 ][1 ])
202212
203- if not self .use_probas :
204- single_model_prediction = np .array ([]).reshape (0 , 1 )
205- else :
206- single_model_prediction = np .array ([]).reshape (0 , len (set (y )))
213+ for num , (train_indices , test_indices ) in enumerate (folds ):
207214
208- for num , (train_index , test_index ) in enumerate (skf ):
215+ X_train = safe_indexing (X , train_indices )
216+ y_train = safe_indexing (y , train_indices )
209217
210218 if self .verbose > 0 :
211219 print ("Training and fitting fold %d of %d..." %
212- ((num + 1 ), final_cv .get_n_splits ()))
213-
214- try :
215- if sample_weight is None :
216- model .fit (X [train_index ], y [train_index ])
217- else :
218- model .fit (X [train_index ], y [train_index ],
219- sample_weight = sample_weight [train_index ])
220- except TypeError as e :
221-
222- if str (e ).startswith ('A sparse matrix was passed,'
223- ' but dense'
224- ' data is required' ):
225- sparse_estimator_message = (
226- "\n You are likely getting this error"
227- " because one of the"
228- " estimators"
229- " does not support sparse matrix input." )
230- else :
231- sparse_estimator_message = ''
232-
233- raise TypeError (str (e ) + sparse_estimator_message +
234- '\n Please check that X and y'
235- 'are NumPy arrays. If X and y are lists'
236- ' of lists,\n try passing them as'
237- ' numpy.array(X)'
238- ' and numpy.array(y).' )
239- except KeyError as e :
240-
241- raise KeyError (str (e ) + '\n Please check that X and y'
242- ' are NumPy arrays. If X and y are pandas'
243- ' DataFrames,\n try passing them as'
244- ' X.values'
245- ' and y.values.' )
220+ ((num + 1 ), n_folds ))
246221
247- if not self .use_probas :
248- prediction = model .predict (X [test_index ])
249- prediction = prediction .reshape (prediction .shape [0 ], 1 )
222+ if sample_weight is None :
223+ model .fit (X_train , y_train )
250224 else :
251- prediction = model .predict_proba (X [test_index ])
252- single_model_prediction = np .vstack ([single_model_prediction .
253- astype (prediction .dtype ),
254- prediction ])
225+ w = safe_indexing (sample_weight , train_indices )
226+ model .fit (X_train , y_train , sample_weight = w )
255227
256- all_model_predictions = np .hstack ([all_model_predictions .
257- astype (single_model_prediction .
258- dtype ),
259- single_model_prediction ])
228+ X_test = safe_indexing (X , test_indices )
229+ if not self .use_probas :
230+ prediction = model .predict (X_test )[:, np .newaxis ]
231+ else :
232+ prediction = model .predict_proba (X_test )
233+
234+ if meta_features is None :
235+ # First run, use prediction to get the number of classes
236+ n_classes = prediction .shape [1 ]
237+ meta_features_shape = (X .shape [0 ], n_classes * n_models )
238+ meta_features = np .empty (shape = meta_features_shape )
239+ meta_features [np .array (test_indices )[:, np .newaxis ],
240+ np .arange (n_classes )] = prediction
241+ else :
242+ row_idx = np .array (test_indices )[:, np .newaxis ]
243+ col_idx = np .arange (n_classes ) + n * n_classes
244+ meta_features [row_idx , col_idx ] = prediction
260245
261246 if self .store_train_meta_features :
262- # Store the meta features in the order of the
263- # original X,y arrays
264- reodered_indices = np .array ([]).astype (y .dtype )
265- for train_index , test_index in skf :
266- reodered_indices = np .concatenate ((reodered_indices ,
267- test_index ))
268- self .train_meta_features_ = all_model_predictions [np .argsort (
269- reodered_indices )]
270-
271- # We have to shuffle the labels in the same order as we generated
272- # predictions during CV (we kinda shuffled them when we did
273- # Stratified CV).
274- # We also do the same with the features (we will need this only IF
275- # use_features_in_secondary is True)
276- reordered_labels = np .array ([]).astype (y .dtype )
277- reordered_features = np .array ([]).reshape ((0 , X .shape [1 ]))\
278- .astype (X .dtype )
279- for train_index , test_index in skf :
280- reordered_labels = np .concatenate ((reordered_labels ,
281- y [test_index ]))
282-
283- if sparse .issparse (X ):
284- reordered_features = sparse .vstack ((reordered_features ,
285- X [test_index ]))
286- else :
287- reordered_features = np .concatenate ((reordered_features ,
288- X [test_index ]))
247+ self .train_meta_features_ = meta_features
289248
290249 # Fit the base models correctly this time using ALL the training set
291250 for model in self .clfs_ :
@@ -295,18 +254,16 @@ def fit(self, X, y, groups=None, sample_weight=None):
295254 model .fit (X , y , sample_weight = sample_weight )
296255
297256 # Fit the secondary model
298- if not self .use_features_in_secondary :
299- meta_features = all_model_predictions
300- elif sparse .issparse (X ):
301- meta_features = sparse .hstack ((reordered_features ,
302- all_model_predictions ))
303- else :
304- meta_features = np .hstack ((reordered_features ,
305- all_model_predictions ))
257+ if self .use_features_in_secondary :
258+ meta_features = self ._stack_first_level_features (
259+ X ,
260+ meta_features
261+ )
262+
306263 if sample_weight is None :
307- self .meta_clf_ .fit (meta_features , reordered_labels )
264+ self .meta_clf_ .fit (meta_features , y )
308265 else :
309- self .meta_clf_ .fit (meta_features , reordered_labels ,
266+ self .meta_clf_ .fit (meta_features , y ,
310267 sample_weight = sample_weight )
311268
312269 return self
@@ -347,20 +304,35 @@ def predict_meta_features(self, X):
347304 Returns the meta-features for test data.
348305
349306 """
350- check_is_fitted (self , 'clfs_' )
351- all_model_predictions = np .array ([]).reshape (len (X ), 0 )
307+ check_is_fitted (self , ['clfs_' , 'meta_clf_' ])
308+
309+ per_model_preds = []
310+
352311 for model in self .clfs_ :
353312 if not self .use_probas :
354- single_model_prediction = model .predict (X )
355- single_model_prediction = single_model_prediction \
356- .reshape (single_model_prediction .shape [0 ], 1 )
313+ prediction = model .predict (X )[:, np .newaxis ]
357314 else :
358- single_model_prediction = model .predict_proba (X )
359- all_model_predictions = np .hstack ((all_model_predictions .
360- astype (single_model_prediction
361- .dtype ),
362- single_model_prediction ))
363- return all_model_predictions
315+ prediction = model .predict_proba (X )
316+
317+ per_model_preds .append (prediction )
318+
319+ return np .hstack (per_model_preds )
320+
321+ def _stack_first_level_features (self , X , meta_features ):
322+ if sparse .issparse (X ):
323+ stack_fn = sparse .hstack
324+ else :
325+ stack_fn = np .hstack
326+
327+ return stack_fn ((X , meta_features ))
328+
329+ def _do_predict (self , X , predict_fn ):
330+ meta_features = self .predict_meta_features (X )
331+
332+ if self .use_features_in_secondary :
333+ meta_features = self ._stack_first_level_features (X , meta_features )
334+
335+ return predict_fn (meta_features )
364336
365337 def predict (self , X ):
366338 """ Predict target values for X.
@@ -377,16 +349,9 @@ def predict(self, X):
377349 Predicted class labels.
378350
379351 """
380- check_is_fitted (self , 'clfs_' )
381- all_model_predictions = self .predict_meta_features (X )
382- if not self .use_features_in_secondary :
383- return self .meta_clf_ .predict (all_model_predictions )
384- elif sparse .issparse (X ):
385- return self .meta_clf_ .predict (
386- sparse .hstack ((X , all_model_predictions )))
387- else :
388- return self .meta_clf_ .predict (
389- np .hstack ((X , all_model_predictions )))
352+ check_is_fitted (self , ['clfs_' , 'meta_clf_' ])
353+
354+ return self ._do_predict (X , self .meta_clf_ .predict )
390355
391356 def predict_proba (self , X ):
392357 """ Predict class probabilities for X.
@@ -403,24 +368,6 @@ def predict_proba(self, X):
403368 Probability for each class per sample.
404369
405370 """
406- check_is_fitted (self , 'clfs_' )
407- all_model_predictions = np .array ([]).reshape (len (X ), 0 )
408- for model in self .clfs_ :
409- if not self .use_probas :
410- single_model_prediction = model .predict (X )
411- single_model_prediction = single_model_prediction \
412- .reshape (single_model_prediction .shape [0 ], 1 )
413- else :
414- single_model_prediction = model .predict_proba (X )
415- all_model_predictions = np .hstack ((all_model_predictions .
416- astype (single_model_prediction .
417- dtype ),
418- single_model_prediction ))
419- if not self .use_features_in_secondary :
420- return self .meta_clf_ .predict_proba (all_model_predictions )
421- elif sparse .issparse (X ):
422- self .meta_clf_ \
423- .predict_proba (sparse .hstack ((X , all_model_predictions )))
424- else :
425- return self .meta_clf_ \
426- .predict_proba (np .hstack ((X , all_model_predictions )))
371+ check_is_fitted (self , ['clfs_' , 'meta_clf_' ])
372+
373+ return self ._do_predict (X , self .meta_clf_ .predict_proba )
0 commit comments