Skip to content

Commit a3a539e

Browse files
ackerleytng authored and rasbt committed
Use of safe_indexing in StackingCVClassifier (#513)
* Refactor StackingCVClassifier and use safe_indexing
* Fix combining of probabilities by inserting 0 probability columns
* Refactor prediction functions
* Fix after running test cases
* Add regression test
* Write fit to avoid use of hstack or vstack. Also remove need to reorder labels when building meta features.
1 parent 6485619 commit a3a539e

File tree

2 files changed

+119
-167
lines changed

2 files changed

+119
-167
lines changed

mlxtend/classifier/stacking_cv_classification.py

Lines changed: 79 additions & 132 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
from sklearn.base import clone
2020
from sklearn.externals import six
2121
from sklearn.model_selection._split import check_cv
22+
from sklearn.utils import safe_indexing
2223

2324

2425
class StackingCVClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
@@ -182,10 +183,19 @@ def fit(self, X, y, groups=None, sample_weight=None):
182183
# Override shuffle parameter in case of self generated
183184
# cross-validation strategy
184185
final_cv.shuffle = self.shuffle
185-
skf = list(final_cv.split(X, y, groups))
186186

187-
all_model_predictions = np.array([]).reshape(len(y), 0)
188-
for model in self.clfs_:
187+
folds = list(final_cv.split(X, y, groups))
188+
189+
# Handle the case of X being a list of lists
190+
# by converting X into a numpy array
191+
if isinstance(X, list):
192+
X = np.array(X)
193+
194+
meta_features = None
195+
n_folds = final_cv.get_n_splits()
196+
n_models = len(self.clfs_)
197+
198+
for n, model in enumerate(self.clfs_):
189199

190200
if self.verbose > 0:
191201
i = self.clfs_.index(model) + 1
@@ -200,92 +210,41 @@ def fit(self, X, y, groups=None, sample_weight=None):
200210
if self.verbose > 1:
201211
print(_name_estimators((model,))[0][1])
202212

203-
if not self.use_probas:
204-
single_model_prediction = np.array([]).reshape(0, 1)
205-
else:
206-
single_model_prediction = np.array([]).reshape(0, len(set(y)))
213+
for num, (train_indices, test_indices) in enumerate(folds):
207214

208-
for num, (train_index, test_index) in enumerate(skf):
215+
X_train = safe_indexing(X, train_indices)
216+
y_train = safe_indexing(y, train_indices)
209217

210218
if self.verbose > 0:
211219
print("Training and fitting fold %d of %d..." %
212-
((num + 1), final_cv.get_n_splits()))
213-
214-
try:
215-
if sample_weight is None:
216-
model.fit(X[train_index], y[train_index])
217-
else:
218-
model.fit(X[train_index], y[train_index],
219-
sample_weight=sample_weight[train_index])
220-
except TypeError as e:
221-
222-
if str(e).startswith('A sparse matrix was passed,'
223-
' but dense'
224-
' data is required'):
225-
sparse_estimator_message = (
226-
"\nYou are likely getting this error"
227-
" because one of the"
228-
" estimators"
229-
" does not support sparse matrix input.")
230-
else:
231-
sparse_estimator_message = ''
232-
233-
raise TypeError(str(e) + sparse_estimator_message +
234-
'\nPlease check that X and y'
235-
'are NumPy arrays. If X and y are lists'
236-
' of lists,\ntry passing them as'
237-
' numpy.array(X)'
238-
' and numpy.array(y).')
239-
except KeyError as e:
240-
241-
raise KeyError(str(e) + '\nPlease check that X and y'
242-
' are NumPy arrays. If X and y are pandas'
243-
' DataFrames,\ntry passing them as'
244-
' X.values'
245-
' and y.values.')
220+
((num + 1), n_folds))
246221

247-
if not self.use_probas:
248-
prediction = model.predict(X[test_index])
249-
prediction = prediction.reshape(prediction.shape[0], 1)
222+
if sample_weight is None:
223+
model.fit(X_train, y_train)
250224
else:
251-
prediction = model.predict_proba(X[test_index])
252-
single_model_prediction = np.vstack([single_model_prediction.
253-
astype(prediction.dtype),
254-
prediction])
225+
w = safe_indexing(sample_weight, train_indices)
226+
model.fit(X_train, y_train, sample_weight=w)
255227

256-
all_model_predictions = np.hstack([all_model_predictions.
257-
astype(single_model_prediction.
258-
dtype),
259-
single_model_prediction])
228+
X_test = safe_indexing(X, test_indices)
229+
if not self.use_probas:
230+
prediction = model.predict(X_test)[:, np.newaxis]
231+
else:
232+
prediction = model.predict_proba(X_test)
233+
234+
if meta_features is None:
235+
# First run, use prediction to get the number of classes
236+
n_classes = prediction.shape[1]
237+
meta_features_shape = (X.shape[0], n_classes * n_models)
238+
meta_features = np.empty(shape=meta_features_shape)
239+
meta_features[np.array(test_indices)[:, np.newaxis],
240+
np.arange(n_classes)] = prediction
241+
else:
242+
row_idx = np.array(test_indices)[:, np.newaxis]
243+
col_idx = np.arange(n_classes) + n * n_classes
244+
meta_features[row_idx, col_idx] = prediction
260245

261246
if self.store_train_meta_features:
262-
# Store the meta features in the order of the
263-
# original X,y arrays
264-
reodered_indices = np.array([]).astype(y.dtype)
265-
for train_index, test_index in skf:
266-
reodered_indices = np.concatenate((reodered_indices,
267-
test_index))
268-
self.train_meta_features_ = all_model_predictions[np.argsort(
269-
reodered_indices)]
270-
271-
# We have to shuffle the labels in the same order as we generated
272-
# predictions during CV (we kinda shuffled them when we did
273-
# Stratified CV).
274-
# We also do the same with the features (we will need this only IF
275-
# use_features_in_secondary is True)
276-
reordered_labels = np.array([]).astype(y.dtype)
277-
reordered_features = np.array([]).reshape((0, X.shape[1]))\
278-
.astype(X.dtype)
279-
for train_index, test_index in skf:
280-
reordered_labels = np.concatenate((reordered_labels,
281-
y[test_index]))
282-
283-
if sparse.issparse(X):
284-
reordered_features = sparse.vstack((reordered_features,
285-
X[test_index]))
286-
else:
287-
reordered_features = np.concatenate((reordered_features,
288-
X[test_index]))
247+
self.train_meta_features_ = meta_features
289248

290249
# Fit the base models correctly this time using ALL the training set
291250
for model in self.clfs_:
@@ -295,18 +254,16 @@ def fit(self, X, y, groups=None, sample_weight=None):
295254
model.fit(X, y, sample_weight=sample_weight)
296255

297256
# Fit the secondary model
298-
if not self.use_features_in_secondary:
299-
meta_features = all_model_predictions
300-
elif sparse.issparse(X):
301-
meta_features = sparse.hstack((reordered_features,
302-
all_model_predictions))
303-
else:
304-
meta_features = np.hstack((reordered_features,
305-
all_model_predictions))
257+
if self.use_features_in_secondary:
258+
meta_features = self._stack_first_level_features(
259+
X,
260+
meta_features
261+
)
262+
306263
if sample_weight is None:
307-
self.meta_clf_.fit(meta_features, reordered_labels)
264+
self.meta_clf_.fit(meta_features, y)
308265
else:
309-
self.meta_clf_.fit(meta_features, reordered_labels,
266+
self.meta_clf_.fit(meta_features, y,
310267
sample_weight=sample_weight)
311268

312269
return self
@@ -347,20 +304,35 @@ def predict_meta_features(self, X):
347304
Returns the meta-features for test data.
348305
349306
"""
350-
check_is_fitted(self, 'clfs_')
351-
all_model_predictions = np.array([]).reshape(len(X), 0)
307+
check_is_fitted(self, ['clfs_', 'meta_clf_'])
308+
309+
per_model_preds = []
310+
352311
for model in self.clfs_:
353312
if not self.use_probas:
354-
single_model_prediction = model.predict(X)
355-
single_model_prediction = single_model_prediction\
356-
.reshape(single_model_prediction.shape[0], 1)
313+
prediction = model.predict(X)[:, np.newaxis]
357314
else:
358-
single_model_prediction = model.predict_proba(X)
359-
all_model_predictions = np.hstack((all_model_predictions.
360-
astype(single_model_prediction
361-
.dtype),
362-
single_model_prediction))
363-
return all_model_predictions
315+
prediction = model.predict_proba(X)
316+
317+
per_model_preds.append(prediction)
318+
319+
return np.hstack(per_model_preds)
320+
321+
def _stack_first_level_features(self, X, meta_features):
322+
if sparse.issparse(X):
323+
stack_fn = sparse.hstack
324+
else:
325+
stack_fn = np.hstack
326+
327+
return stack_fn((X, meta_features))
328+
329+
def _do_predict(self, X, predict_fn):
330+
meta_features = self.predict_meta_features(X)
331+
332+
if self.use_features_in_secondary:
333+
meta_features = self._stack_first_level_features(X, meta_features)
334+
335+
return predict_fn(meta_features)
364336

365337
def predict(self, X):
366338
""" Predict target values for X.
@@ -377,16 +349,9 @@ def predict(self, X):
377349
Predicted class labels.
378350
379351
"""
380-
check_is_fitted(self, 'clfs_')
381-
all_model_predictions = self.predict_meta_features(X)
382-
if not self.use_features_in_secondary:
383-
return self.meta_clf_.predict(all_model_predictions)
384-
elif sparse.issparse(X):
385-
return self.meta_clf_.predict(
386-
sparse.hstack((X, all_model_predictions)))
387-
else:
388-
return self.meta_clf_.predict(
389-
np.hstack((X, all_model_predictions)))
352+
check_is_fitted(self, ['clfs_', 'meta_clf_'])
353+
354+
return self._do_predict(X, self.meta_clf_.predict)
390355

391356
def predict_proba(self, X):
392357
""" Predict class probabilities for X.
@@ -403,24 +368,6 @@ def predict_proba(self, X):
403368
Probability for each class per sample.
404369
405370
"""
406-
check_is_fitted(self, 'clfs_')
407-
all_model_predictions = np.array([]).reshape(len(X), 0)
408-
for model in self.clfs_:
409-
if not self.use_probas:
410-
single_model_prediction = model.predict(X)
411-
single_model_prediction = single_model_prediction\
412-
.reshape(single_model_prediction.shape[0], 1)
413-
else:
414-
single_model_prediction = model.predict_proba(X)
415-
all_model_predictions = np.hstack((all_model_predictions.
416-
astype(single_model_prediction.
417-
dtype),
418-
single_model_prediction))
419-
if not self.use_features_in_secondary:
420-
return self.meta_clf_.predict_proba(all_model_predictions)
421-
elif sparse.issparse(X):
422-
self.meta_clf_\
423-
.predict_proba(sparse.hstack((X, all_model_predictions)))
424-
else:
425-
return self.meta_clf_\
426-
.predict_proba(np.hstack((X, all_model_predictions)))
371+
check_is_fitted(self, ['clfs_', 'meta_clf_'])
372+
373+
return self._do_predict(X, self.meta_clf_.predict_proba)

mlxtend/classifier/tests/test_stacking_cv_classifier.py

Lines changed: 40 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -329,39 +329,6 @@ def test_verbose():
329329
sclf.fit(X_iris, y_iris)
330330

331331

332-
def test_list_of_lists():
333-
X_list = [i for i in X_iris]
334-
meta = LogisticRegression(multi_class='ovr', solver='liblinear')
335-
clf1 = RandomForestClassifier(n_estimators=10)
336-
clf2 = GaussianNB()
337-
sclf = StackingCVClassifier(classifiers=[clf1, clf2],
338-
use_probas=True,
339-
meta_classifier=meta,
340-
shuffle=False,
341-
verbose=0)
342-
343-
try:
344-
sclf.fit(X_list, y_iris)
345-
except TypeError as e:
346-
assert 'are NumPy arrays. If X and y are lists' in str(e)
347-
348-
349-
def test_pandas():
350-
X_df = pd.DataFrame(X_iris)
351-
meta = LogisticRegression(multi_class='ovr', solver='liblinear')
352-
clf1 = RandomForestClassifier(n_estimators=10)
353-
clf2 = GaussianNB()
354-
sclf = StackingCVClassifier(classifiers=[clf1, clf2],
355-
use_probas=True,
356-
meta_classifier=meta,
357-
shuffle=False,
358-
verbose=0)
359-
try:
360-
sclf.fit(X_df, y_iris)
361-
except KeyError as e:
362-
assert 'are NumPy arrays. If X and y are pandas DataFrames' in str(e)
363-
364-
365332
def test_get_params():
366333
clf1 = KNeighborsClassifier(n_neighbors=1)
367334
clf2 = RandomForestClassifier(random_state=1)
@@ -493,8 +460,8 @@ def test_sparse_inputs_with_features_in_secondary():
493460
stclf = StackingCVClassifier(classifiers=[rf, rf],
494461
meta_classifier=lr,
495462
use_features_in_secondary=True)
496-
X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast,
497-
test_size=0.3)
463+
X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast,
464+
test_size=0.3)
498465

499466
# dense
500467
stclf.fit(X_train, y_train)
@@ -505,3 +472,41 @@ def test_sparse_inputs_with_features_in_secondary():
505472
stclf.fit(sparse.csr_matrix(X_train), y_train)
506473
assert round(stclf.score(X_train, y_train), 2) == 0.99, \
507474
round(stclf.score(X_train, y_train), 2)
475+
476+
477+
def test_works_with_df_if_fold_indexes_missing():
478+
"""This is a regression test to make sure fitting will still work even if
479+
training data has ids that cannot be indexed using the indexes from the cv
480+
(e.g. skf)
481+
482+
Some possibilities:
483+
+ Output of the folds are not neatly consecutive (i.e. [341, 345, 543, ...]
484+
instead of [0, 1, ... n])
485+
+ Indexes just start from some number greater than the size of the input
486+
(see test case)
487+
488+
Training data sometimes has ids that carry other information, and selection
489+
of rows based on cv should not break.
490+
491+
This is fixed in the code using `safe_indexing`
492+
"""
493+
494+
np.random.seed(123)
495+
rf = RandomForestClassifier(n_estimators=10)
496+
lr = LogisticRegression(multi_class='ovr', solver='liblinear')
497+
stclf = StackingCVClassifier(classifiers=[rf, rf],
498+
meta_classifier=lr,
499+
use_features_in_secondary=True)
500+
501+
X_modded = pd.DataFrame(X_breast,
502+
index=np.arange(X_breast.shape[0]) + 1000)
503+
y_modded = pd.Series(y_breast,
504+
index=np.arange(y_breast.shape[0]) + 1000)
505+
506+
X_train, X_test, y_train, y_test = train_test_split(X_modded, y_modded,
507+
test_size=0.3)
508+
509+
# dense
510+
stclf.fit(X_train, y_train)
511+
assert round(stclf.score(X_train, y_train), 2) == 0.99, \
512+
round(stclf.score(X_train, y_train), 2)

0 commit comments

Comments
 (0)