Skip to content

Commit 6b457a1

Browse files
rchaves33Ryan Chavesrasbt
authored
Implement drop proba col (#675)
* Replace drop_last_proba with drop_proba col (#590) * Revert "Replace drop_last_proba with drop_proba col (#590)" This reverts commit a19ed56. * Fix typo in unit test for drop_proba_col (#590) * fix rebase issue * add check for drop_col param Co-authored-by: Ryan Chaves <[email protected]> Co-authored-by: rasbt <[email protected]>
1 parent 8dc2d25 commit 6b457a1

File tree

6 files changed

+116
-41
lines changed

6 files changed

+116
-41
lines changed

docs/sources/user_guide/classifier/StackingCVClassifier.ipynb

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -700,7 +700,7 @@
700700
"text": [
701701
"## StackingCVClassifier\n",
702702
"\n",
703-
"*StackingCVClassifier(classifiers, meta_classifier, use_probas=False, drop_last_proba=False, cv=2, shuffle=True, random_state=None, stratify=True, verbose=0, use_features_in_secondary=False, store_train_meta_features=False, use_clones=True, n_jobs=None, pre_dispatch='2*n_jobs')*\n",
703+
"*StackingCVClassifier(classifiers, meta_classifier, use_probas=False, drop_proba_col=None, cv=2, shuffle=True, random_state=None, stratify=True, verbose=0, use_features_in_secondary=False, store_train_meta_features=False, use_clones=True, n_jobs=None, pre_dispatch='2*n_jobs')*\n",
704704
"\n",
705705
"A 'Stacking Cross-Validation' classifier for scikit-learn estimators.\n",
706706
"\n",
@@ -725,13 +725,16 @@
725725
" If True, trains meta-classifier based on predicted probabilities\n",
726726
" instead of class labels.\n",
727727
"\n",
728-
"- `drop_last_proba` : bool (default: False)\n",
728+
"- `drop_proba_col` : string (default: None)\n",
729729
"\n",
730-
" Drops the last \"probability\" column in the feature set since if `True`,\n",
731-
" because it is redundant:\n",
730+
"    Drops one \"probability\" column from the feature set, because it is\n",
731+
"    redundant:\n",
732732
"    p(y_c) = 1 - (p(y_1) + p(y_2) + ... + p(y_{c-1})).\n",
733-
" This can be useful for meta-classifiers that are sensitive to\n",
734-
" perfectly collinear features. Only relevant if `use_probas=True.\n",
733+
" This can be useful for meta-classifiers that are sensitive to perfectly\n",
734+
" collinear features.\n",
735+
" If `last`, drops last probability column.\n",
736+
" If `first`, drops first probability column.\n",
737+
" Only relevant if `use_probas=True`.\n",
735738
"\n",
736739
"- `cv` : int, cross-validation generator or an iterable, optional (default: 2)\n",
737740
"\n",
@@ -1094,4 +1097,4 @@
10941097
},
10951098
"nbformat": 4,
10961099
"nbformat_minor": 4
1097-
}
1100+
}

docs/sources/user_guide/classifier/StackingClassifier.ipynb

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,7 @@
696696
"text": [
697697
"## StackingClassifier\n",
698698
"\n",
699-
"*StackingClassifier(classifiers, meta_classifier, use_probas=False, drop_last_proba=False, average_probas=False, verbose=0, use_features_in_secondary=False, store_train_meta_features=False, use_clones=True)*\n",
699+
"*StackingClassifier(classifiers, meta_classifier, use_probas=False, drop_proba_col=None, average_probas=False, verbose=0, use_features_in_secondary=False, store_train_meta_features=False, use_clones=True)*\n",
700700
"\n",
701701
"A Stacking classifier for scikit-learn estimators for classification.\n",
702702
"\n",
@@ -720,13 +720,16 @@
720720
" If True, trains meta-classifier based on predicted probabilities\n",
721721
" instead of class labels.\n",
722722
"\n",
723-
"- `drop_last_proba` : bool (default: False)\n",
723+
"- `drop_proba_col` : string (default: None)\n",
724724
"\n",
725-
" Drops the last \"probability\" column in the feature set since if `True`,\n",
726-
" because it is redundant:\n",
725+
"    Drops one \"probability\" column from the feature set, because it is\n",
726+
"    redundant:\n",
727727
"    p(y_c) = 1 - (p(y_1) + p(y_2) + ... + p(y_{c-1})).\n",
728-
" This can be useful for meta-classifiers that are sensitive to\n",
729-
" perfectly collinear features. Only relevant if `use_probas=True`.\n",
728+
" This can be useful for meta-classifiers that are sensitive to perfectly\n",
729+
" collinear features.\n",
730+
" If `last`, drops last probability column.\n",
731+
" If `first`, drops first probability column.\n",
732+
" Only relevant if `use_probas=True`.\n",
730733
"\n",
731734
"- `average_probas` : bool (default: False)\n",
732735
"\n",
@@ -1032,4 +1035,4 @@
10321035
},
10331036
"nbformat": 4,
10341037
"nbformat_minor": 4
1035-
}
1038+
}

mlxtend/classifier/stacking_classification.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,15 @@ class StackingClassifier(_BaseXComposition, _BaseStackingClassifier,
3737
use_probas : bool (default: False)
3838
If True, trains meta-classifier based on predicted probabilities
3939
instead of class labels.
40-
drop_last_proba : bool (default: False)
41-
Drops the last "probability" column in the feature set since if `True`,
42-
because it is redundant:
40+
drop_proba_col : string (default: None)
41+
Drops one "probability" column from the feature set, because it is
42+
redundant:
4343
p(y_c) = 1 - (p(y_1) + p(y_2) + ... + p(y_{c-1})).
44-
This can be useful for meta-classifiers that are sensitive to
45-
perfectly collinear features. Only relevant if `use_probas=True`.
44+
This can be useful for meta-classifiers that are sensitive to perfectly
45+
collinear features.
46+
If 'last', drops last probability column.
47+
If 'first', drops first probability column.
48+
Only relevant if `use_probas=True`.
4649
average_probas : bool (default: False)
4750
Averages the probabilities as meta features if `True`.
4851
Only relevant if `use_probas=True`.
@@ -93,7 +96,7 @@ class StackingClassifier(_BaseXComposition, _BaseStackingClassifier,
9396
"""
9497

9598
def __init__(self, classifiers, meta_classifier,
96-
use_probas=False, drop_last_proba=False,
99+
use_probas=False, drop_proba_col=None,
97100
average_probas=False, verbose=0,
98101
use_features_in_secondary=False,
99102
store_train_meta_features=False,
@@ -102,7 +105,13 @@ def __init__(self, classifiers, meta_classifier,
102105
self.classifiers = classifiers
103106
self.meta_classifier = meta_classifier
104107
self.use_probas = use_probas
105-
self.drop_last_proba = drop_last_proba
108+
109+
allowed = {None, 'first', 'last'}
110+
if drop_proba_col not in allowed:
111+
raise ValueError('`drop_proba_col` must be in %s. Got %s'
112+
% (allowed, drop_proba_col))
113+
self.drop_proba_col = drop_proba_col
114+
106115
self.average_probas = average_probas
107116
self.verbose = verbose
108117
self.use_features_in_secondary = use_features_in_secondary
@@ -214,9 +223,12 @@ def predict_meta_features(self, X):
214223
"""
215224
check_is_fitted(self, 'clfs_')
216225
if self.use_probas:
217-
if self.drop_last_proba:
226+
if self.drop_proba_col == 'last':
218227
probas = np.asarray([clf.predict_proba(X)[:, :-1]
219228
for clf in self.clfs_])
229+
elif self.drop_proba_col == 'first':
230+
probas = np.asarray([clf.predict_proba(X)[:, 1:]
231+
for clf in self.clfs_])
220232
else:
221233
probas = np.asarray([clf.predict_proba(X)
222234
for clf in self.clfs_])

mlxtend/classifier/stacking_cv_classification.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,15 @@ class StackingCVClassifier(_BaseXComposition, _BaseStackingClassifier,
4242
use_probas : bool (default: False)
4343
If True, trains meta-classifier based on predicted probabilities
4444
instead of class labels.
45-
drop_last_proba : bool (default: False)
46-
Drops the last "probability" column in the feature set since if `True`,
47-
because it is redundant:
45+
drop_proba_col : string (default: None)
46+
Drops one "probability" column from the feature set, because it is
47+
redundant:
4848
p(y_c) = 1 - (p(y_1) + p(y_2) + ... + p(y_{c-1})).
49-
This can be useful for meta-classifiers that are sensitive to
50-
perfectly collinear features. Only relevant if `use_probas=True.
49+
This can be useful for meta-classifiers that are sensitive to perfectly
50+
collinear features.
51+
If 'last', drops last probability column.
52+
If 'first', drops first probability column.
53+
Only relevant if `use_probas=True`.
5154
cv : int, cross-validation generator or an iterable, optional (default: 2)
5255
Determines the cross-validation splitting strategy.
5356
Possible inputs for cv are:
@@ -137,7 +140,7 @@ class StackingCVClassifier(_BaseXComposition, _BaseStackingClassifier,
137140
138141
"""
139142
def __init__(self, classifiers, meta_classifier,
140-
use_probas=False, drop_last_proba=False,
143+
use_probas=False, drop_proba_col=None,
141144
cv=2, shuffle=True,
142145
random_state=None, stratify=True, verbose=0,
143146
use_features_in_secondary=False,
@@ -148,7 +151,13 @@ def __init__(self, classifiers, meta_classifier,
148151
self.classifiers = classifiers
149152
self.meta_classifier = meta_classifier
150153
self.use_probas = use_probas
151-
self.drop_last_proba = drop_last_proba
154+
155+
allowed = {None, 'first', 'last'}
156+
if drop_proba_col not in allowed:
157+
raise ValueError('`drop_proba_col` must be in %s. Got %s'
158+
% (allowed, drop_proba_col))
159+
160+
self.drop_proba_col = drop_proba_col
152161
self.cv = cv
153162
self.shuffle = shuffle
154163
self.random_state = random_state
@@ -243,8 +252,10 @@ def fit(self, X, y, groups=None, sample_weight=None):
243252

244253
if not self.use_probas:
245254
prediction = prediction[:, np.newaxis]
246-
elif self.drop_last_proba:
255+
elif self.drop_proba_col == 'last':
247256
prediction = prediction[:, :-1]
257+
elif self.drop_proba_col == 'first':
258+
prediction = prediction[:, 1:]
248259

249260
if meta_features is None:
250261
meta_features = prediction
@@ -315,8 +326,10 @@ def predict_meta_features(self, X):
315326
if not self.use_probas:
316327
prediction = model.predict(X)[:, np.newaxis]
317328
else:
318-
if self.drop_last_proba:
329+
if self.drop_proba_col == 'last':
319330
prediction = model.predict_proba(X)[:, :-1]
331+
elif self.drop_proba_col == 'first':
332+
prediction = model.predict_proba(X)[:, 1:]
320333
else:
321334
prediction = model.predict_proba(X)
322335

mlxtend/classifier/tests/test_stacking_classifier.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -198,13 +198,13 @@ def test_StackingClassifier_avg_vs_concat():
198198
np.array_equal(r2[0][:3], r2[0][3:])
199199

200200

201-
def test_StackingClassifier_drop_last_proba():
201+
def test_StackingClassifier_drop_proba_col():
202202
np.random.seed(123)
203203
lr1 = LogisticRegression(solver='liblinear',
204204
multi_class='ovr')
205205
sclf1 = StackingClassifier(classifiers=[lr1, lr1],
206206
use_probas=True,
207-
drop_last_proba=False,
207+
drop_proba_col=None,
208208
meta_classifier=lr1)
209209

210210
sclf1.fit(X, y)
@@ -213,16 +213,25 @@ def test_StackingClassifier_drop_last_proba():
213213

214214
sclf2 = StackingClassifier(classifiers=[lr1, lr1],
215215
use_probas=True,
216-
drop_last_proba=True,
216+
drop_proba_col='last',
217217
meta_classifier=lr1)
218218

219219
sclf2.fit(X, y)
220220
r2 = sclf2.predict_meta_features(X[:2])
221221
assert r2.shape == (2, 4), r2.shape
222222

223+
sclf4 = StackingClassifier(classifiers=[lr1, lr1],
224+
use_probas=True,
225+
drop_proba_col='first',
226+
meta_classifier=lr1)
227+
228+
sclf4.fit(X, y)
229+
r4 = sclf4.predict_meta_features(X[:2])
230+
assert r4.shape == (2, 4), r4.shape
231+
223232
sclf3 = StackingClassifier(classifiers=[lr1, lr1],
224233
use_probas=True,
225-
drop_last_proba=True,
234+
drop_proba_col='last',
226235
meta_classifier=lr1)
227236

228237
sclf3.fit(X[0:100], y[0:100]) # only 2 classes
@@ -440,7 +449,7 @@ def test_get_params():
440449
got = sorted(list({s.split('__')[0] for s in sclf.get_params().keys()}))
441450
expect = ['average_probas',
442451
'classifiers',
443-
'drop_last_proba',
452+
'drop_proba_col',
444453
'gaussiannb',
445454
'kneighborsclassifier',
446455
'meta_classifier',
@@ -564,3 +573,16 @@ def test_decision_function():
564573
assert scores_mean == 0.95, scores_mean
565574
else:
566575
assert scores_mean == 0.94, scores_mean
576+
577+
578+
def test_drop_col_unsupported():
579+
np.random.seed(123)
580+
meta = LogisticRegression()
581+
clf1 = RandomForestClassifier(n_estimators=10)
582+
clf2 = GaussianNB()
583+
clf3 = KNeighborsClassifier()
584+
585+
with pytest.raises(ValueError):
586+
StackingClassifier(classifiers=[clf1, clf2, clf3],
587+
meta_classifier=meta,
588+
drop_proba_col='invalid value')

mlxtend/classifier/tests/test_stacking_cv_classifier.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ def test_get_params():
344344

345345
expect = ['classifiers',
346346
'cv',
347-
'drop_last_proba',
347+
'drop_proba_col',
348348
'gaussiannb',
349349
'kneighborsclassifier',
350350
'meta_classifier',
@@ -502,13 +502,13 @@ def test_sparse_inputs_with_features_in_secondary():
502502
round(stclf.score(X_train, y_train), 2)
503503

504504

505-
def test_StackingClassifier_drop_last_proba():
505+
def test_StackingClassifier_drop_proba_col():
506506
np.random.seed(123)
507507
lr1 = LogisticRegression(solver='liblinear',
508508
multi_class='ovr')
509509
sclf1 = StackingCVClassifier(classifiers=[lr1, lr1],
510510
use_probas=True,
511-
drop_last_proba=False,
511+
drop_proba_col=None,
512512
meta_classifier=lr1)
513513

514514
sclf1.fit(X_iris, y_iris)
@@ -517,16 +517,25 @@ def test_StackingClassifier_drop_last_proba():
517517

518518
sclf2 = StackingCVClassifier(classifiers=[lr1, lr1],
519519
use_probas=True,
520-
drop_last_proba=True,
520+
drop_proba_col='last',
521521
meta_classifier=lr1)
522522

523523
sclf2.fit(X_iris, y_iris)
524524
r2 = sclf2.predict_meta_features(X_iris[:2])
525525
assert r2.shape == (2, 4), r2.shape
526526

527+
sclf4 = StackingCVClassifier(classifiers=[lr1, lr1],
528+
use_probas=True,
529+
drop_proba_col='first',
530+
meta_classifier=lr1)
531+
532+
sclf4.fit(X_iris, y_iris)
533+
r4 = sclf4.predict_meta_features(X_iris[:2])
534+
assert r4.shape == (2, 4), r4.shape
535+
527536
sclf3 = StackingCVClassifier(classifiers=[lr1, lr1],
528537
use_probas=True,
529-
drop_last_proba=True,
538+
drop_proba_col='last',
530539
meta_classifier=lr1)
531540

532541
sclf3.fit(X_iris[0:100], y_iris[0:100]) # only 2 classes
@@ -618,3 +627,16 @@ def test_decision_function():
618627
assert scores_mean == 0.96, scores_mean
619628
else:
620629
assert scores_mean == 0.90, scores_mean
630+
631+
632+
def test_drop_col_unsupported():
633+
np.random.seed(123)
634+
meta = LogisticRegression()
635+
clf1 = RandomForestClassifier(n_estimators=10)
636+
clf2 = GaussianNB()
637+
clf3 = KNeighborsClassifier()
638+
639+
with pytest.raises(ValueError):
640+
StackingCVClassifier(classifiers=[clf1, clf2, clf3],
641+
meta_classifier=meta,
642+
drop_proba_col='invalid value')

0 commit comments

Comments
 (0)