diff --git a/docs/sources/user_guide/regressor/Eensemble_vote_regressor_files/EnsembleVotingRegressor.png b/docs/sources/user_guide/regressor/Eensemble_vote_regressor_files/EnsembleVotingRegressor.png
new file mode 100644
index 000000000..3d720155c
Binary files /dev/null and b/docs/sources/user_guide/regressor/Eensemble_vote_regressor_files/EnsembleVotingRegressor.png differ
diff --git a/docs/sources/user_guide/regressor/Eensemble_vote_regressor_files/StackingRegressor.png b/docs/sources/user_guide/regressor/Eensemble_vote_regressor_files/StackingRegressor.png
new file mode 100644
index 000000000..24db4fd66
Binary files /dev/null and b/docs/sources/user_guide/regressor/Eensemble_vote_regressor_files/StackingRegressor.png differ
diff --git a/docs/sources/user_guide/regressor/Eensemble_vote_regressor_files/VotingRegressor.png b/docs/sources/user_guide/regressor/Eensemble_vote_regressor_files/VotingRegressor.png
new file mode 100644
index 000000000..ffc4c5a61
Binary files /dev/null and b/docs/sources/user_guide/regressor/Eensemble_vote_regressor_files/VotingRegressor.png differ
diff --git a/docs/sources/user_guide/regressor/ensemble_vote_regressor.py b/docs/sources/user_guide/regressor/ensemble_vote_regressor.py
new file mode 100644
index 000000000..5ee2cebb4
--- /dev/null
+++ b/docs/sources/user_guide/regressor/ensemble_vote_regressor.py
@@ -0,0 +1,190 @@
+# Ensemble Voting Regressor
+
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.base import RegressorMixin
+from sklearn.base import TransformerMixin
+from sklearn.base import clone
+
+from ..externals import six
+from ..externals.estimator_checks import check_is_fitted
+from ..externals.name_estimators import _name_estimators
+
+
+class EnsembleVotingRegressor(BaseEstimator, RegressorMixin, TransformerMixin):
+    """Ensemble voting regressor for scikit-learn regression estimators.
+
+    The ensemble prediction is the (optionally weighted) average of the
+    predictions of the individual regressors.
+
+    Parameters
+    ----------
+    regressors : array-like, shape = [n_regressors]
+        A list of regressors.
+        Invoking the `fit` method on the `EnsembleVotingRegressor` will fit
+        clones of those original regressors (if `refit=True`) that will be
+        stored in the attribute `self.regr_`.
+    weights : array-like, shape = [n_regressors], optional (default=`None`)
+        Sequence of weights (`float` or `int`) applied to the predictions
+        of the individual regressors when averaging them. Uses uniform
+        weights if `None`.
+    verbose : int, optional (default=0)
+        Controls the verbosity of the building process.
+        - `verbose=0` (default): Prints nothing
+        - `verbose=1`: Prints the number & name of the regressor being fitted
+        - `verbose=2`: Prints info about the parameters of the
+          regressor being fitted
+        - `verbose>2`: Changes `verbose` param of the underlying regressor to
+          self.verbose - 2
+    refit : bool (default: True)
+        Clones the regressors before fitting if True (default)
+        or else uses the original ones, which will be refitted on the dataset
+        upon calling the `fit` method. Setting refit=False is
+        recommended if you are working with estimators that are supporting
+        the scikit-learn fit/predict API interface but are not compatible
+        to scikit-learn's `clone` function.
+
+    Attributes
+    ----------
+    regressors : array-like, shape = [n_regressors]
+        The unmodified input regressors
+    regr_ : list, shape=[n_regressors]
+        Fitted regressors (clones of the original regressors if
+        `refit=True`, otherwise the original objects)
+    named_clfs : dict
+        Maps the names produced by `_name_estimators` to the input
+        regressors; used by `get_params` to expose nested parameters.
+
+    """
+
+    def __init__(self, regressors, weights=None, verbose=0, refit=True):
+        self.regressors = regressors
+        self.weights = weights
+        self.verbose = verbose
+        self.refit = refit
+        # Name -> estimator lookup consumed by `get_params`.
+        self.named_clfs = dict(_name_estimators(regressors))
+
+    def fit(self, X, y, sample_weight=None):
+        """Fit each regressor of the ensemble on the training data.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            Training vectors, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape = [n_samples]
+            Target values.
+
+        sample_weight : array-like, shape = [n_samples], optional
+            Sample weights passed as `sample_weight` to each regressor
+            in the regressors list.
+            Raises error if some regressor does not support
+            sample_weight in the fit() method.
+
+        Returns
+        -------
+        self : object
+
+        Raises
+        ------
+        ValueError
+            If `weights` is given and its length differs from the number
+            of regressors.
+
+        """
+        # Compare against None rather than testing truthiness: a NumPy
+        # array of weights has no unambiguous truth value.
+        if (self.weights is not None and
+                len(self.weights) != len(self.regressors)):
+            raise ValueError('Number of regressors and weights must be equal'
+                             '; got %d weights, %d regressors'
+                             % (len(self.weights), len(self.regressors)))
+
+        if self.refit:
+            self.regr_ = [clone(reg) for reg in self.regressors]
+        else:
+            self.regr_ = list(self.regressors)
+
+        if self.verbose > 0:
+            print("Fitting %d regressors..." % (len(self.regressors)))
+
+        # enumerate instead of list.index: the same estimator object may be
+        # listed more than once when refit=False, and index() would always
+        # report the position of its first occurrence.
+        for i, reg in enumerate(self.regr_, 1):
+
+            if self.verbose > 0:
+                print("Fitting clf%d: %s (%d/%d)" %
+                      (i, _name_estimators((reg,))[0][0], i,
+                       len(self.regr_)))
+
+            if self.verbose > 2:
+                if hasattr(reg, 'verbose'):
+                    reg.set_params(verbose=self.verbose - 2)
+
+            if self.verbose > 1:
+                print(_name_estimators((reg,))[0][1])
+
+            if sample_weight is None:
+                reg.fit(X, y)
+            else:
+                reg.fit(X, y, sample_weight=sample_weight)
+        return self
+
+    def predict(self, X):
+        """Predict target values for X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            Samples to predict, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        Returns
+        -------
+        y_pred : array-like, shape = [n_samples]
+            Average of the predictions of the fitted regressors, weighted
+            by `weights` if given.
+
+        """
+        check_is_fitted(self, 'regr_')
+        return np.average(self._predict(X), axis=1, weights=self.weights)
+
+    def transform(self, X):
+        """Return the predictions of each fitted regressor for X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            Samples to predict, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        Returns
+        -------
+        predictions : array-like, shape = [n_samples, n_regressors]
+            Target values predicted by each regressor, one column per
+            regressor.
+
+        """
+        check_is_fitted(self, 'regr_')
+        return self._predict(X)
+
+    def get_params(self, deep=True):
+        """Return estimator parameter names for GridSearch support."""
+        if not deep:
+            return super(EnsembleVotingRegressor, self).get_params(deep=False)
+        else:
+            out = self.named_clfs.copy()
+            for name, step in six.iteritems(self.named_clfs):
+                for key, value in six.iteritems(step.get_params(deep=True)):
+                    out['%s__%s' % (name, key)] = value
+            for key, value in six.iteritems(super(EnsembleVotingRegressor,
+                                            self).get_params(deep=False)):
+                out['%s' % key] = value
+            return out
+
+    def _predict(self, X):
+        """Collect results from the regressors' predict calls."""
+        return np.asarray([reg.predict(X) for reg in self.regr_]).T
diff --git a/mlxtend/regressor/__init__.py b/mlxtend/regressor/__init__.py
index 8a8648d88..01d4867ea 100644
--- a/mlxtend/regressor/__init__.py
+++ b/mlxtend/regressor/__init__.py
@@ -7,5 +7,6 @@
 from .linear_regression import LinearRegression
 from .stacking_regression import StackingRegressor
 from .stacking_cv_regression import StackingCVRegressor
+from .ensemble_vote import EnsembleVotingRegressor
 
-__all__ = ["LinearRegression", "StackingRegressor", "StackingCVRegressor"]
+__all__ = ["LinearRegression", "StackingRegressor", "StackingCVRegressor", "EnsembleVotingRegressor"]
diff --git
a/mlxtend/regressor/ensemble_vote.py b/mlxtend/regressor/ensemble_vote.py
new file mode 100644
index 000000000..5ee2cebb4
--- /dev/null
+++ b/mlxtend/regressor/ensemble_vote.py
@@ -0,0 +1,190 @@
+# Ensemble Voting Regressor
+
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.base import RegressorMixin
+from sklearn.base import TransformerMixin
+from sklearn.base import clone
+
+from ..externals import six
+from ..externals.estimator_checks import check_is_fitted
+from ..externals.name_estimators import _name_estimators
+
+
+class EnsembleVotingRegressor(BaseEstimator, RegressorMixin, TransformerMixin):
+    """Ensemble voting regressor for scikit-learn regression estimators.
+
+    The ensemble prediction is the (optionally weighted) average of the
+    predictions of the individual regressors.
+
+    Parameters
+    ----------
+    regressors : array-like, shape = [n_regressors]
+        A list of regressors.
+        Invoking the `fit` method on the `EnsembleVotingRegressor` will fit
+        clones of those original regressors (if `refit=True`) that will be
+        stored in the attribute `self.regr_`.
+    weights : array-like, shape = [n_regressors], optional (default=`None`)
+        Sequence of weights (`float` or `int`) applied to the predictions
+        of the individual regressors when averaging them. Uses uniform
+        weights if `None`.
+    verbose : int, optional (default=0)
+        Controls the verbosity of the building process.
+        - `verbose=0` (default): Prints nothing
+        - `verbose=1`: Prints the number & name of the regressor being fitted
+        - `verbose=2`: Prints info about the parameters of the
+          regressor being fitted
+        - `verbose>2`: Changes `verbose` param of the underlying regressor to
+          self.verbose - 2
+    refit : bool (default: True)
+        Clones the regressors before fitting if True (default)
+        or else uses the original ones, which will be refitted on the dataset
+        upon calling the `fit` method. Setting refit=False is
+        recommended if you are working with estimators that are supporting
+        the scikit-learn fit/predict API interface but are not compatible
+        to scikit-learn's `clone` function.
+
+    Attributes
+    ----------
+    regressors : array-like, shape = [n_regressors]
+        The unmodified input regressors
+    regr_ : list, shape=[n_regressors]
+        Fitted regressors (clones of the original regressors if
+        `refit=True`, otherwise the original objects)
+    named_clfs : dict
+        Maps the names produced by `_name_estimators` to the input
+        regressors; used by `get_params` to expose nested parameters.
+
+    """
+
+    def __init__(self, regressors, weights=None, verbose=0, refit=True):
+        self.regressors = regressors
+        self.weights = weights
+        self.verbose = verbose
+        self.refit = refit
+        # Name -> estimator lookup consumed by `get_params`.
+        self.named_clfs = dict(_name_estimators(regressors))
+
+    def fit(self, X, y, sample_weight=None):
+        """Fit each regressor of the ensemble on the training data.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            Training vectors, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape = [n_samples]
+            Target values.
+
+        sample_weight : array-like, shape = [n_samples], optional
+            Sample weights passed as `sample_weight` to each regressor
+            in the regressors list.
+            Raises error if some regressor does not support
+            sample_weight in the fit() method.
+
+        Returns
+        -------
+        self : object
+
+        Raises
+        ------
+        ValueError
+            If `weights` is given and its length differs from the number
+            of regressors.
+
+        """
+        # Compare against None rather than testing truthiness: a NumPy
+        # array of weights has no unambiguous truth value.
+        if (self.weights is not None and
+                len(self.weights) != len(self.regressors)):
+            raise ValueError('Number of regressors and weights must be equal'
+                             '; got %d weights, %d regressors'
+                             % (len(self.weights), len(self.regressors)))
+
+        if self.refit:
+            self.regr_ = [clone(reg) for reg in self.regressors]
+        else:
+            self.regr_ = list(self.regressors)
+
+        if self.verbose > 0:
+            print("Fitting %d regressors..." % (len(self.regressors)))
+
+        # enumerate instead of list.index: the same estimator object may be
+        # listed more than once when refit=False, and index() would always
+        # report the position of its first occurrence.
+        for i, reg in enumerate(self.regr_, 1):
+
+            if self.verbose > 0:
+                print("Fitting clf%d: %s (%d/%d)" %
+                      (i, _name_estimators((reg,))[0][0], i,
+                       len(self.regr_)))
+
+            if self.verbose > 2:
+                if hasattr(reg, 'verbose'):
+                    reg.set_params(verbose=self.verbose - 2)
+
+            if self.verbose > 1:
+                print(_name_estimators((reg,))[0][1])
+
+            if sample_weight is None:
+                reg.fit(X, y)
+            else:
+                reg.fit(X, y, sample_weight=sample_weight)
+        return self
+
+    def predict(self, X):
+        """Predict target values for X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            Samples to predict, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        Returns
+        -------
+        y_pred : array-like, shape = [n_samples]
+            Average of the predictions of the fitted regressors, weighted
+            by `weights` if given.
+
+        """
+        check_is_fitted(self, 'regr_')
+        return np.average(self._predict(X), axis=1, weights=self.weights)
+
+    def transform(self, X):
+        """Return the predictions of each fitted regressor for X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            Samples to predict, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        Returns
+        -------
+        predictions : array-like, shape = [n_samples, n_regressors]
+            Target values predicted by each regressor, one column per
+            regressor.
+
+        """
+        check_is_fitted(self, 'regr_')
+        return self._predict(X)
+
+    def get_params(self, deep=True):
+        """Return estimator parameter names for GridSearch support."""
+        if not deep:
+            return super(EnsembleVotingRegressor, self).get_params(deep=False)
+        else:
+            out = self.named_clfs.copy()
+            for name, step in six.iteritems(self.named_clfs):
+                for key, value in six.iteritems(step.get_params(deep=True)):
+                    out['%s__%s' % (name, key)] = value
+            for key, value in six.iteritems(super(EnsembleVotingRegressor,
+                                            self).get_params(deep=False)):
+                out['%s' % key] = value
+            return out
+
+    def _predict(self, X):
+        """Collect results from the regressors' predict calls."""
+        return np.asarray([reg.predict(X) for reg in self.regr_]).T
diff --git a/mlxtend/regressor/tests/test_ensemble_vote_regressor.py b/mlxtend/regressor/tests/test_ensemble_vote_regressor.py
new file mode 100644
index 000000000..1d0cae97a
--- /dev/null
+++ b/mlxtend/regressor/tests/test_ensemble_vote_regressor.py
@@ -0,0 +1,217 @@
+# Sebastian Raschka 2014-2019
+# mlxtend Machine Learning Library Extensions
+# Author: Sebastian Raschka
+#
+# License: BSD 3 clause
+
+import random
+
+import numpy as np
+import pytest
+from sklearn.base import clone
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import GridSearchCV
+from sklearn.neighbors import KNeighborsRegressor
+
+from mlxtend.data import iris_data
+from mlxtend.regressor import EnsembleVotingRegressor
+
+X, y = iris_data()
+X = X[:, 1:3]
+
+
+def _make_regressors():
+    """Return three fresh regressors whose `fit` accepts `sample_weight`."""
+    return [GradientBoostingRegressor(random_state=1, n_estimators=10),
+            RandomForestRegressor(random_state=1, n_estimators=10),
+            LinearRegression()]
+
+
+def test_EnsembleVotingRegressor():
+    ereg = EnsembleVotingRegressor(regressors=_make_regressors())
+    pred = ereg.fit(X, y).predict(X)
+
+    # Without weights the ensemble is the plain mean of its members.
+    expect = np.mean([reg.predict(X) for reg in ereg.regr_], axis=0)
+    assert pred.shape == (X.shape[0],)
+    np.testing.assert_almost_equal(pred, expect, decimal=8)
+
+
+def test_EnsembleVotingRegressor_weights():
+    weights = [1, 2, 10]
+    ereg = EnsembleVotingRegressor(regressors=_make_regressors(),
+                                   weights=weights)
+    pred = ereg.fit(X, y).predict(X)
+
+    members = np.asarray([reg.predict(X) for reg in ereg.regr_])
+    expect = np.average(members, axis=0, weights=weights)
+    np.testing.assert_almost_equal(pred, expect, decimal=8)
+
+
+def test_weights_numpy_array():
+    # An ndarray of weights must not trip the length check in `fit`.
+    ereg = EnsembleVotingRegressor(regressors=_make_regressors(),
+                                   weights=np.array([1., 2., 10.]))
+    pred = ereg.fit(X, y).predict(X)
+    assert pred.shape == (X.shape[0],)
+
+
+def test_weights_length_mismatch():
+    ereg = EnsembleVotingRegressor(regressors=_make_regressors(),
+                                   weights=[1, 2])
+    with pytest.raises(ValueError):
+        ereg.fit(X, y)
+
+
+def test_sample_weight():
+    # with no weight
+    ereg = EnsembleVotingRegressor(regressors=_make_regressors())
+    pred1 = ereg.fit(X, y).predict(X)
+
+    # with weight = 1
+    w = np.ones(len(y))
+    ereg = EnsembleVotingRegressor(regressors=_make_regressors())
+    pred2 = ereg.fit(X, y, sample_weight=w).predict(X)
+
+    # with random weight
+    random.seed(87)
+    w = np.array([random.random() for _ in range(len(y))])
+    ereg = EnsembleVotingRegressor(regressors=_make_regressors())
+    pred3 = ereg.fit(X, y, sample_weight=w).predict(X)
+
+    diff12 = np.max(np.abs(pred1 - pred2))
+    diff23 = np.max(np.abs(pred2 - pred3))
+    assert diff12 < 1e-3, "max diff is %.4f" % diff12
+    assert diff23 > 1e-3, "max diff is %.4f" % diff23
+
+
+def test_no_weight_support():
+    random.seed(87)
+    w = np.array([random.random() for _ in range(len(y))])
+    # KNeighborsRegressor.fit does not take a `sample_weight` argument.
+    regs = _make_regressors() + [KNeighborsRegressor(n_neighbors=1)]
+    ereg = EnsembleVotingRegressor(regressors=regs)
+    with pytest.raises(TypeError):
+        ereg.fit(X, y, sample_weight=w)
+
+
+def test_no_weight_support_with_no_weight():
+    regs = _make_regressors() + [KNeighborsRegressor(n_neighbors=1)]
+    ereg = EnsembleVotingRegressor(regressors=regs)
+    ereg.fit(X, y)
+
+
+def test_1model_predictions():
+    reg = GradientBoostingRegressor(random_state=123, n_estimators=10)
+    ens_1 = EnsembleVotingRegressor(regressors=[reg], weights=None)
+    ens_2 = EnsembleVotingRegressor(regressors=[reg], weights=[1.])
+
+    pred_e1 = ens_1.fit(X, y).predict(X)
+    pred_e2 = ens_2.fit(X, y).predict(X)
+    pred_e3 = reg.fit(X, y).predict(X)
+
+    np.testing.assert_almost_equal(pred_e1, pred_e2, decimal=8)
+    np.testing.assert_almost_equal(pred_e1, pred_e3, decimal=8)
+
+
+def test_transform():
+    ereg = EnsembleVotingRegressor(regressors=_make_regressors())
+    out = ereg.fit(X, y).transform(X)
+
+    assert out.shape == (X.shape[0], 3)
+    for col, reg in enumerate(ereg.regr_):
+        np.testing.assert_almost_equal(out[:, col], reg.predict(X),
+                                       decimal=8)
+
+
+def test_refit_false():
+    regs = _make_regressors()
+    ereg = EnsembleVotingRegressor(regressors=regs, refit=False)
+    ereg.fit(X, y)
+
+    # refit=False fits the passed-in objects themselves, not clones.
+    assert all(a is b for a, b in zip(ereg.regr_, regs))
+
+
+def test_verbose_duplicate_regressor(capsys):
+    reg = LinearRegression()
+    ereg = EnsembleVotingRegressor(regressors=[reg, reg],
+                                   refit=False,
+                                   verbose=1)
+    ereg.fit(X, y)
+
+    out, _ = capsys.readouterr()
+    assert "(1/2)" in out
+    assert "(2/2)" in out
+
+
+def test_gridsearch():
+    ereg = EnsembleVotingRegressor(regressors=_make_regressors())
+
+    params = {'gradientboostingregressor__n_estimators': [5, 10],
+              'randomforestregressor__n_estimators': [5, 10]}
+
+    grid = GridSearchCV(estimator=ereg, param_grid=params, cv=5)
+    grid.fit(X, y)
+
+    assert len(grid.cv_results_['mean_test_score']) == 4
+    assert set(grid.best_params_) == set(params)
+
+
+def test_gridsearch_enumerate_names():
+    reg1 = GradientBoostingRegressor(random_state=1, n_estimators=5)
+    reg2 = RandomForestRegressor(random_state=1, n_estimators=5)
+    ereg = EnsembleVotingRegressor(regressors=[reg1, reg1, reg2])
+
+    params = {'gradientboostingregressor-1__n_estimators': [5, 10],
+              'gradientboostingregressor-2__n_estimators': [5, 10],
+              'randomforestregressor__n_estimators': [5, 10]}
+
+    grid = GridSearchCV(estimator=ereg, param_grid=params, cv=5)
+    grid = grid.fit(X, y)
+
+    assert len(grid.cv_results_['mean_test_score']) == 8
+
+
+def test_get_params():
+    reg1 = KNeighborsRegressor(n_neighbors=1)
+    reg2 = RandomForestRegressor(random_state=1, n_estimators=10)
+    reg3 = LinearRegression()
+    ereg = EnsembleVotingRegressor(regressors=[reg1, reg2, reg3])
+
+    got = sorted(list({s.split('__')[0] for s in ereg.get_params().keys()}))
+    expect = ['kneighborsregressor',
+              'linearregression',
+              'randomforestregressor',
+              'refit',
+              'regressors',
+              'verbose',
+              'weights']
+    assert got == expect, got
+
+
+def test_regressor_gridsearch():
+    reg1 = KNeighborsRegressor(n_neighbors=1)
+    reg2 = RandomForestRegressor(random_state=1, n_estimators=10)
+    reg3 = LinearRegression()
+    ereg = EnsembleVotingRegressor(regressors=[reg1])
+
+    params = {'regressors': [[reg1, reg1, reg1], [reg2, reg3]]}
+
+    grid = GridSearchCV(estimator=ereg,
+                        param_grid=params,
+                        cv=5,
+                        refit=True)
+    grid.fit(X, y)
+
+    best = grid.best_params_['regressors']
+    assert len(best) in (2, 3)
+    assert len(grid.best_estimator_.regr_) == len(best)
+
+
+def test_clone():
+    ereg = EnsembleVotingRegressor(regressors=_make_regressors(),
+                                   refit=False)
+    clone(ereg)