diff --git a/.gitignore b/.gitignore
index baff57d6b..0834c9fa1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,4 +66,9 @@ target/
 *.sln
 *.pyproj
 *.suo
-*.vs
\ No newline at end of file
+*.vs
+/*.csproj
+/.spyproject
+/.vscode
+/bin/Debug
+/obj/x86/Debug
diff --git a/imblearn/ensemble/__init__.py b/imblearn/ensemble/__init__.py
index 6c17409e5..d7ef467ad 100644
--- a/imblearn/ensemble/__init__.py
+++ b/imblearn/ensemble/__init__.py
@@ -4,6 +4,7 @@
 """
 from .easy_ensemble import EasyEnsemble
+from .easy_ensemble_generalization import EasyEnsembleGeneralization
 from .balance_cascade import BalanceCascade
 
-__all__ = ['EasyEnsemble', 'BalanceCascade']
+__all__ = ['EasyEnsemble', 'EasyEnsembleGeneralization', 'BalanceCascade']
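For context: conceptually, the new class automates the construction sketched below. This is an illustrative snippet, not code from the patch; the real implementation draws per-member seeds through sklearn's private helper `_set_random_states` rather than the explicit `random_state=i` used here, and the name `n_members` is made up for the sketch.

    # Sketch of what EasyEnsembleGeneralization.fit() assembles internally.
    from sklearn.ensemble import VotingClassifier
    from sklearn.tree import DecisionTreeClassifier
    from imblearn.pipeline import Pipeline
    from imblearn.under_sampling import RandomUnderSampler

    n_members = 5
    members = []
    for i in range(n_members):
        # Each member undersamples the majority class, then fits a tree.
        pipe = Pipeline([('sampler', RandomUnderSampler(random_state=i)),
                         ('estimator', DecisionTreeClassifier(random_state=i))])
        members.append((str(i), pipe))

    # Soft voting averages the members' predicted class probabilities.
    ensemble = VotingClassifier(members, voting='soft')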
diff --git a/imblearn/ensemble/easy_ensemble_generalization.py b/imblearn/ensemble/easy_ensemble_generalization.py
new file mode 100644
index 000000000..a8d3a5f86
--- /dev/null
+++ b/imblearn/ensemble/easy_ensemble_generalization.py
@@ -0,0 +1,201 @@
+"""Easy Ensemble Generalization."""
+
+# Authors: Christos Aridas
+#
+# License: MIT
+
+from sklearn.base import ClassifierMixin, clone
+from sklearn.ensemble import VotingClassifier
+from sklearn.ensemble.base import BaseEnsemble, _set_random_states
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.utils import check_random_state
+from sklearn.utils.multiclass import check_classification_targets
+from sklearn.utils.validation import check_is_fitted
+
+from ..pipeline import Pipeline
+from ..under_sampling import RandomUnderSampler
+
+
+class EasyEnsembleGeneralization(BaseEnsemble, ClassifierMixin):
+    """Classifier generalizing the Easy Ensemble algorithm for imbalanced
+    datasets.
+
+    Each ensemble member is a pipeline that resamples the training set with
+    a clone of ``base_sampler`` and then fits a clone of ``base_estimator``;
+    the members' predictions are combined by voting.
+
+    Parameters
+    ----------
+    base_estimator : object or None, optional (default=None)
+        The base estimator to fit on resampled subsets of the dataset.
+        If None, a ``DecisionTreeClassifier`` is used. Invoking the ``fit``
+        method on the ``EasyEnsembleGeneralization`` will fit clones of the
+        base estimator that will be stored in the attribute
+        ``estimators_``. An estimator can be set to `None` using
+        ``set_params``.
+
+    base_sampler : object or None, optional (default=None)
+        The base sampler used to resample the dataset for each ensemble
+        member. If None, a ``RandomUnderSampler`` is used. Invoking the
+        ``fit`` method will fit clones of the base sampler.
+
+    n_estimators : int, optional (default=5)
+        The number of base estimators in the ensemble.
+
+    voting : str, {'hard', 'soft'} (default='soft')
+        If 'hard', uses predicted class labels for majority rule voting.
+        Else if 'soft', predicts the class label based on the argmax of
+        the sums of the predicted probabilities, which is recommended for
+        an ensemble of well-calibrated classifiers.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+    n_jobs : int, optional (default=1)
+        The number of jobs to run in parallel for ``fit``.
+        If -1, then the number of jobs is set to the number of cores.
+
+    Attributes
+    ----------
+    estimators_ : list of classifiers
+        The collection of fitted estimators.
+
+    classes_ : array-like, shape = [n_predictions]
+        The class labels.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from imblearn.ensemble import EasyEnsembleGeneralization as EEG
+    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
+    >>> y = np.array([1, 1, 1, 2, 2, 2])
+    >>> eeg = EEG(voting='soft', random_state=0)
+    >>> eeg = eeg.fit(X, y)
+    >>> print(eeg.predict(X))
+    [1 1 1 2 2 2]
+    """
+
+    def __init__(self,
+                 base_estimator=None,
+                 base_sampler=None,
+                 n_estimators=5,
+                 voting='soft',
+                 random_state=None,
+                 n_jobs=1):
+
+        self.base_estimator = base_estimator
+        self.base_sampler = base_sampler
+        self.n_estimators = n_estimators
+        self.voting = voting
+        self.random_state = random_state
+        self.n_jobs = n_jobs
+
+    def _validate_estimator(self):
+        """Check the estimator and set the base_estimator_ attribute."""
+        super(EasyEnsembleGeneralization, self)._validate_estimator(
+            default=DecisionTreeClassifier())
+
+    def _validate_sampler(self):
+        """Check the sampler and set the base_sampler_ attribute."""
+        if self.base_sampler is not None:
+            self.base_sampler_ = self.base_sampler
+        else:
+            self.base_sampler_ = RandomUnderSampler()
+
+        if not hasattr(self.base_sampler_, 'random_state'):
+            raise ValueError("base_sampler must have a random_state "
+                             "parameter")
+
+    def fit(self, X, y, sample_weight=None):
+        """Build an ensemble of estimators from the training set (X, y).
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
+            The training input samples. Sparse matrices are accepted only if
+            they are supported by the base estimator.
+
+        y : array-like, shape = [n_samples]
+            The target values (class labels).
+
+        sample_weight : array-like, shape = [n_samples] or None
+            Sample weights. Currently ignored: the internal pipelines do not
+            forward sample weights. The parameter is kept for scikit-learn
+            API compatibility.
+
+        Returns
+        -------
+        self : object
+            Returns self.
+        """
+        check_classification_targets(y)
+
+        self._validate_estimator()
+        self._validate_sampler()
+
+        random_state = check_random_state(self.random_state)
+
+        steps = [('sampler', self.base_sampler_),
+                 ('estimator', self.base_estimator_)]
+        pipeline_template = Pipeline(steps)
+
+        pipelines = []
+        for _ in range(self.n_estimators):
+            pipeline = clone(pipeline_template)
+            # Draw a fresh seed for every random_state parameter in the
+            # pipeline so that each member resamples differently.
+            _set_random_states(pipeline, random_state)
+            pipelines.append(pipeline)
+
+        ensemble_members = [(str(i), pipeline)
+                            for i, pipeline in enumerate(pipelines)]
+
+        self._voting = VotingClassifier(ensemble_members,
+                                        voting=self.voting,
+                                        n_jobs=self.n_jobs)
+        self._voting.fit(X, y)
+
+        self.classes_ = self._voting.classes_
+        self.estimators_ = [pipeline.named_steps['estimator']
+                            for pipeline in self._voting.estimators_]
+
+        return self
+
+    def predict(self, X):
+        """Predict class labels for X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            The input samples.
+
+        Returns
+        -------
+        maj : array-like, shape = [n_samples]
+            Predicted class labels.
+        """
+        check_is_fitted(self, "_voting")
+        return self._voting.predict(X)
+
+    def predict_proba(self, X):
+        """Compute probabilities of possible outcomes for all samples in X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            The input samples.
+
+        Returns
+        -------
+        avg : array-like, shape = [n_samples, n_classes]
+            Weighted average probability for each class per sample.
+        """
+        check_is_fitted(self, "_voting")
+        return self._voting.predict_proba(X)
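Usage note (illustrative, not part of the patch): any estimator/sampler pair can be plugged in, as long as the sampler exposes a `random_state` parameter. The parameter values below are arbitrary examples.

    from sklearn.linear_model import LogisticRegression
    from imblearn.ensemble import EasyEnsembleGeneralization
    from imblearn.under_sampling import RandomUnderSampler

    eeg = EasyEnsembleGeneralization(
        base_estimator=LogisticRegression(),
        base_sampler=RandomUnderSampler(replacement=True),
        n_estimators=10,
        voting='soft',
        random_state=0)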
diff --git a/imblearn/ensemble/tests/test_easy_ensemble_generalization.py b/imblearn/ensemble/tests/test_easy_ensemble_generalization.py
new file mode 100644
index 000000000..acbf56e83
--- /dev/null
+++ b/imblearn/ensemble/tests/test_easy_ensemble_generalization.py
@@ -0,0 +1,79 @@
+"""Testing for EasyEnsembleGeneralization."""
+
+import numpy as np
+from sklearn.exceptions import NotFittedError
+from sklearn.model_selection import GridSearchCV, cross_val_score
+from sklearn.utils.testing import assert_almost_equal, assert_array_equal
+from sklearn.utils.testing import assert_raise_message
+
+from imblearn.ensemble import EasyEnsembleGeneralization as EEG
+
+RND_SEED = 0
+X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
+              [1.25192108, -0.22367336], [0.53366841, -0.30312976],
+              [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
+              [0.83680821, 1.72827342], [0.3084254, 0.33299982],
+              [0.70472253, -0.73309052], [0.28893132, -0.38761769],
+              [1.15514042, 0.0129463], [0.88407872, 0.35454207],
+              [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
+              [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
+              [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
+              [0.08711622, 0.93259929], [1.70580611, -0.11219234]])
+y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
+
+
+def test_estimator_init():
+    eeg = EEG(n_estimators=0)
+    msg = "n_estimators must be greater than zero, got 0."
+    assert_raise_message(ValueError, msg, eeg.fit, X, y)
+
+
+def test_predict_proba_hardvoting():
+    eeg = EEG(voting='hard', random_state=RND_SEED).fit(X, y)
+    msg = "predict_proba is not available when voting='hard'"
+    assert_raise_message(AttributeError, msg, eeg.predict_proba, X)
+
+
+def test_notfitted():
+    eeg = EEG()
+    msg = ("This EasyEnsembleGeneralization instance is not fitted yet."
+           " Call 'fit' with appropriate arguments before using this"
+           " method.")
+    assert_raise_message(NotFittedError, msg, eeg.predict_proba, X)
+
+
+def test_majority_label():
+    """Check classification by majority vote."""
+    eeg = EEG(voting='soft', random_state=RND_SEED)
+    scores = cross_val_score(eeg, X, y, cv=5, scoring='roc_auc')
+    assert_almost_equal(scores.mean(), 0.65, decimal=2)
+
+
+def test_predict_on_toy_problem():
+    """Manually check predicted class labels for the toy dataset."""
+    eeg = EEG(voting='hard', random_state=RND_SEED)
+    assert_array_equal(eeg.fit(X, y).predict(X[0:6]), [0, 1, 0, 0, 0, 1])
+
+
+def test_gridsearch():
+    """Check that EasyEnsembleGeneralization supports GridSearchCV."""
+    eeg = EEG(random_state=RND_SEED)
+
+    params = {'voting': ['soft', 'hard'],
+              'n_estimators': [2, 3, 4]}
+
+    grid = GridSearchCV(estimator=eeg, param_grid=params, cv=3)
+    grid.fit(X, y)
+
+
+def test_parallel_predict():
+    """Check that n_jobs does not change the predictions."""
+    eeg1 = EEG(voting='soft', random_state=RND_SEED, n_jobs=1).fit(X, y)
+    eeg2 = EEG(voting='soft', random_state=RND_SEED, n_jobs=2).fit(X, y)
+
+    assert_array_equal(eeg1.predict(X), eeg2.predict(X))
+    assert_array_equal(eeg1.predict_proba(X), eeg2.predict_proba(X))
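Beyond the unit tests, an end-to-end smoke check could look like the sketch below (illustrative only; the dataset is synthetic and the class weights are arbitrary, assuming sklearn's `make_classification`).

    from sklearn.datasets import make_classification
    from sklearn.model_selection import cross_val_score
    from imblearn.ensemble import EasyEnsembleGeneralization as EEG

    # 10% positive class: the imbalanced setting the estimator targets.
    X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                               random_state=0)
    scores = cross_val_score(EEG(voting='soft', random_state=0), X, y,
                             cv=5, scoring='roc_auc')
    print(scores.mean())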