|
| 1 | +"""Class to perform over-sampling using ROSE.""" |
| 2 | + |
| 3 | +import numpy as np |
| 4 | +from scipy import sparse |
| 5 | +from sklearn.utils import check_random_state |
| 6 | +from .base import BaseOverSampler |
| 7 | +from ..utils._validation import _deprecate_positional_args |
| 8 | + |
| 9 | + |
class ROSE(BaseOverSampler):
    """Random Over-Sampling Examples (ROSE).

    This object is the implementation of ROSE algorithm.
    It generates new samples by a smoothed bootstrap approach,
    taking a random subsample of original data and adding a
    multivariate kernel density estimate :math:`f(x|Y_i)` around
    them with a smoothing matrix :math:`H_j`, and finally sampling
    from this distribution. A shrinking matrix can be provided, to
    set the bandwidth of the gaussian kernel.

    Read more in the :ref:`User Guide <rose>`.

    Parameters
    ----------
    sampling_strategy : float, str, dict or callable, default='auto'
        Sampling information to resample the data set.

        - When ``float``, it corresponds to the desired ratio of the number of
          samples in the minority class over the number of samples in the
          majority class after resampling. Therefore, the ratio is expressed as
          :math:`\\alpha_{os} = N_{rm} / N_{M}` where :math:`N_{rm}` is the
          number of samples in the minority class after resampling and
          :math:`N_{M}` is the number of samples in the majority class.

            .. warning::
               ``float`` is only available for **binary** classification. An
               error is raised for multi-class classification.

        - When ``str``, specify the class targeted by the resampling. The
          number of samples in the different classes will be equalized.
          Possible choices are:

            ``'minority'``: resample only the minority class;

            ``'not minority'``: resample all classes but the minority class;

            ``'not majority'``: resample all classes but the majority class;

            ``'all'``: resample all classes;

            ``'auto'``: equivalent to ``'not majority'``.

        - When ``dict``, the keys correspond to the targeted classes. The
          values correspond to the desired number of samples for each targeted
          class.

        - When callable, function taking ``y`` and returns a ``dict``. The keys
          correspond to the targeted classes. The values correspond to the
          desired number of samples for each class.

    shrink_factors : dict, default=None
        Dict of {classes: shrinkfactors} items, applied to
        the gaussian kernels. It can be used to compress/dilate the kernel.
        When ``None``, or for any class missing from the dict, a factor of
        1 is used.

    random_state : int, RandomState instance, default=None
        Control the randomization of the algorithm.

        - If int, ``random_state`` is the seed used by the random number
          generator;
        - If ``RandomState`` instance, random_state is the random number
          generator;
        - If ``None``, the random number generator is the ``RandomState``
          instance used by ``np.random``.

    n_jobs : int, default=None
        Stored on the estimator for API consistency. The resampling code
        in this class runs sequentially and does not read this value.

    See Also
    --------
    SMOTE : Over-sample using SMOTE.

    Notes
    -----
    For a class with :math:`n` samples and :math:`d` features, every
    synthetic sample is a randomly drawn original sample of that class plus
    gaussian noise. The noise of feature :math:`j` has standard deviation
    ``shrink_factor * (4 / ((d + 2) * n)) ** (1 / (d + 4)) * std_j`` where
    ``std_j`` is the sample standard deviation (``ddof=1``) of feature
    :math:`j` within the class. A class with a single sample has no
    measurable spread, so its synthetic samples are exact copies.

    References
    ----------
    .. [1] N. Lunardon, G. Menardi, N.Torelli, "ROSE: A Package for Binary
       Imbalanced Learning," R Journal, 6(1), 2014.

    .. [2] G Menardi, N. Torelli, "Training and assessing classification
       rules with imbalanced data," Data Mining and Knowledge
       Discovery, 28(1), pp.92-122, 2014.

    Examples
    --------

    >>> from imblearn.over_sampling import ROSE
    >>> from sklearn.datasets import make_classification
    >>> from collections import Counter
    >>> r = ROSE(shrink_factors={0:1, 1:0.5, 2:0.7})
    >>> X, y = make_classification(n_classes=3, class_sep=2,
    ... weights=[0.1, 0.7, 0.2], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=2000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({1: 1400, 2: 400, 0: 200})
    >>> X_res, y_res = r.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({2: 1400, 1: 1400, 0: 1400})
    """

    @_deprecate_positional_args
    def __init__(self, *, sampling_strategy="auto", shrink_factors=None,
                 random_state=None, n_jobs=None):
        super().__init__(sampling_strategy=sampling_strategy)
        self.random_state = random_state
        self.shrink_factors = shrink_factors
        self.n_jobs = n_jobs

    def _make_samples(self,
                      X,
                      class_indices,
                      n_class_samples,
                      h_shrink,
                      random_state=None):
        """Generate synthetic samples for a single class.

        Rows of the class are drawn with replacement and perturbed with
        gaussian noise whose per-feature scale is the class standard
        deviation times the AMISE-minimizing factor times ``h_shrink``.

        Parameters
        ----------
        X : {ndarray, sparse matrix}, shape (n_samples, n_features)
            Observations from which the samples will be created.

        class_indices : ndarray, shape (n_class_samples,)
            Row indices of ``X`` that belong to the target class.

        n_class_samples : int
            The number of synthetic samples to generate for the class.

        h_shrink : float
            The shrink factor applied to the kernel bandwidth.

        random_state : int, RandomState instance or None, default=None
            Generator used for the draws. When ``None`` a generator is
            built from ``self.random_state``.

        Returns
        -------
        X_new : {ndarray, sparse matrix}, shape (n_class_samples, n_features)
            Synthetically generated samples. A CSR matrix is returned when
            ``X`` is sparse, an ndarray otherwise.
        """
        random_state = check_random_state(
            self.random_state if random_state is None else random_state)

        n_features = X.shape[1]
        n_class = len(class_indices)

        # Draw order (indices first, noise second) is kept fixed so that a
        # given seed keeps producing the same sequence of draws.
        samples_indices = random_state.choice(
            class_indices, size=n_class_samples, replace=True)

        X_class = X[class_indices, :]
        X_seeds = X[samples_indices, :]
        is_sparse = sparse.issparse(X)
        if is_sparse:
            # The smoothed output is dense by construction, so work on
            # dense arrays explicitly rather than through np.matrix.
            X_class = X_class.toarray()
            X_seeds = X_seeds.toarray()

        if n_class > 1:
            amise_factor = (4 / ((n_features + 2) * n_class)) ** (
                1 / (n_features + 4))
            scales = np.std(X_class, axis=0, ddof=1)
            bandwidth = h_shrink * amise_factor * scales
        else:
            # The ddof=1 standard deviation is undefined for fewer than two
            # samples; fall back to a zero bandwidth (plain duplication)
            # instead of propagating NaN into the output.
            bandwidth = np.zeros(n_features)

        noise = random_state.standard_normal(
            size=(n_class_samples, n_features))
        # Broadcasting is equivalent to multiplying by diag(bandwidth).
        X_new = noise * bandwidth + X_seeds

        if is_sparse:
            return sparse.csr_matrix(X_new)
        return X_new

    def _fit_resample(self, X, y):
        """Append ROSE-generated samples for every targeted class.

        Parameters
        ----------
        X : {ndarray, sparse matrix}, shape (n_samples, n_features)
            Original observations.

        y : ndarray, shape (n_samples,)
            Original targets.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}
            ``X`` followed by the synthetic samples, cast to ``X.dtype``.

        y_resampled : ndarray
            ``y`` followed by the synthetic targets, cast to ``y.dtype``.
        """
        # One generator for the whole fit: re-seeding per class would give
        # every class the same index/noise stream when the seed is an int.
        random_state = check_random_state(self.random_state)

        # Constructor parameters must not be modified while fitting, so the
        # default is resolved into a local instead of ``self``.
        shrink_factors = self.shrink_factors
        if shrink_factors is None:
            shrink_factors = {}

        X_parts = [X]
        y_parts = [y]
        # ``sampling_strategy_`` maps each targeted class to the number of
        # samples to generate (presumably set by the base class before this
        # method runs).
        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            class_indices = np.flatnonzero(y == class_sample)
            X_parts.append(self._make_samples(
                X,
                class_indices,
                n_samples,
                shrink_factors.get(class_sample, 1),
                random_state=random_state))
            y_parts.append(np.full(n_samples, class_sample, dtype=y.dtype))

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_parts, format=X.format)
        else:
            X_resampled = np.vstack(X_parts)
        y_resampled = np.hstack(y_parts)

        return X_resampled.astype(X.dtype), y_resampled.astype(y.dtype)
0 commit comments