From cf9bd6d423e85e77601d32bd77703d48be9fd908 Mon Sep 17 00:00:00 2001 From: Anton Ermakov Date: Fri, 15 Mar 2019 14:31:30 +0300 Subject: [PATCH 1/6] Support of parameters max_features (for `threshold` exceeding) and bootstrap_max_features (for `bootstrap_threshold` exceeding) while performing threshold check on bootstrap step (SelectFromModel invoke) and while performing final check (get_support method). Backward compatible with previous implementation --- stability_selection/stability_selection.py | 56 ++++++++++++++++++---- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/stability_selection/stability_selection.py b/stability_selection/stability_selection.py index 1f95af8..fa3c80d 100644 --- a/stability_selection/stability_selection.py +++ b/stability_selection/stability_selection.py @@ -63,10 +63,10 @@ def _bootstrap_generator(n_bootstrap_iterations, bootstrap_func, y, def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value, - threshold=None): + threshold=None, max_features=None): """ Fits base_estimator on a bootstrap sample of the original data, - and returns a mas of the variables that are selected by the fitted model. + and returns a mask of the variables that are selected by the fitted model. Parameters ---------- @@ -95,6 +95,11 @@ def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value, or implicitly (e.g, Lasso), the threshold used is 1e-5. Otherwise, "mean" is used by default. + max_features : int or None, optional + The maximum number of features selected scoring above ``threshold``. + To disable ``threshold`` and only select based on ``max_features``, + set ``threshold=-np.inf``. 
+ Returns ------- selected_variables : array-like, shape = [n_features] Boolean mask of selected variables. @@ -108,6 +113,7 @@ def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value, selector_model = _return_estimator_from_pipeline(base_estimator) variable_selector = SelectFromModel(estimator=selector_model, threshold=threshold, + max_features=max_features, prefit=True) return variable_selector.get_support() @@ -187,6 +193,9 @@ class StabilitySelection(BaseEstimator, TransformerMixin): threshold : float. Threshold defining the minimum cutoff value for the stability scores. + max_features : int or None, optional + The maximum number of features selected scoring above ``threshold``. + bootstrap_func : str or callable fun (default=bootstrap_without_replacement) The function used to subsample the data. This parameter can be: - A string, which must be one of @@ -208,6 +217,11 @@ class StabilitySelection(BaseEstimator, TransformerMixin): or implicitly (e.g, Lasso), the threshold used is 1e-5. Otherwise, "mean" is used by default. + bootstrap_max_features : int or None, optional + The maximum number of features selected scoring above ``bootstrap_threshold``. + To disable ``bootstrap_threshold`` and only select based on ``bootstrap_max_features``, + set ``bootstrap_threshold=-np.inf``. + verbose : integer. Controls the verbosity: the higher, the more messages. @@ -253,10 +267,13 @@ class StabilitySelection(BaseEstimator, TransformerMixin): of the Royal Statistical Society: Series B (Statistical Methodology), 75(1), pp.55-80. 
""" + def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name='C', lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100, - sample_fraction=0.5, threshold=0.6, bootstrap_func=bootstrap_without_replacement, - bootstrap_threshold=None, verbose=0, n_jobs=1, pre_dispatch='2*n_jobs', + sample_fraction=0.5, threshold=0.6, max_features=None, + bootstrap_func=bootstrap_without_replacement, + bootstrap_threshold=None, bootstrap_max_features=None, + verbose=0, n_jobs=1, pre_dispatch='2*n_jobs', random_state=None): self.base_estimator = base_estimator self.lambda_name = lambda_name @@ -264,8 +281,10 @@ def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name= self.n_bootstrap_iterations = n_bootstrap_iterations self.sample_fraction = sample_fraction self.threshold = threshold + self.max_features = max_features self.bootstrap_func = bootstrap_func self.bootstrap_threshold = bootstrap_threshold + self.bootstrap_max_features = bootstrap_max_features self.verbose = verbose self.n_jobs = n_jobs self.pre_dispatch = pre_dispatch @@ -282,6 +301,9 @@ def _validate_input(self): if not isinstance(self.threshold, float) or not (0.0 < self.threshold <= 1.0): raise ValueError('threshold should be a float in (0, 1], got %s' % self.threshold) + if self.max_features is not None and (not isinstance(self.max_features, int) or self.max_features < 1): + raise ValueError('max_features should be a positive int, got %s' % self.max_features) + if self.lambda_name not in self.base_estimator.get_params().keys(): raise ValueError('lambda_name is set to %s, but base_estimator %s ' 'does not have a parameter ' @@ -340,7 +362,8 @@ def fit(self, X, y): y=y[subsample], lambda_name=self.lambda_name, lambda_value=lambda_value, - threshold=self.bootstrap_threshold) + threshold=self.bootstrap_threshold, + max_features=self.bootstrap_max_features) for subsample in bootstrap_samples) stability_scores[:, idx] = np.vstack(selected_variables).mean(axis=0) @@ 
-348,7 +371,7 @@ def fit(self, X, y): self.stability_scores_ = stability_scores return self - def get_support(self, indices=False, threshold=None): + def get_support(self, indices=False, threshold=None, max_features=None): """Get a mask, or integer index, of the features selected Parameters @@ -361,6 +384,9 @@ def get_support(self, indices=False, threshold=None): Threshold defining the minimum cutoff value for the stability scores. + max_features : int or None, optional + The maximum number of features selected scoring above ``threshold``. + Returns ------- support : array @@ -377,9 +403,23 @@ def get_support(self, indices=False, threshold=None): raise ValueError('threshold should be a float in (0, 1], ' 'got %s' % self.threshold) - cutoff = self.threshold if threshold is None else threshold - mask = (self.stability_scores_.max(axis=1) > cutoff) + n_features = self.stability_scores_.shape[0] + if max_features is not None and (not isinstance(max_features, int) + or not(1 <= max_features <= n_features)): + raise ValueError('max_features should be an int in [1, %s], ' + 'got %s' % (n_features, max_features)) + threshold_cutoff = self.threshold if threshold is None else threshold + max_features_cutoff = self.max_features if max_features is None else max_features + + if max_features_cutoff is None: + mask = (self.stability_scores_.max(axis=1) > threshold_cutoff) + else: + exceed_counts = (self.stability_scores_ > threshold_cutoff).sum(axis=1) + max_features_cutoff = min(max_features_cutoff, (exceed_counts > 0).sum()) + feature_indices = (-exceed_counts).argsort()[:max_features_cutoff] + mask = np.zeros(n_features, dtype=np.bool) + mask[feature_indices] = True return mask if not indices else np.where(mask)[0] def transform(self, X, threshold=None): From 1b1530728d0b79c924f42aee8341a81ee19d1b22 Mon Sep 17 00:00:00 2001 From: Anton Ermakov Date: Fri, 15 Mar 2019 15:42:05 +0300 Subject: [PATCH 2/6] tests added --- stability_selection/tests/test_common.py | 5 ++++ 
.../tests/test_stability_selection.py | 29 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/stability_selection/tests/test_common.py b/stability_selection/tests/test_common.py index 7ef0995..b65083d 100644 --- a/stability_selection/tests/test_common.py +++ b/stability_selection/tests/test_common.py @@ -71,3 +71,8 @@ def test_sample_fraction(): @raises(ValueError) def test_lambda_name(): StabilitySelection(lambda_name='n_estimators')._validate_input() + + +@raises(ValueError) +def test_max_features_zero(): + StabilitySelection(max_features=0)._validate_input() diff --git a/stability_selection/tests/test_stability_selection.py b/stability_selection/tests/test_stability_selection.py index 4193d29..8af30fe 100644 --- a/stability_selection/tests/test_stability_selection.py +++ b/stability_selection/tests/test_stability_selection.py @@ -154,6 +154,35 @@ def test_no_features(): np.empty(0).reshape((X.shape[0], 0))) +def test_stability_selection_max_features(): + n, p, k = 1000, 1000, 5 + lambda_grid=np.logspace(-5, -1, 25) + + X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) + selector = StabilitySelection(lambda_grid=lambda_grid, + max_features=1, + verbose=1) + selector.fit(X, y) + X_r = selector.transform(X) + assert(X_r.shape == (n, 1)) + + selector = StabilitySelection(lambda_grid=lambda_grid, + max_features=k, + verbose=1) + selector.fit(X, y) + X_r = selector.transform(X) + assert(X_r.shape == (n, k)) + + selector = StabilitySelection(lambda_grid=lambda_grid, + max_features=k+1, + verbose=1) + selector.fit(X, y) + X_r = selector.transform(X) + assert(X_r.shape == (n, k)) + + print('ok') + + def test_stability_plot(): n, p, k = 500, 200, 5 From c5728b3ebb2a957ada094bb325b20416e1418648 Mon Sep 17 00:00:00 2001 From: Anton Ermakov Date: Fri, 15 Mar 2019 23:43:14 +0300 Subject: [PATCH 3/6] reducing verbose level to satisfy travis CI log limit (10000 lines) --- stability_selection/tests/test_stability_selection.py | 6 +++--- 1
file changed, 3 insertions(+), 3 deletions(-) diff --git a/stability_selection/tests/test_stability_selection.py b/stability_selection/tests/test_stability_selection.py index 8af30fe..34147c3 100644 --- a/stability_selection/tests/test_stability_selection.py +++ b/stability_selection/tests/test_stability_selection.py @@ -161,21 +161,21 @@ def test_stability_selection_max_features(): X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) selector = StabilitySelection(lambda_grid=lambda_grid, max_features=1, - verbose=1) + verbose=0) selector.fit(X, y) X_r = selector.transform(X) assert(X_r.shape == (n, 1)) selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k, - verbose=1) + verbose=0) selector.fit(X, y) X_r = selector.transform(X) assert(X_r.shape == (n, k)) selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k+1, - verbose=1) + verbose=0) selector.fit(X, y) X_r = selector.transform(X) assert(X_r.shape == (n, k)) From bfa331fded6633e9884b000b985e819446923d91 Mon Sep 17 00:00:00 2001 From: Anton Ermakov Date: Sat, 16 Mar 2019 00:00:40 +0300 Subject: [PATCH 4/6] explicit LogisticRegression 'liblinear' solver (l1) to suppress warnings and satisfy travis CI log limits --- stability_selection/stability_selection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stability_selection/stability_selection.py b/stability_selection/stability_selection.py index fa3c80d..0bb6f9e 100644 --- a/stability_selection/stability_selection.py +++ b/stability_selection/stability_selection.py @@ -268,8 +268,8 @@ class StabilitySelection(BaseEstimator, TransformerMixin): 75(1), pp.55-80. 
""" - def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name='C', - lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100, + def __init__(self, base_estimator=LogisticRegression(penalty='l1', solver='liblinear'), + lambda_name='C', lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100, sample_fraction=0.5, threshold=0.6, max_features=None, bootstrap_func=bootstrap_without_replacement, bootstrap_threshold=None, bootstrap_max_features=None, From be7decf8230adcc8cefc2a30df7b5ae0cace2b98 Mon Sep 17 00:00:00 2001 From: Anton Ermakov Date: Sat, 16 Mar 2019 01:00:47 +0300 Subject: [PATCH 5/6] small refactor + test fixes --- stability_selection/stability_selection.py | 16 +++--- .../tests/test_stability_selection.py | 50 ++++++++++++------- 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/stability_selection/stability_selection.py b/stability_selection/stability_selection.py index 0bb6f9e..6e31bee 100644 --- a/stability_selection/stability_selection.py +++ b/stability_selection/stability_selection.py @@ -410,16 +410,16 @@ def get_support(self, indices=False, threshold=None, max_features=None): 'got %s' % (n_features, max_features)) threshold_cutoff = self.threshold if threshold is None else threshold - max_features_cutoff = self.max_features if max_features is None else max_features + mask = (self.stability_scores_.max(axis=1) > threshold_cutoff) - if max_features_cutoff is None: - mask = (self.stability_scores_.max(axis=1) > threshold_cutoff) - else: + max_features_cutoff = self.max_features if max_features is None else max_features + if max_features_cutoff is not None: exceed_counts = (self.stability_scores_ > threshold_cutoff).sum(axis=1) - max_features_cutoff = min(max_features_cutoff, (exceed_counts > 0).sum()) - feature_indices = (-exceed_counts).argsort()[:max_features_cutoff] - mask = np.zeros(n_features, dtype=np.bool) - mask[feature_indices] = True + if max_features_cutoff < (exceed_counts > 0).sum(): + 
feature_indices = (-exceed_counts).argsort()[:max_features_cutoff] + mask = np.zeros(n_features, dtype=np.bool) + mask[feature_indices] = True + return mask if not indices else np.where(mask)[0] def transform(self, X, threshold=None): diff --git a/stability_selection/tests/test_stability_selection.py b/stability_selection/tests/test_stability_selection.py index 34147c3..25adc5d 100644 --- a/stability_selection/tests/test_stability_selection.py +++ b/stability_selection/tests/test_stability_selection.py @@ -44,7 +44,7 @@ def _generate_dummy_classification_data(p=1000, n=1000, k=5, def test_stability_selection_classification(): n, p, k = 1000, 1000, 5 - X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) + X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k) selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1) selector.fit(X, y) @@ -59,7 +59,7 @@ def test_stability_selection_classification(): def test_stability_selection_regression(): n, p, k = 500, 1000, 5 - X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) + X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k) base_estimator = Pipeline([ ('scaler', StandardScaler()), @@ -81,7 +81,7 @@ def test_stability_selection_regression(): def test_with_complementary_pairs_bootstrap(): n, p, k = 500, 1000, 5 - X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) + X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k) base_estimator = Pipeline([ ('scaler', StandardScaler()), @@ -104,7 +104,7 @@ def test_with_complementary_pairs_bootstrap(): def test_with_stratified_bootstrap(): n, p, k = 1000, 1000, 5 - X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) + X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k) selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1, bootstrap_func='stratified') selector.fit(X, y) @@ -117,7 +117,7 @@ def 
test_with_stratified_bootstrap(): def test_different_shape(): n, p, k = 100, 200, 5 - X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) + X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k) base_estimator = Pipeline([ ('scaler', StandardScaler()), @@ -136,7 +136,7 @@ def test_different_shape(): def test_no_features(): n, p, k = 100, 200, 0 - X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) + X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k) base_estimator = Pipeline([ ('scaler', StandardScaler()), @@ -155,38 +155,50 @@ def test_no_features(): def test_stability_selection_max_features(): - n, p, k = 1000, 1000, 5 + n, p, k = 2000, 100, 5 lambda_grid=np.logspace(-5, -1, 25) - X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) - selector = StabilitySelection(lambda_grid=lambda_grid, - max_features=1, - verbose=0) + X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k) + selector = StabilitySelection(lambda_grid=lambda_grid, max_features=1) selector.fit(X, y) X_r = selector.transform(X) assert(X_r.shape == (n, 1)) - selector = StabilitySelection(lambda_grid=lambda_grid, - max_features=k, - verbose=0) + selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k) selector.fit(X, y) X_r = selector.transform(X) assert(X_r.shape == (n, k)) - selector = StabilitySelection(lambda_grid=lambda_grid, - max_features=k+1, - verbose=0) + selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k+1) selector.fit(X, y) X_r = selector.transform(X) assert(X_r.shape == (n, k)) - print('ok') + +@raises(ValueError) +def test_get_support_max_features_low(): + n, p, k = 500, 200, 5 + + X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k) + selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25)) + selector.fit(X, y) + selector.get_support(max_features=0) + + +@raises(ValueError) +def test_get_support_max_features_high(): + n, 
p, k = 500, 200, 5 + + X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k) + selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25)) + selector.fit(X, y) + selector.get_support(max_features=p+1) def test_stability_plot(): n, p, k = 500, 200, 5 - X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) + X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k) base_estimator = Pipeline([ ('scaler', StandardScaler()), From 1e75f79366f6637c26436805dcbd427e56c9ce3a Mon Sep 17 00:00:00 2001 From: Anton Ermakov Date: Sun, 17 Mar 2019 00:37:23 +0300 Subject: [PATCH 6/6] test stability --- stability_selection/tests/test_stability_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stability_selection/tests/test_stability_selection.py b/stability_selection/tests/test_stability_selection.py index 25adc5d..1498a7d 100644 --- a/stability_selection/tests/test_stability_selection.py +++ b/stability_selection/tests/test_stability_selection.py @@ -155,7 +155,7 @@ def test_no_features(): def test_stability_selection_max_features(): - n, p, k = 2000, 100, 5 + n, p, k = 1000, 1000, 5 lambda_grid=np.logspace(-5, -1, 25) X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)