From cf9bd6d423e85e77601d32bd77703d48be9fd908 Mon Sep 17 00:00:00 2001 From: Anton Ermakov Date: Fri, 15 Mar 2019 14:31:30 +0300 Subject: [PATCH 1/6] Support of parameters max_features (for `threshold` exceeding) and bootstrap_max_features (for `bootstrap_threshold` exceeding) while performing threshold check on bootstrap step (SelectFromModel invoke) and while performing final check (get_support method). Backward compatible with previous implementation --- stability_selection/stability_selection.py | 56 ++++++++++++++++++---- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/stability_selection/stability_selection.py b/stability_selection/stability_selection.py index 1f95af8..fa3c80d 100644 --- a/stability_selection/stability_selection.py +++ b/stability_selection/stability_selection.py @@ -63,10 +63,10 @@ def _bootstrap_generator(n_bootstrap_iterations, bootstrap_func, y, def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value, - threshold=None): + threshold=None, max_features=None): """ Fits base_estimator on a bootstrap sample of the original data, - and returns a mas of the variables that are selected by the fitted model. + and returns a mask of the variables that are selected by the fitted model. Parameters ---------- @@ -95,6 +95,11 @@ def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value, or implicitly (e.g, Lasso), the threshold used is 1e-5. Otherwise, "mean" is used by default. + max_features : int or None, optional + The maximum number of features selected scoring above ``threshold``. + To disable ``threshold`` and only select based on ``max_features``, + set ``threshold=-np.inf``. 
+ Returns ------- selected_variables : array-like, shape = [n_features] Boolean mask of selected variables. @@ -108,6 +113,7 @@ def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value, selector_model = _return_estimator_from_pipeline(base_estimator) variable_selector = SelectFromModel(estimator=selector_model, threshold=threshold, + max_features=max_features, prefit=True) return variable_selector.get_support() @@ -187,6 +193,9 @@ class StabilitySelection(BaseEstimator, TransformerMixin): threshold : float. Threshold defining the minimum cutoff value for the stability scores. + max_features : int or None, optional + The maximum number of features selected scoring above ``threshold``. + bootstrap_func : str or callable fun (default=bootstrap_without_replacement) The function used to subsample the data. This parameter can be: - A string, which must be one of @@ -208,6 +217,11 @@ class StabilitySelection(BaseEstimator, TransformerMixin): or implicitly (e.g, Lasso), the threshold used is 1e-5. Otherwise, "mean" is used by default. + bootstrap_max_features : int or None, optional + The maximum number of features selected scoring above ``bootstrap_threshold``. + To disable ``bootstrap_threshold`` and only select based on ``bootstrap_max_features``, + set ``bootstrap_threshold=-np.inf``. + verbose : integer. Controls the verbosity: the higher, the more messages. @@ -253,10 +267,13 @@ class StabilitySelection(BaseEstimator, TransformerMixin): of the Royal Statistical Society: Series B (Statistical Methodology), 75(1), pp.55-80. 
""" + def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name='C', lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100, - sample_fraction=0.5, threshold=0.6, bootstrap_func=bootstrap_without_replacement, - bootstrap_threshold=None, verbose=0, n_jobs=1, pre_dispatch='2*n_jobs', + sample_fraction=0.5, threshold=0.6, max_features=None, + bootstrap_func=bootstrap_without_replacement, + bootstrap_threshold=None, bootstrap_max_features=None, + verbose=0, n_jobs=1, pre_dispatch='2*n_jobs', random_state=None): self.base_estimator = base_estimator self.lambda_name = lambda_name @@ -264,8 +281,10 @@ def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name= self.n_bootstrap_iterations = n_bootstrap_iterations self.sample_fraction = sample_fraction self.threshold = threshold + self.max_features = max_features self.bootstrap_func = bootstrap_func self.bootstrap_threshold = bootstrap_threshold + self.bootstrap_max_features = bootstrap_max_features self.verbose = verbose self.n_jobs = n_jobs self.pre_dispatch = pre_dispatch @@ -282,6 +301,9 @@ def _validate_input(self): if not isinstance(self.threshold, float) or not (0.0 < self.threshold <= 1.0): raise ValueError('threshold should be a float in (0, 1], got %s' % self.threshold) + if self.max_features is not None and (not isinstance(self.max_features, int) or self.max_features < 1): + raise ValueError('max_features should be a positive int, got %s' % self.max_features) + if self.lambda_name not in self.base_estimator.get_params().keys(): raise ValueError('lambda_name is set to %s, but base_estimator %s ' 'does not have a parameter ' @@ -340,7 +362,8 @@ def fit(self, X, y): y=y[subsample], lambda_name=self.lambda_name, lambda_value=lambda_value, - threshold=self.bootstrap_threshold) + threshold=self.bootstrap_threshold, + max_features=self.bootstrap_max_features) for subsample in bootstrap_samples) stability_scores[:, idx] = np.vstack(selected_variables).mean(axis=0) @@ 
-348,7 +371,7 @@ def fit(self, X, y): self.stability_scores_ = stability_scores return self - def get_support(self, indices=False, threshold=None): + def get_support(self, indices=False, threshold=None, max_features=None): """Get a mask, or integer index, of the features selected Parameters @@ -361,6 +384,9 @@ def get_support(self, indices=False, threshold=None): Threshold defining the minimum cutoff value for the stability scores. + max_features : int or None, optional + The maximum number of features selected scoring above ``threshold``. + Returns ------- support : array @@ -377,9 +403,23 @@ def get_support(self, indices=False, threshold=None): raise ValueError('threshold should be a float in (0, 1], ' 'got %s' % self.threshold) - cutoff = self.threshold if threshold is None else threshold - mask = (self.stability_scores_.max(axis=1) > cutoff) + n_features = self.stability_scores_.shape[0] + if max_features is not None and (not isinstance(max_features, int) + or not(1 <= max_features <= n_features)): + raise ValueError('max_features should be an int in [1, %s], ' + 'got %s' % (n_features, max_features)) + threshold_cutoff = self.threshold if threshold is None else threshold + max_features_cutoff = self.max_features if max_features is None else max_features + + if max_features_cutoff is None: + mask = (self.stability_scores_.max(axis=1) > threshold_cutoff) + else: + exceed_counts = (self.stability_scores_ > threshold_cutoff).sum(axis=1) + max_features_cutoff = min(max_features_cutoff, (exceed_counts > 0).sum()) + feature_indices = (-exceed_counts).argsort()[:max_features_cutoff] + mask = np.zeros(n_features, dtype=np.bool) + mask[feature_indices] = True return mask if not indices else np.where(mask)[0] def transform(self, X, threshold=None): From 1b1530728d0b79c924f42aee8341a81ee19d1b22 Mon Sep 17 00:00:00 2001 From: Anton Ermakov Date: Fri, 15 Mar 2019 15:42:05 +0300 Subject: [PATCH 2/6] tests added --- stability_selection/tests/test_common.py | 5 ++++ 
.../tests/test_stability_selection.py | 29 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/stability_selection/tests/test_common.py b/stability_selection/tests/test_common.py index 7ef0995..b65083d 100644 --- a/stability_selection/tests/test_common.py +++ b/stability_selection/tests/test_common.py @@ -71,3 +71,8 @@ def test_sample_fraction(): @raises(ValueError) def test_lambda_name(): StabilitySelection(lambda_name='n_estimators')._validate_input() + + +@raises(ValueError) +def test_max_features_zero(): + StabilitySelection(max_features=0)._validate_input() diff --git a/stability_selection/tests/test_stability_selection.py b/stability_selection/tests/test_stability_selection.py index 4193d29..8af30fe 100644 --- a/stability_selection/tests/test_stability_selection.py +++ b/stability_selection/tests/test_stability_selection.py @@ -154,6 +154,35 @@ def test_no_features(): np.empty(0).reshape((X.shape[0], 0))) +def test_stability_selection_max_features(): + n, p, k = 1000, 1000, 5 + lambda_grid=np.logspace(-5, -1, 25) + + X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) + selector = StabilitySelection(lambda_grid=lambda_grid, + max_features=1, + verbose=1) + selector.fit(X, y) + X_r = selector.transform(X) + assert(X_r.shape == (n, 1)) + + selector = StabilitySelection(lambda_grid=lambda_grid, + max_features=k, + verbose=1) + selector.fit(X, y) + X_r = selector.transform(X) + assert(X_r.shape == (n, k)) + + selector = StabilitySelection(lambda_grid=lambda_grid, + max_features=k+1, + verbose=1) + selector.fit(X, y) + X_r = selector.transform(X) + assert(X_r.shape == (n, k)) + + print('ok') + + def test_stability_plot(): n, p, k = 500, 200, 5 From c5728b3ebb2a957ada094bb325b20416e1418648 Mon Sep 17 00:00:00 2001 From: Anton Ermakov Date: Fri, 15 Mar 2019 23:43:14 +0300 Subject: [PATCH 3/6] reducing verbose level to satisfy travis CI log limit (10000 lines) --- stability_selection/tests/test_stability_selection.py | 6 +++--- 1
file changed, 3 insertions(+), 3 deletions(-) diff --git a/stability_selection/tests/test_stability_selection.py b/stability_selection/tests/test_stability_selection.py index 8af30fe..34147c3 100644 --- a/stability_selection/tests/test_stability_selection.py +++ b/stability_selection/tests/test_stability_selection.py @@ -161,21 +161,21 @@ def test_stability_selection_max_features(): X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) selector = StabilitySelection(lambda_grid=lambda_grid, max_features=1, - verbose=1) + verbose=0) selector.fit(X, y) X_r = selector.transform(X) assert(X_r.shape == (n, 1)) selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k, - verbose=1) + verbose=0) selector.fit(X, y) X_r = selector.transform(X) assert(X_r.shape == (n, k)) selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k+1, - verbose=1) + verbose=0) selector.fit(X, y) X_r = selector.transform(X) assert(X_r.shape == (n, k)) From bfa331fded6633e9884b000b985e819446923d91 Mon Sep 17 00:00:00 2001 From: Anton Ermakov Date: Sat, 16 Mar 2019 00:00:40 +0300 Subject: [PATCH 4/6] explicit LogisticRegression 'liblinear' solver (l1) to suppress warnings and satisfy travis CI log limits --- stability_selection/stability_selection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stability_selection/stability_selection.py b/stability_selection/stability_selection.py index fa3c80d..0bb6f9e 100644 --- a/stability_selection/stability_selection.py +++ b/stability_selection/stability_selection.py @@ -268,8 +268,8 @@ class StabilitySelection(BaseEstimator, TransformerMixin): 75(1), pp.55-80. 
""" - def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name='C', - lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100, + def __init__(self, base_estimator=LogisticRegression(penalty='l1', solver='liblinear'), + lambda_name='C', lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100, sample_fraction=0.5, threshold=0.6, max_features=None, bootstrap_func=bootstrap_without_replacement, bootstrap_threshold=None, bootstrap_max_features=None, From be7decf8230adcc8cefc2a30df7b5ae0cace2b98 Mon Sep 17 00:00:00 2001 From: Anton Ermakov Date: Sat, 16 Mar 2019 01:00:47 +0300 Subject: [PATCH 5/6] small refactor + test fixes --- stability_selection/stability_selection.py | 16 +++--- .../tests/test_stability_selection.py | 50 ++++++++++++------- 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/stability_selection/stability_selection.py b/stability_selection/stability_selection.py index 0bb6f9e..6e31bee 100644 --- a/stability_selection/stability_selection.py +++ b/stability_selection/stability_selection.py @@ -410,16 +410,16 @@ def get_support(self, indices=False, threshold=None, max_features=None): 'got %s' % (n_features, max_features)) threshold_cutoff = self.threshold if threshold is None else threshold - max_features_cutoff = self.max_features if max_features is None else max_features + mask = (self.stability_scores_.max(axis=1) > threshold_cutoff) - if max_features_cutoff is None: - mask = (self.stability_scores_.max(axis=1) > threshold_cutoff) - else: + max_features_cutoff = self.max_features if max_features is None else max_features + if max_features_cutoff is not None: exceed_counts = (self.stability_scores_ > threshold_cutoff).sum(axis=1) - max_features_cutoff = min(max_features_cutoff, (exceed_counts > 0).sum()) - feature_indices = (-exceed_counts).argsort()[:max_features_cutoff] - mask = np.zeros(n_features, dtype=np.bool) - mask[feature_indices] = True + if max_features_cutoff < (exceed_counts > 0).sum(): + 
feature_indices = (-exceed_counts).argsort()[:max_features_cutoff] + mask = np.zeros(n_features, dtype=np.bool) + mask[feature_indices] = True + return mask if not indices else np.where(mask)[0] def transform(self, X, threshold=None): diff --git a/stability_selection/tests/test_stability_selection.py b/stability_selection/tests/test_stability_selection.py index 34147c3..25adc5d 100644 --- a/stability_selection/tests/test_stability_selection.py +++ b/stability_selection/tests/test_stability_selection.py @@ -44,7 +44,7 @@ def _generate_dummy_classification_data(p=1000, n=1000, k=5, def test_stability_selection_classification(): n, p, k = 1000, 1000, 5 - X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) + X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k) selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1) selector.fit(X, y) @@ -59,7 +59,7 @@ def test_stability_selection_classification(): def test_stability_selection_regression(): n, p, k = 500, 1000, 5 - X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) + X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k) base_estimator = Pipeline([ ('scaler', StandardScaler()), @@ -81,7 +81,7 @@ def test_stability_selection_regression(): def test_with_complementary_pairs_bootstrap(): n, p, k = 500, 1000, 5 - X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) + X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k) base_estimator = Pipeline([ ('scaler', StandardScaler()), @@ -104,7 +104,7 @@ def test_with_complementary_pairs_bootstrap(): def test_with_stratified_bootstrap(): n, p, k = 1000, 1000, 5 - X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) + X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k) selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1, bootstrap_func='stratified') selector.fit(X, y) @@ -117,7 +117,7 @@ def 
test_with_stratified_bootstrap(): def test_different_shape(): n, p, k = 100, 200, 5 - X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) + X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k) base_estimator = Pipeline([ ('scaler', StandardScaler()), @@ -136,7 +136,7 @@ def test_different_shape(): def test_no_features(): n, p, k = 100, 200, 0 - X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) + X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k) base_estimator = Pipeline([ ('scaler', StandardScaler()), @@ -155,38 +155,50 @@ def test_no_features(): def test_stability_selection_max_features(): - n, p, k = 1000, 1000, 5 + n, p, k = 2000, 100, 5 lambda_grid=np.logspace(-5, -1, 25) - X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) - selector = StabilitySelection(lambda_grid=lambda_grid, - max_features=1, - verbose=0) + X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k) + selector = StabilitySelection(lambda_grid=lambda_grid, max_features=1) selector.fit(X, y) X_r = selector.transform(X) assert(X_r.shape == (n, 1)) - selector = StabilitySelection(lambda_grid=lambda_grid, - max_features=k, - verbose=0) + selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k) selector.fit(X, y) X_r = selector.transform(X) assert(X_r.shape == (n, k)) - selector = StabilitySelection(lambda_grid=lambda_grid, - max_features=k+1, - verbose=0) + selector = StabilitySelection(lambda_grid=lambda_grid, max_features=k+1) selector.fit(X, y) X_r = selector.transform(X) assert(X_r.shape == (n, k)) - print('ok') + +@raises(ValueError) +def test_get_support_max_features_low(): + n, p, k = 500, 200, 5 + + X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k) + selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25)) + selector.fit(X, y) + selector.get_support(max_features=0) + + +@raises(ValueError) +def test_get_support_max_features_high(): + n, 
p, k = 500, 200, 5 + + X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k) + selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25)) + selector.fit(X, y) + selector.get_support(max_features=p+1) def test_stability_plot(): n, p, k = 500, 200, 5 - X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) + X, y, important_betas = _generate_dummy_regression_data(n=n, p=p, k=k) base_estimator = Pipeline([ ('scaler', StandardScaler()), From 1e75f79366f6637c26436805dcbd427e56c9ce3a Mon Sep 17 00:00:00 2001 From: Anton Ermakov Date: Sun, 17 Mar 2019 00:37:23 +0300 Subject: [PATCH 6/6] test stability --- stability_selection/tests/test_stability_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stability_selection/tests/test_stability_selection.py b/stability_selection/tests/test_stability_selection.py index 25adc5d..1498a7d 100644 --- a/stability_selection/tests/test_stability_selection.py +++ b/stability_selection/tests/test_stability_selection.py @@ -155,7 +155,7 @@ def test_no_features(): def test_stability_selection_max_features(): - n, p, k = 2000, 100, 5 + n, p, k = 1000, 1000, 5 lambda_grid=np.logspace(-5, -1, 25) X, y, important_betas = _generate_dummy_classification_data(n=n, p=p, k=k)