
Commit cb715d2

ENH Add optional positivity constraint in L1, WeightedL1 and L1_plus_L2 (#110)
Co-authored-by: Badr-MOUFAD <[email protected]>
1 parent d0536dc commit cb715d2
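
For reference, with ``positive=True`` the estimators solve the same problem as before with an added nonnegativity constraint; for the plain Lasso this reads

    min_w  1 / (2 * n_samples) * ||y - X w||_2^2 + alpha * ||w||_1    subject to   w_j >= 0  for all j,

and the weighted and elastic net variants constrain their own objectives in the same way. Concretely (see the diffs below), the constraint is folded into the penalty: soft-thresholding becomes one-sided and the subdifferential distance used for working-set optimality checks gains a positive branch.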

4 files changed, +107 -49 lines changed

skglm/estimators.py

Lines changed: 25 additions & 12 deletions

@@ -314,6 +314,9 @@ class Lasso(LinearModel, RegressorMixin):
     tol : float, optional
         Stopping criterion for the optimization.
 
+    positive : bool, optional
+        When set to ``True``, forces the coefficient vector to be positive.
+
     fit_intercept : bool, optional (default=True)
         Whether or not to fit an intercept.
 
@@ -345,14 +348,16 @@ class Lasso(LinearModel, RegressorMixin):
     """
 
     def __init__(self, alpha=1., max_iter=50, max_epochs=50_000, p0=10, verbose=0,
-                 tol=1e-4, fit_intercept=True, warm_start=False, ws_strategy="subdiff"):
+                 tol=1e-4, positive=False, fit_intercept=True, warm_start=False,
+                 ws_strategy="subdiff"):
         super().__init__()
         self.alpha = alpha
         self.tol = tol
         self.max_iter = max_iter
         self.max_epochs = max_epochs
         self.p0 = p0
         self.ws_strategy = ws_strategy
+        self.positive = positive
         self.fit_intercept = fit_intercept
         self.warm_start = warm_start
         self.verbose = verbose

@@ -378,7 +383,7 @@ def fit(self, X, y):
             self.max_iter, self.max_epochs, self.p0, tol=self.tol,
             ws_strategy=self.ws_strategy, fit_intercept=self.fit_intercept,
             warm_start=self.warm_start, verbose=self.verbose)
-        return _glm_fit(X, y, self, Quadratic(), L1(self.alpha), solver)
+        return _glm_fit(X, y, self, Quadratic(), L1(self.alpha, self.positive), solver)
 
     def path(self, X, y, alphas, coef_init=None, return_n_iter=True, **params):
         """Compute Lasso path.

@@ -417,7 +422,7 @@ def path(self, X, y, alphas, coef_init=None, return_n_iter=True, **params):
         n_iters : array, shape (n_alphas,), optional
             The number of iterations along the path. If return_n_iter is set to `True`.
         """
-        penalty = compiled_clone(L1(self.alpha))
+        penalty = compiled_clone(L1(self.alpha, self.positive))
         datafit = compiled_clone(Quadratic(), to_float32=X.dtype == np.float32)
         solver = AndersonCD(
             self.max_iter, self.max_epochs, self.p0, tol=self.tol,

@@ -457,6 +462,9 @@ class WeightedLasso(LinearModel, RegressorMixin):
     tol : float, optional
         Stopping criterion for the optimization.
 
+    positive : bool, optional
+        When set to ``True``, forces the coefficient vector to be positive.
+
     fit_intercept : bool, optional (default=True)
         Whether or not to fit an intercept.

@@ -492,8 +500,8 @@ class WeightedLasso(LinearModel, RegressorMixin):
     """
 
     def __init__(self, alpha=1., weights=None, max_iter=50, max_epochs=50_000, p0=10,
-                 verbose=0, tol=1e-4, fit_intercept=True, warm_start=False,
-                 ws_strategy="subdiff"):
+                 verbose=0, tol=1e-4, positive=False, fit_intercept=True,
+                 warm_start=False, ws_strategy="subdiff"):
         super().__init__()
         self.alpha = alpha
         self.weights = weights

@@ -502,6 +510,7 @@ def __init__(self, alpha=1., weights=None, max_iter=50, max_epochs=50_000, p0=10
         self.max_epochs = max_epochs
         self.p0 = p0
         self.ws_strategy = ws_strategy
+        self.positive = positive
         self.fit_intercept = fit_intercept
         self.warm_start = warm_start
         self.verbose = verbose

@@ -548,7 +557,7 @@ def path(self, X, y, alphas, coef_init=None, return_n_iter=True, **params):
             raise ValueError("The number of weights must match the number of \
                 features. Got %s, expected %s." % (
                     len(weights), X.shape[1]))
-        penalty = compiled_clone(WeightedL1(self.alpha, weights))
+        penalty = compiled_clone(WeightedL1(self.alpha, weights, self.positive))
         datafit = compiled_clone(Quadratic(), to_float32=X.dtype == np.float32)
         solver = AndersonCD(
             self.max_iter, self.max_epochs, self.p0, tol=self.tol,

@@ -574,9 +583,9 @@ def fit(self, X, y):
         """
         if self.weights is None:
             warnings.warn('Weights are not provided, fitting with Lasso penalty')
-            penalty = L1(self.alpha)
+            penalty = L1(self.alpha, self.positive)
         else:
-            penalty = WeightedL1(self.alpha, self.weights)
+            penalty = WeightedL1(self.alpha, self.weights, self.positive)
         solver = AndersonCD(
             self.max_iter, self.max_epochs, self.p0, tol=self.tol,
             ws_strategy=self.ws_strategy, fit_intercept=self.fit_intercept,

@@ -618,6 +627,9 @@ class ElasticNet(LinearModel, RegressorMixin):
     tol : float, optional
         Stopping criterion for the optimization.
 
+    positive : bool, optional
+        When set to ``True``, forces the coefficient vector to be positive.
+
     fit_intercept : bool, optional (default=True)
         Whether or not to fit an intercept.

@@ -648,8 +660,8 @@ class ElasticNet(LinearModel, RegressorMixin):
     """
 
     def __init__(self, alpha=1., l1_ratio=0.5, max_iter=50, max_epochs=50_000, p0=10,
-                 verbose=0, tol=1e-4, fit_intercept=True, warm_start=False,
-                 ws_strategy="subdiff"):
+                 verbose=0, tol=1e-4, positive=False, fit_intercept=True,
+                 warm_start=False, ws_strategy="subdiff"):
         super().__init__()
         self.alpha = alpha
         self.l1_ratio = l1_ratio

@@ -659,6 +671,7 @@ def __init__(self, alpha=1., l1_ratio=0.5, max_iter=50, max_epochs=50_000, p0=10
         self.p0 = p0
         self.ws_strategy = ws_strategy
         self.fit_intercept = fit_intercept
+        self.positive = positive
         self.warm_start = warm_start
         self.verbose = verbose
 
@@ -699,7 +712,7 @@ def path(self, X, y, alphas, coef_init=None, return_n_iter=True, **params):
         n_iters : array, shape (n_alphas,), optional
             The number of iterations along the path. If return_n_iter is set to `True`.
         """
-        penalty = compiled_clone(L1_plus_L2(self.alpha, self.l1_ratio))
+        penalty = compiled_clone(L1_plus_L2(self.alpha, self.l1_ratio, self.positive))
         datafit = compiled_clone(Quadratic(), to_float32=X.dtype == np.float32)
         solver = AndersonCD(
             self.max_iter, self.max_epochs, self.p0, tol=self.tol,

@@ -728,7 +741,7 @@ def fit(self, X, y):
             ws_strategy=self.ws_strategy, fit_intercept=self.fit_intercept,
             warm_start=self.warm_start, verbose=self.verbose)
         return _glm_fit(X, y, self, Quadratic(),
-                        L1_plus_L2(self.alpha, self.l1_ratio), solver)
+                        L1_plus_L2(self.alpha, self.l1_ratio, self.positive), solver)
 
 
 class MCPRegression(LinearModel, RegressorMixin):
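
With these changes the constraint can be requested directly from the estimators. A minimal usage sketch, not part of the diff: the data and hyperparameters are invented for illustration, and it assumes ``Lasso`` is importable from ``skglm`` as in the library's examples.

    import numpy as np
    from skglm import Lasso

    rng = np.random.default_rng(0)
    X = rng.standard_normal((50, 30))
    y = X @ np.maximum(rng.standard_normal(30), 0) + 0.1 * rng.standard_normal(50)

    # with positive=True the fitted coefficients are constrained to be nonnegative
    clf = Lasso(alpha=0.1, positive=True).fit(X, y)
    print((clf.coef_ >= 0).all())  # expected: True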

skglm/penalties/separable.py

Lines changed: 72 additions & 34 deletions

@@ -10,37 +10,48 @@
 class L1(BasePenalty):
     """L1 penalty."""
 
-    def __init__(self, alpha):
+    def __init__(self, alpha, positive=False):
         self.alpha = alpha
+        self.positive = positive
 
     def get_spec(self):
         spec = (
             ('alpha', float64),
+            ('positive', bool_),
         )
         return spec
 
     def params_to_dict(self):
-        return dict(alpha=self.alpha)
+        return dict(alpha=self.alpha, positive=self.positive)
 
     def value(self, w):
         """Compute L1 penalty value."""
         return self.alpha * np.sum(np.abs(w))
 
     def prox_1d(self, value, stepsize, j):
         """Compute proximal operator of the L1 penalty (soft-thresholding operator)."""
-        return ST(value, self.alpha * stepsize)
+        return ST(value, self.alpha * stepsize, self.positive)
 
     def subdiff_distance(self, w, grad, ws):
         """Compute distance of negative gradient to the subdifferential at w."""
         subdiff_dist = np.zeros_like(grad)
         for idx, j in enumerate(ws):
-            if w[j] == 0:
-                # distance of - grad_j to [-alpha, alpha]
-                subdiff_dist[idx] = max(0, np.abs(grad[idx]) - self.alpha)
+            if self.positive:
+                if w[j] < 0:
+                    subdiff_dist[idx] = np.inf
+                elif w[j] == 0:
+                    # distance of -grad_j to (-infty, alpha]
+                    subdiff_dist[idx] = max(0, -grad[idx] - self.alpha)
+                else:
+                    # distance of -grad_j to {alpha}
+                    subdiff_dist[idx] = np.abs(grad[idx] + self.alpha)
             else:
-                # distance of - grad_j to alpha * sign(w[j])
-                subdiff_dist[idx] = np.abs(
-                    - grad[idx] - np.sign(w[j]) * self.alpha)
+                if w[j] == 0:
+                    # distance of -grad_j to [-alpha, alpha]
+                    subdiff_dist[idx] = max(0, np.abs(grad[idx]) - self.alpha)
+                else:
+                    # distance of -grad_j to {alpha * sign(w[j])}
+                    subdiff_dist[idx] = np.abs(grad[idx] + np.sign(w[j]) * self.alpha)
         return subdiff_dist
 
     def is_penalized(self, n_features):

@@ -59,20 +70,21 @@ def alpha_max(self, gradient0):
 class L1_plus_L2(BasePenalty):
     """L1 + L2 penalty (aka ElasticNet penalty)."""
 
-    def __init__(self, alpha, l1_ratio):
+    def __init__(self, alpha, l1_ratio, positive=False):
         self.alpha = alpha
         self.l1_ratio = l1_ratio
+        self.positive = positive
 
     def get_spec(self):
         spec = (
             ('alpha', float64),
             ('l1_ratio', float64),
+            ('positive', bool_),
         )
         return spec
 
     def params_to_dict(self):
-        return dict(alpha=self.alpha,
-                    l1_ratio=self.l1_ratio)
+        return dict(alpha=self.alpha, l1_ratio=self.l1_ratio, positive=self.positive)
 
     def value(self, w):
         """Compute the L1 + L2 penalty value."""

@@ -82,25 +94,38 @@ def value(self, w):
 
     def prox_1d(self, value, stepsize, j):
         """Compute the proximal operator (scaled soft-thresholding)."""
-        prox = ST(value, self.l1_ratio * self.alpha * stepsize)
+        prox = ST(value, self.l1_ratio * self.alpha * stepsize, self.positive)
         prox /= (1 + stepsize * (1 - self.l1_ratio) * self.alpha)
         return prox
 
     def subdiff_distance(self, w, grad, ws):
         """Compute distance of negative gradient to the subdifferential at w."""
         subdiff_dist = np.zeros_like(grad)
+        alpha = self.alpha
+        l1_ratio = self.l1_ratio
+
         for idx, j in enumerate(ws):
-            if w[j] == 0:
-                # distance of - grad_j to alpha * l1_ratio * [-1, 1]
-                subdiff_dist[idx] = max(
-                    0, np.abs(grad[idx]) - self.alpha * self.l1_ratio)
+            if self.positive:
+                if w[j] < 0:
+                    subdiff_dist[idx] = np.inf
+                elif w[j] == 0:
+                    # distance of -grad_j to (-infty, alpha * l1_ratio]
+                    subdiff_dist[idx] = max(0, -grad[idx] - alpha * l1_ratio)
+                else:
+                    # distance of -grad_j to alpha * {l1_ratio + (1 - l1_ratio) * w[j]}
+                    subdiff_dist[idx] = np.abs(
+                        grad[idx] + alpha * (l1_ratio
+                                             + (1 - l1_ratio) * w[j]))
             else:
-                # distance of - grad_j to alpha * l1_ratio * sign(w[j]) +
-                # alpha * (1 - l1_ratio) * w[j]
-                subdiff_dist[idx] = np.abs(
-                    - grad[idx] -
-                    self.alpha * (self.l1_ratio *
-                                  np.sign(w[j]) + (1 - self.l1_ratio) * w[j]))
+                if w[j] == 0:
+                    # distance of -grad_j to alpha * l1_ratio * [-1, 1]
+                    subdiff_dist[idx] = max(0, np.abs(grad[idx]) - alpha * l1_ratio)
+                else:
+                    # distance of -grad_j to
+                    # {alpha * (l1_ratio * sign(w[j]) + (1 - l1_ratio) * w[j])}
+                    subdiff_dist[idx] = np.abs(
+                        grad[idx] + alpha * (l1_ratio * np.sign(w[j])
+                                             + (1 - l1_ratio) * w[j]))
         return subdiff_dist
 
     def is_penalized(self, n_features):

@@ -119,41 +144,54 @@ def alpha_max(self, gradient0):
 class WeightedL1(BasePenalty):
     """Weighted L1 penalty."""
 
-    def __init__(self, alpha, weights):
+    def __init__(self, alpha, weights, positive=False):
         self.alpha = alpha
         self.weights = weights.astype(np.float64)
+        self.positive = positive
 
     def get_spec(self):
         spec = (
             ('alpha', float64),
             ('weights', float64[:]),
+            ('positive', bool_),
         )
         return spec
 
     def params_to_dict(self):
-        return dict(alpha=self.alpha,
-                    weights=self.weights)
+        return dict(alpha=self.alpha, weights=self.weights, positive=self.positive)
 
     def value(self, w):
         """Compute the weighted L1 penalty."""
         return self.alpha * np.sum(np.abs(w) * self.weights)
 
     def prox_1d(self, value, stepsize, j):
         """Compute the proximal operator of weighted L1 (weighted soft-thresholding)."""
-        return ST(value, self.alpha * stepsize * self.weights[j])
+        return ST(value, self.alpha * stepsize * self.weights[j], self.positive)
 
     def subdiff_distance(self, w, grad, ws):
         """Compute distance of negative gradient to the subdifferential at w."""
         subdiff_dist = np.zeros_like(grad)
+        alpha = self.alpha
+        weights = self.weights
+
         for idx, j in enumerate(ws):
-            if w[j] == 0:
-                # distance of - grad_j to alpha * weights[j] * [-1, 1]
-                subdiff_dist[idx] = max(
-                    0, np.abs(grad[idx]) - self.alpha * self.weights[j])
+            if self.positive:
+                if w[j] < 0:
+                    subdiff_dist[idx] = np.inf
+                elif w[j] == 0:
+                    # distance of -grad_j to (-infty, alpha * weights[j]]
+                    subdiff_dist[idx] = max(0, -grad[idx] - alpha * weights[j])
+                else:
+                    # distance of -grad_j to {alpha * weights[j]}
+                    subdiff_dist[idx] = np.abs(grad[idx] + alpha * weights[j])
             else:
-                # distance of - grad_j to alpha * weights[j] * sign(w[j])
-                subdiff_dist[idx] = np.abs(
-                    - grad[idx] - self.alpha * self.weights[j] * np.sign(w[j]))
+                if w[j] == 0:
+                    # distance of -grad_j to alpha * weights[j] * [-1, 1]
+                    subdiff_dist[idx] = max(0, np.abs(grad[idx]) - alpha * weights[j])
+                else:
+                    # distance of -grad_j to {alpha * weights[j] * sign(w[j])}
+                    subdiff_dist[idx] = np.abs(
+                        grad[idx] + alpha * weights[j] * np.sign(w[j]))
         return subdiff_dist
 
     def is_penalized(self, n_features):
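
The positive branches above implement the optimality conditions of the nonnegativity-constrained problem: at ``w[j] == 0`` optimality requires ``-grad[j] <= alpha`` (hence the distance to the half-line ``(-infty, alpha]``), at ``w[j] > 0`` it requires ``-grad[j] == alpha``, and ``w[j] < 0`` is infeasible, hence the infinite distance. A small illustrative check, not from the commit; it assumes ``L1`` is importable from ``skglm.penalties``.

    import numpy as np
    from skglm.penalties import L1

    pen = L1(alpha=1., positive=True)
    w = np.array([0., 0.5, -0.1])
    grad = np.array([-2., -1., 0.])   # datafit gradient restricted to the working set
    ws = np.array([0, 1, 2])          # working set: all three features

    print(pen.subdiff_distance(w, grad, ws))
    # expected: [1., 0., inf] -- feature 0 violates -grad <= alpha by 1,
    # feature 1 satisfies -grad == alpha, feature 2 is infeasible (w < 0)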

skglm/tests/test_estimators.py

Lines changed: 8 additions & 1 deletion

@@ -99,20 +99,27 @@ def test_check_estimator(estimator_name):
 @pytest.mark.parametrize("estimator_name", dict_estimators_ours.keys())
 @pytest.mark.parametrize('X', [X, X_sparse])
 @pytest.mark.parametrize('fit_intercept', [True, False])
-def test_estimator(estimator_name, X, fit_intercept):
+@pytest.mark.parametrize('positive', [True, False])
+def test_estimator(estimator_name, X, fit_intercept, positive):
     if estimator_name == "GeneralizedLinearEstimator":
         pytest.skip()
     if fit_intercept and estimator_name == "LogisticRegression":
         pytest.xfail("sklearn LogisticRegression does not support intercept.")
     if fit_intercept and estimator_name == "SVC":
         pytest.xfail("Intercept is not supported for SVC.")
+    if positive and estimator_name not in ("Lasso", "ElasticNet", "WeightedLasso"):
+        pytest.xfail("`positive` option is only supported by L1, L1_plus_L2 and wL1.")
 
     estimator_sk = clone(dict_estimators_sk[estimator_name])
     estimator_ours = clone(dict_estimators_ours[estimator_name])
 
     estimator_sk.set_params(fit_intercept=fit_intercept)
     estimator_ours.set_params(fit_intercept=fit_intercept)
 
+    if positive:
+        estimator_sk.set_params(positive=positive)
+        estimator_ours.set_params(positive=positive)
+
     estimator_sk.fit(X, y)
     estimator_ours.fit(X, y)
     coef_sk = estimator_sk.coef_
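
The new ``positive`` leg of the test works because scikit-learn's ``Lasso`` and ``ElasticNet`` also accept ``positive=True``, so the two implementations can be compared coefficient by coefficient. A sketch of that comparison outside pytest, with invented data and an illustrative tolerance:

    import numpy as np
    from sklearn.linear_model import Lasso as Lasso_sklearn
    from skglm import Lasso

    rng = np.random.default_rng(42)
    X = rng.standard_normal((100, 20))
    y = X @ np.abs(rng.standard_normal(20)) + 0.01 * rng.standard_normal(100)

    params = dict(alpha=0.05, positive=True, fit_intercept=True, tol=1e-10)
    coef_sk = Lasso_sklearn(**params).fit(X, y).coef_
    coef_ours = Lasso(**params).fit(X, y).coef_

    # both solvers should agree up to their tolerances
    np.testing.assert_allclose(coef_ours, coef_sk, atol=1e-5)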

skglm/utils/prox_funcs.py

Lines changed: 2 additions & 2 deletions

@@ -4,11 +4,11 @@
 
 
 @njit
-def ST(x, u):
+def ST(x, u, positive=False):
     """Soft-thresholding of scalar x at level u."""
     if x > u:
         return x - u
-    elif x < - u:
+    elif x < - u and not positive:
         return x + u
     else:
         return 0.
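
With ``positive=True``, ``ST`` becomes the proximal operator of ``u * |x|`` plus the indicator of the nonnegative half-line, i.e. ``max(x - u, 0)``. A standalone reference implementation for comparison, a sketch rather than part of the commit:

    import numpy as np

    def st_reference(x, u, positive=False):
        """Reference soft-thresholding: one-sided when positive=True."""
        if positive:
            return max(x - u, 0.)
        return np.sign(x) * max(abs(x) - u, 0.)   # classic soft-thresholding

    for x in (-2., -0.5, 0.3, 1.5):
        print(x, st_reference(x, 1.0), st_reference(x, 1.0, positive=True))
    # e.g. x = -2. maps to -1. without the constraint and to 0. with it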
