ENH add Square root Lasso (#57)

Badr-MOUFAD · mathurinm · web-flow · commit 3d1f52436059 · 2022-10-11T15:58:56.000+02:00
Co-authored-by: mathurinm <mathurin.massias@gmail.com>
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -24,5 +24,6 @@ jobs:
         pip install pytest
         pip install numpydoc
         pip install .
+        pip install statsmodels cvxopt
     - name: Test with pytest
       run: pytest -v skglm/
diff --git a/doc/api.rst b/doc/api.rst
@@ -73,4 +73,14 @@ Solvers
    GroupBCD
    MultiTaskBCD
    ProxNewton
-   
+
+
+Experimental
+============
+
+.. currentmodule:: skglm.experimental
+
+.. autosummary::
+   :toctree: generated/
+
+   SqrtLasso
diff --git a/doc/changes/0.2.rst b/doc/changes/0.2.rst
@@ -1,7 +1,9 @@
 .. _changes_0_2:
 
 Version 0.2 (in progress)
-------------------------
+-------------------------
+
+- Experimental :ref:`Square root Lasso <skglm.experimental.SqrtLasso>` class with ProxNewton or Chambolle-Pock solver (PR :gh:`57`)
 
 - Accelerated block coordinate descent solver :ref:`GroupBCD <skglm.solvers.GroupBCD>` with working sets for problems with group penalties (PR :gh:`29`, :gh:`28`, and :gh:`26`)
 
diff --git a/skglm/experimental/__init__.py b/skglm/experimental/__init__.py
@@ -0,0 +1,5 @@
+from .sqrt_lasso import SqrtLasso
+
+__all__ = [
+    "SqrtLasso",  # entries must be strings for ``from ... import *`` to work
+]
diff --git a/skglm/experimental/_plot_sqrt_lasso.py b/skglm/experimental/_plot_sqrt_lasso.py
@@ -0,0 +1,34 @@
+# Plot the suboptimality of Chambolle-Pock iterates on a square root Lasso problem.
+import numpy as np
+from numpy.linalg import norm
+import matplotlib.pyplot as plt
+from skglm.utils import make_correlated_data
+from skglm.experimental.sqrt_lasso import SqrtLasso, _chambolle_pock_sqrt
+
+X, y, _ = make_correlated_data(n_samples=200, n_features=100, random_state=24)
+
+n_samples, n_features = X.shape
+alpha_max = norm(X.T @ y, ord=np.inf) / (norm(y) * np.sqrt(n_samples))
+
+alpha = alpha_max / 10
+
+# the Chambolle-Pock objective is recorded every ``obj_freq`` iterations
+max_iter = 1000
+obj_freq = 10
+w, _, objs = _chambolle_pock_sqrt(X, y, alpha, max_iter=max_iter, obj_freq=obj_freq)
+
+
+# no convergence issue if n_features < n_samples (the case here), so the
+# ProxNewton-based SqrtLasso can be used to compute a reference solution
+clf = SqrtLasso(alpha=alpha, verbose=2, tol=1e-10)
+clf.fit(X, y)
+
+# treat the ProxNewton solution as the optimum and evaluate its objective
+w_star = clf.coef_
+p_star = norm(X @ w_star - y) / np.sqrt(n_samples) + alpha * norm(w_star, ord=1)
+
+plt.close("all")
+plt.semilogy(np.arange(1, max_iter+1, obj_freq), np.array(objs) - p_star)
+plt.xlabel("CP iteration")
+plt.ylabel("$F(x) - F(x^*)$")
+plt.show(block=False)
diff --git a/skglm/experimental/sqrt_lasso.py b/skglm/experimental/sqrt_lasso.py
@@ -0,0 +1,231 @@
+import warnings
+import numpy as np
+from numpy.linalg import norm
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.linear_model._base import LinearModel, RegressorMixin
+
+from skglm.penalties import L1
+from skglm.utils import compiled_clone, ST_vec, proj_L2ball
+from skglm.datafits.base import BaseDatafit
+from skglm.solvers.prox_newton import ProxNewton
+
+
+class SqrtQuadratic(BaseDatafit):
+    """Square root quadratic datafit.
+
+    The datafit reads ``||y - Xw||_2 / sqrt(n_samples)``; it is not
+    differentiable where the residual ``y - Xw`` vanishes.
+    """
+
+    def __init__(self):
+        pass  # the datafit has no parameter to store
+
+    def get_spec(self):
+        spec = ()
+        return spec
+
+    def params_to_dict(self):
+        return dict()
+
+    def value(self, y, w, Xw):
+        return np.linalg.norm(y - Xw) / np.sqrt(len(y))
+
+    def raw_grad(self, y, Xw):
+        """Compute gradient of datafit w.r.t ``Xw``.
+
+        Raises
+        ------
+        ValueError
+            ``"SmallResidualException"`` if ``||Xw - y|| < 1e-2 * ||y||``.
+        """
+        minus_residual = Xw - y
+        norm_residuals = norm(minus_residual)
+
+        if norm_residuals < 1e-2 * norm(y):
+            raise ValueError("SmallResidualException")
+
+        return minus_residual / (norm_residuals * np.sqrt(len(y)))
+
+    def raw_hessian(self, y, Xw):
+        """Diagonal of a matrix upper bounding the Hessian w.r.t. ``Xw``."""
+        n_samples = len(y)
+        fill_value = 1 / (np.sqrt(n_samples) * norm(y - Xw))
+        return np.full(n_samples, fill_value)
+
+
+class SqrtLasso(LinearModel, RegressorMixin):
+    """Square root Lasso estimator based on Prox Newton solver.
+
+    The optimization objective for square root Lasso is::
+
+        ||y - X w||_2 / sqrt(n_samples) + alpha * ||w||_1
+
+    Parameters
+    ----------
+    alpha : float, default 1
+        Penalty strength.
+
+    max_iter : int, default 100
+        Maximum number of outer iterations.
+
+    max_pn_iter : int, default 100
+        Maximum prox Newton iterations per subproblem (not yet forwarded to solver).
+
+    p0 : int, default 10
+        Minimum number of features in the working set (not yet forwarded to solver).
+
+    tol : float, default 1e-4
+        Tolerance for convergence.
+
+    verbose : bool or int, default 0
+        Amount of verbosity. 0/False is silent.
+    """
+
+    def __init__(self, alpha=1., max_iter=100, max_pn_iter=100, p0=10,
+                 tol=1e-4, verbose=0):
+        super().__init__()
+        self.alpha = alpha
+        self.max_iter = max_iter
+        self.max_pn_iter = max_pn_iter
+
+        self.p0 = p0
+        self.tol = tol
+        self.verbose = verbose
+
+    def fit(self, X, y):
+        """Fit the model according to the given training data.
+
+        Parameters
+        ----------
+        X : array or sparse CSC matrix, shape (n_samples, n_features)
+            Training data, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape (n_samples,)
+            Target vector relative to X.
+
+        Returns
+        -------
+        self :
+            Fitted estimator.
+        """
+        self.coef_ = self.path(X, y, alphas=[self.alpha])[1][0]
+        self.intercept_ = 0.  # TODO handle fit_intercept
+        return self
+
+    def path(self, X, y, alphas=None, eps=1e-3, n_alphas=10):
+        """Compute square root Lasso path.
+
+        Parameters
+        ----------
+        X : array, shape (n_samples, n_features)
+            Design matrix.
+
+        y : array, shape (n_samples,)
+            Target vector.
+
+        alphas : array, shape (n_alphas,) default None
+            Grid of alpha. If None a path is constructed from
+            (0, alpha_max] with a length ``eps``.
+
+        eps: float, default 1e-3
+            Length of the path. ``eps=1e-3`` means that
+            ``alpha_min = 1e-3 * alpha_max``.
+
+        n_alphas: int, default 10
+            Number of alphas along the path. This argument is
+            ignored if ``alphas`` was provided.
+
+        Returns
+        -------
+        alphas : array, shape (n_alphas,)
+            The alphas along the path, sorted in decreasing order.
+
+        coefs : array, shape (n_alphas, n_features)
+            Coefficients along the path, one row per entry of ``alphas``.
+        """
+        if not hasattr(self, "solver_"):
+            self.solver_ = ProxNewton(
+                tol=self.tol, max_iter=self.max_iter, verbose=self.verbose)
+        # build path
+        if alphas is None:
+            alpha_max = norm(X.T @ y, ord=np.inf) / (np.sqrt(len(y)) * norm(y))
+            alphas = alpha_max * np.geomspace(1, eps, n_alphas)
+        else:
+            n_alphas = len(alphas)
+            alphas = np.sort(alphas)[::-1]
+
+        n_features = X.shape[1]
+        sqrt_quadratic = compiled_clone(SqrtQuadratic())
+        l1_penalty = compiled_clone(L1(1.))  # alpha is set along the path
+
+        coefs = np.zeros((n_alphas, n_features))
+
+        for i in range(n_alphas):
+            if self.verbose:
+                to_print = "##### Computing alpha %d/%d" % (i + 1, n_alphas)
+                print("#" * len(to_print))
+                print(to_print)
+                print("#" * len(to_print))
+
+            l1_penalty.alpha = alphas[i]
+            # warm start from the previous solution (none for the first alpha)
+            coef_init = coefs[i - 1].copy() if i else np.zeros(n_features)
+
+            try:
+                coef, _, _ = self.solver_.solve(
+                    X, y, sqrt_quadratic, l1_penalty,
+                    w_init=coef_init, Xw_init=X @ coef_init)
+                coefs[i] = coef
+            except ValueError as val_exception:
+                # make sure to catch residual error
+                # it's implemented this way as Numba doesn't support custom Exception
+                if str(val_exception) != "SmallResidualException":
+                    raise
+
+                # save coef despite not converging
+                # coef_init holds a ref to coef
+                coef = coef_init
+                res_norm = norm(y - X @ coef)
+                warnings.warn(
+                    f"Small residuals prevented the solver from converging "
+                    f"at alpha={alphas[i]:.2e} (residuals' norm: {res_norm:.4e}). "
+                    "Consider fitting with higher alpha.",
+                    ConvergenceWarning
+                )
+                coefs[i] = coef
+                break
+
+        return alphas, coefs
+
+
+def _chambolle_pock_sqrt(X, y, alpha, max_iter=1000, obj_freq=10, verbose=False):
+    """Solve square-root Lasso with the Chambolle-Pock algorithm.
+
+    Minimizes ``||Xw - y||_2 / sqrt(n_samples) + alpha * ||w||_1`` and returns
+    ``(w, z, objs)``: the primal and dual iterates after ``max_iter`` iterations
+    and the objective recorded every ``obj_freq`` iterations.
+    """
+    n_samples, n_features = X.shape
+    # dual variable is z, primal is w
+    z_old = np.zeros(n_samples)
+    z = z_old.copy()
+    w = np.zeros(n_features)
+    objs = []
+
+    # take primal and dual stepsizes equal; both depend on the spectral norm of X
+    tau = sigma = 0.99 / norm(X, ord=2)
+    soft_threshold_level = alpha * np.sqrt(n_samples) * tau
+
+    for t in range(max_iter):
+        # parenthesize the product so that the full matrix X.T is not rescaled
+        w = ST_vec(w - tau * (X.T @ (2 * z - z_old)), soft_threshold_level)
+        z_old = z.copy()
+        z[:] = proj_L2ball(z + sigma * (X @ w - y))
+
+        if t % obj_freq == 0:
+            objs.append(norm(X @ w - y) / np.sqrt(n_samples) + alpha * norm(w, ord=1))
+            if verbose:
+                print(f"Iter {t}, obj {objs[-1]: .10f}")
+
+    return w, z, objs
diff --git a/skglm/experimental/tests/test_sqrt_lasso.py b/skglm/experimental/tests/test_sqrt_lasso.py
@@ -0,0 +1,59 @@
+import pytest
+import numpy as np
+from numpy.linalg import norm
+
+from skglm.utils import make_correlated_data
+from skglm.experimental.sqrt_lasso import SqrtLasso, _chambolle_pock_sqrt
+
+
+def test_alpha_max():
+    """Fitting at ``alpha_max`` must give an all-zero coefficient vector."""
+    X, y, _ = make_correlated_data(n_samples=50, n_features=10, random_state=0)
+    n_samples = X.shape[0]
+    critical_alpha = norm(X.T @ y, ord=np.inf) / (np.sqrt(n_samples) * norm(y))
+
+    estimator = SqrtLasso(alpha=critical_alpha).fit(X, y)
+    np.testing.assert_equal(estimator.coef_, 0)
+
+
+def test_vs_statsmodels():
+    """Check the coefficients along a path against statsmodels' sqrt_lasso."""
+    # skip (not xfail) if statsmodels or cvxopt, needed by its sqrt_lasso, is missing
+    linear_model = pytest.importorskip("statsmodels.regression.linear_model")
+    pytest.importorskip("cvxopt")
+    n_samples, n_features = 50, 10
+    X, y, _ = make_correlated_data(n_samples, n_features, random_state=0)
+
+    alpha_max = norm(X.T @ y, ord=np.inf) / (np.sqrt(n_samples) * norm(y))
+    n_alphas = 3
+    alphas = alpha_max * np.geomspace(1, 1e-2, n_alphas+1)[1:]
+
+    sqrt_lasso = SqrtLasso(tol=1e-9)
+    coefs_skglm = sqrt_lasso.path(X, y, alphas)[1]
+
+    coefs_statsmodels = np.zeros((len(alphas), n_features))
+
+    # fit statsmodels on path
+    for i, alpha in enumerate(alphas):
+        model = linear_model.OLS(y, X)
+        # alpha is rescaled by n_samples, presumably to match statsmodels' objective
+        model = model.fit_regularized(method='sqrt_lasso', L1_wt=1.,
+                                      alpha=n_samples * alpha)
+        coefs_statsmodels[i] = model.params
+
+    np.testing.assert_almost_equal(coefs_skglm, coefs_statsmodels, decimal=4)
+
+
+def test_prox_newton_cp():
+    """ProxNewton and Chambolle-Pock must reach the same coefficients."""
+    X, y, _ = make_correlated_data(n_samples=50, n_features=10, random_state=0)
+    n_samples = X.shape[0]
+
+    alpha = norm(X.T @ y, ord=np.inf) / (np.sqrt(n_samples) * norm(y)) / 10
+    w_prox_newton = SqrtLasso(alpha=alpha, tol=1e-12).fit(X, y).coef_
+    w_cp, _, _ = _chambolle_pock_sqrt(X, y, alpha, max_iter=1000)
+    np.testing.assert_allclose(w_prox_newton, w_cp)
+
+
+if __name__ == '__main__':
+    pass
diff --git a/skglm/utils.py b/skglm/utils.py
@@ -105,6 +105,15 @@ def ST_vec(x, u):
     return np.sign(x) * np.maximum(0., np.abs(x) - u)
 
 
+@njit
+def proj_L2ball(u):
+    """Project ``u`` on the L2 unit ball; ``u`` itself is returned if inside."""
+    norm_u = norm(u)
+    if norm_u <= 1:
+        return u
+    return u / norm_u
+
+
 @njit
 def BST(x, u):
     """Block soft-thresholding of vector x at level u."""