ENH - add Primal-Dual Coordinate Descent solver (#131)

Badr-MOUFAD · web-flow · commit cb43489a53f5 · 2022-12-06T17:27:16.000+01:00
diff --git a/skglm/experimental/pdcd_ws.py b/skglm/experimental/pdcd_ws.py
@@ -0,0 +1,230 @@
+import warnings
+
+import numpy as np
+from numpy.linalg import norm
+from scipy.sparse import issparse
+
+from numba import njit
+from skglm.utils.jit_compilation import compiled_clone
+from sklearn.exceptions import ConvergenceWarning
+
+
+class PDCD_WS:
+    r"""Primal-Dual Coordinate Descent solver with working sets.
+
+    It solves::
+
+        \min_w F(Xw) + G(w)
+
+    using a primal-dual method on the saddle point problem::
+
+        \min_w \max_z <Xw, z> + G(w) - F^*(z)
+
+    where :math:`F` is the datafit term (:math:`F^*` its Fenchel conjugate)
+    and :math:`G` is the penalty term.
+
+    The datafit is required to be convex and proximable. Also, the penalty
+    is required to be convex, separable, and proximable.
+
+    The solver is an adaptation of algorithm [1] to working sets [2].
+    The working sets are built using a fixed point distance strategy
+    where each feature is assigned a score based on how much its coefficient varies
+    when performing a primal update::
+
+        \text{score}_j = |w_j - \text{prox}_{\tau_j, G_j}(w_j - \tau_j <X_j, z>)|
+
+    where :math:`\tau_j` is the primal step associated with the j-th feature.
+
+    Parameters
+    ----------
+    max_iter : int, optional
+        The maximum number of iterations or, equivalently,
+        the maximum number of solved subproblems.
+
+    max_epochs : int, optional
+        Maximum number of primal CD epochs on each subproblem.
+
+    dual_init : array, shape (n_samples,) default None
+        The initialization of dual variables.
+        If None, they are initialized as the 0 vector ``np.zeros(n_samples)``.
+
+    p0 : int, optional
+        First working set size.
+
+    tol : float, optional
+        The tolerance for the optimization.
+
+    verbose : bool or int, default False
+        Amount of verbosity. 0/False is silent.
+
+    References
+    ----------
+    .. [1] Olivier Fercoq and Pascal Bianchi,
+        "A Coordinate-Descent Primal-Dual Algorithm with Large Step Size and Possibly
+        Nonseparable Functions", SIAM Journal on Optimization, 2020,
+        https://epubs.siam.org/doi/10.1137/18M1168480,
+        code: https://github.com/Badr-MOUFAD/Fercoq-Bianchi-solver
+
+    .. [2] Bertrand, Q. and Klopfenstein, Q. and Bannier, P.-A. and Gidel, G.
+           and Massias, M.
+           "Beyond L1: Faster and Better Sparse Models with skglm", NeurIPS, 2022
+           https://arxiv.org/abs/2204.07826
+    """
+
+    def __init__(self, max_iter=1000, max_epochs=1000, dual_init=None,
+                 p0=100, tol=1e-6, verbose=False):
+        self.max_iter = max_iter
+        self.max_epochs = max_epochs
+        self.dual_init = dual_init
+        self.p0 = p0
+        self.tol = tol
+        self.verbose = verbose
+
+    def solve(self, X, y, datafit_, penalty_, w_init=None, Xw_init=None):
+        if issparse(X):
+            raise ValueError("Sparse matrices are not yet support in PDCD_WS solver.")
+
+        datafit, penalty = PDCD_WS._validate_init(datafit_, penalty_)
+        n_samples, n_features = X.shape
+
+        # init steps
+        # Despite violating the conditions mentioned in [1],
+        # this choice of steps yields, in practice, a convergent algorithm
+        # with a better speed of convergence
+        dual_step = 1 / norm(X, ord=2)
+        primal_steps = 1 / norm(X, axis=0, ord=2)
+
+        # primal vars
+        w = np.zeros(n_features) if w_init is None else w_init
+        Xw = np.zeros(n_samples) if Xw_init is None else Xw_init
+
+        # dual vars
+        if self.dual_init is None:
+            z = np.zeros(n_samples)
+            z_bar = np.zeros(n_samples)
+        else:
+            z = self.dual_init.copy()
+            z_bar = self.dual_init.copy()
+
+        p_objs = []
+        stop_crit = 0.
+        all_features = np.arange(n_features)
+
+        for iteration in range(self.max_iter):
+
+            # check convergence using fixed-point criteria on both dual and primal
+            opts_primal = _scores_primal(X, w, z, penalty, primal_steps, all_features)
+            opt_dual = _score_dual(y, z, Xw, datafit, dual_step)
+
+            stop_crit = max(max(opts_primal), opt_dual)
+
+            if self.verbose:
+                current_p_obj = datafit.value(y, w, Xw) + penalty.value(w)
+                print(
+                    f"Iteration {iteration+1}: {current_p_obj:.10f}, "
+                    f"stopping crit: {stop_crit:.2e}")
+
+            if stop_crit <= self.tol:
+                break
+
+            # build ws
+            gsupp_size = (w != 0).sum()
+            ws_size = max(min(self.p0, n_features),
+                          min(n_features, 2 * gsupp_size))
+
+            # similar to np.argsort()[-ws_size:] but without full sort
+            ws = np.argpartition(opts_primal, -ws_size)[-ws_size:]
+
+            # solve sub problem
+            # inplace update of w, Xw, z, z_bar
+            PDCD_WS._solve_subproblem(
+                y, X, w, Xw, z, z_bar, datafit, penalty,
+                primal_steps, dual_step, ws, self.max_epochs, tol_in=0.3*stop_crit)
+
+            current_p_obj = datafit.value(y, w, Xw) + penalty.value(w)
+            p_objs.append(current_p_obj)
+        else:
+            warnings.warn(
+                f"PDCD_WS did not converge for tol={self.tol:.3e} "
+                f"and max_iter={self.max_iter}.\n"
+                "Considering increasing `max_iter` or `tol`.",
+                category=ConvergenceWarning
+            )
+
+        return w, np.asarray(p_objs), stop_crit
+
+    @staticmethod
+    @njit
+    def _solve_subproblem(y, X, w, Xw, z, z_bar, datafit, penalty,
+                          primal_steps, dual_step, ws, max_epochs, tol_in):
+        n_features = X.shape[1]
+
+        for epoch in range(max_epochs):
+
+            for j in ws:
+                # update primal
+                old_w_j = w[j]
+                pseudo_grad = X[:, j] @ (2 * z_bar - z)
+                w[j] = penalty.prox_1d(
+                    old_w_j - primal_steps[j] * pseudo_grad,
+                    primal_steps[j], j)
+
+                # keep Xw in sync with X @ w
+                delta_w_j = w[j] - old_w_j
+                if delta_w_j:
+                    Xw += delta_w_j * X[:, j]
+
+                # update dual
+                z_bar[:] = datafit.prox_conjugate(z + dual_step * Xw,
+                                                  dual_step, y)
+                z += (z_bar - z) / n_features
+
+            # check convergence using fixed-point criteria on both dual and primal
+            if epoch % 10 == 0:
+                opts_primal_in = _scores_primal(X, w, z, penalty, primal_steps, ws)
+                opt_dual_in = _score_dual(y, z, Xw, datafit, dual_step)
+
+                stop_crit_in = max(max(opts_primal_in), opt_dual_in)
+
+                if stop_crit_in <= tol_in:
+                    break
+
+    @staticmethod
+    def _validate_init(datafit_, penalty_):
+        # validate datafit
+        missing_attrs = []
+        for attr in ('prox_conjugate', 'subdiff_distance'):
+            if not hasattr(datafit_, attr):
+                missing_attrs.append(f"`{attr}`")
+
+        if len(missing_attrs):
+            raise AttributeError(
+                "Datafit is not compatible with PDCD_WS solver.\n"
+                "Datafit must implement `prox_conjugate` and `subdiff_distance`.\n"
+                f"Missing {' and '.join(missing_attrs)}."
+            )
+
+        # jit compile classes
+        compiled_datafit = compiled_clone(datafit_)
+        compiled_penalty = compiled_clone(penalty_)
+
+        return compiled_datafit, compiled_penalty
+
+
+@njit
+def _scores_primal(X, w, z, penalty, primal_steps, ws):
+    scores_ws = np.zeros(len(ws))
+
+    for idx, j in enumerate(ws):
+        next_w_j = penalty.prox_1d(w[j] - primal_steps[j] * X[:, j] @ z,
+                                   primal_steps[j], j)
+        scores_ws[idx] = abs(w[j] - next_w_j)
+
+    return scores_ws
+
+
+@njit
+def _score_dual(y, z, Xw, datafit, dual_step):
+    next_z = datafit.prox_conjugate(z + dual_step * Xw,
+                                    dual_step, y)
+    return norm(z - next_z, ord=np.inf)
diff --git a/skglm/experimental/sqrt_lasso.py b/skglm/experimental/sqrt_lasso.py
@@ -5,7 +5,7 @@
 from sklearn.linear_model._base import LinearModel, RegressorMixin
 
 from skglm.penalties import L1
-from skglm.utils.prox_funcs import ST_vec, proj_L2ball
+from skglm.utils.prox_funcs import ST_vec, proj_L2ball, BST
 from skglm.utils.jit_compilation import compiled_clone
 from skglm.datafits.base import BaseDatafit
 from skglm.solvers.prox_newton import ProxNewton
@@ -54,6 +54,24 @@ def raw_hessian(self, y, Xw):
         fill_value = 1 / norm(y - Xw)
         return np.full(n_samples, fill_value)
 
+    def prox(self, w, step, y):
+        """Prox of ``step * ||y - . ||``."""
+        return y - BST(y - w, step)
+
+    def prox_conjugate(self, z, step, y):
+        """Prox of ``step * ||y - . ||^*``."""
+        return proj_L2ball(z - step * y)
+
+    def subdiff_distance(self, Xw, z, y):
+        """Distance of ``z`` to subdiff of ||y - . || at ``Xw``."""
+        # computation note: \partial ||y - . ||(Xw) = - \partial || . ||(y - Xw)
+        y_minus_Xw = y - Xw
+
+        if np.any(y_minus_Xw):
+            return norm(z + y_minus_Xw / norm(y_minus_Xw))
+
+        return norm(z - proj_L2ball(z))
+
 
 class SqrtLasso(LinearModel, RegressorMixin):
     """Square root Lasso estimator based on Prox Newton solver.
diff --git a/skglm/experimental/tests/test_sqrt_lasso.py b/skglm/experimental/tests/test_sqrt_lasso.py
@@ -2,8 +2,11 @@
 import numpy as np
 from numpy.linalg import norm
 
+from skglm.penalties import L1
 from skglm.utils.data import make_correlated_data
-from skglm.experimental.sqrt_lasso import SqrtLasso, _chambolle_pock_sqrt
+from skglm.experimental.sqrt_lasso import (SqrtLasso, SqrtQuadratic,
+                                           _chambolle_pock_sqrt)
+from skglm.experimental.pdcd_ws import PDCD_WS
 
 
 def test_alpha_max():
@@ -56,5 +59,20 @@ def test_prox_newton_cp():
     np.testing.assert_allclose(clf.coef_, w)
 
 
+@pytest.mark.parametrize('with_dual_init', [True, False])
+def test_PDCD_WS(with_dual_init):
+    n_samples, n_features = 50, 10
+    X, y, _ = make_correlated_data(n_samples, n_features, random_state=0)
+
+    alpha_max = norm(X.T @ y, ord=np.inf) / norm(y)
+    alpha = alpha_max / 10
+
+    dual_init = y / norm(y) if with_dual_init else None
+
+    w = PDCD_WS(dual_init=dual_init).solve(X, y, SqrtQuadratic(), L1(alpha))[0]
+    clf = SqrtLasso(alpha=alpha, tol=1e-12).fit(X, y)
+    np.testing.assert_allclose(clf.coef_, w, atol=1e-6)
+
+
 if __name__ == '__main__':
     pass