ENH add single task group solver (#26)

Badr-MOUFAD · mathurinm · web-flow · commit de7af7cc2eb4 · 2022-06-05T11:50:52.000+02:00
Co-authored-by: mathurinm <mathurin.massias@gmail.com>
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -20,6 +20,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
+        python -m pip install git+https://github.com/mathurinm/celer.git
         pip install pytest
         pip install numpydoc
         pip install .
diff --git a/doc/api.rst b/doc/api.rst
@@ -40,6 +40,7 @@ Penalties
    L2_3
    MCPenalty
    WeightedL1
+   WeightedGroupL2
 
 
 Datafits
@@ -50,6 +51,7 @@ Datafits
 .. autosummary::
    :toctree: generated/
 
+   QuadraticGroup
    Logistic
    Quadratic
    QuadraticSVC
diff --git a/skglm/datafits/group.py b/skglm/datafits/group.py
@@ -0,0 +1,67 @@
+import numpy as np
+from numpy.linalg import norm
+from numba.experimental import jitclass
+from numba import int32, float64
+
+from skglm.datafits.base import BaseDatafit
+
+
+spec_QuadraticGroup = [
+    ('grp_ptr', int32[:]),
+    ('grp_indices', int32[:]),
+    ('lipschitz', float64[:])
+]
+
+
+@jitclass(spec_QuadraticGroup)
+class QuadraticGroup(BaseDatafit):
+    """Quadratic datafit used with group penalties.
+
+    The datafit reads::
+
+    (1 / (2 * n_samples)) * ||y - X w||^2_2
+
+    Attributes
+    ----------
+    grp_indices : array, shape (n_features,)
+        The group indices stacked contiguously
+        (e.g. [grp1_indices, grp2_indices, ...]).
+
+    grp_ptr : array, shape (n_groups + 1,)
+        The group pointers such that two consecutive elements delimit
+        the indices of a group in ``grp_indices``.
+
+    lipschitz : array, shape (n_groups,)
+        The lipschitz constants for each group.
+    """
+
+    def __init__(self, grp_ptr, grp_indices):
+        self.grp_ptr, self.grp_indices = grp_ptr, grp_indices
+
+    def initialize(self, X, y):
+        grp_ptr, grp_indices = self.grp_ptr, self.grp_indices
+        n_groups = len(grp_ptr) - 1
+
+        lipschitz = np.zeros(n_groups)
+        for g in range(n_groups):
+            grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]]
+            X_g = X[:, grp_g_indices]
+            lipschitz[g] = norm(X_g, ord=2) ** 2 / len(y)
+
+        self.lipschitz = lipschitz
+
+    def value(self, y, w, Xw):
+        return norm(y - Xw) ** 2 / (2 * len(y))
+
+    def gradient_g(self, X, y, w, Xw, g):
+        grp_ptr, grp_indices = self.grp_ptr, self.grp_indices
+        grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]]
+
+        grad_g = np.zeros(len(grp_g_indices))
+        for idx, j in enumerate(grp_g_indices):
+            grad_g[idx] = self.gradient_scalar(X, y, w, Xw, j)
+
+        return grad_g
+
+    def gradient_scalar(self, X, y, w, Xw, j):
+        return X[:, j] @ (Xw - y) / len(y)
diff --git a/skglm/penalties/block_separable.py b/skglm/penalties/block_separable.py
@@ -1,6 +1,7 @@
 import numpy as np
-from numpy.linalg.linalg import norm
-from numba import float64
+from numpy.linalg import norm
+
+from numba import float64, int32
 from numba.experimental import jitclass
 from numba.types import bool_
 
@@ -153,3 +154,81 @@ def subdiff_distance(self, W, grad, ws):
     def is_penalized(self, n_features):
         """Return a binary mask with the penalized features."""
         return np.ones(n_features, bool_)
+
+
+spec_WeightedGroupL2 = [
+    ('alpha', float64),
+    ('weights', float64[:]),
+    ('grp_ptr', int32[:]),
+    ('grp_indices', int32[:]),
+]
+
+
+@jitclass(spec_WeightedGroupL2)
+class WeightedGroupL2(BasePenalty):
+    r"""Weighted Group L2 penalty.
+
+    The penalty reads::
+
+        \sum_{g} weights[g] * ||w_g||_2
+
+    Attributes
+    ----------
+    alpha : float
+        The regularization parameter.
+
+    weights : array, shape (n_groups,)
+        The weights of the groups.
+
+    grp_indices : array, shape (n_features,)
+        The group indices stacked contiguously
+        (e.g. [grp1_indices, grp2_indices, ...]).
+
+    grp_ptr : array, shape (n_groups + 1,)
+        The group pointers such that two consecutive elements delimit
+        the indices of a group in ``grp_indices``.
+    """
+
+    def __init__(self, alpha, weights, grp_ptr, grp_indices):
+        self.alpha, self.weights = alpha, weights
+        self.grp_ptr, self.grp_indices = grp_ptr, grp_indices
+
+    def value(self, w):
+        """Value of penalty at vector ``w``."""
+        alpha, weights = self.alpha, self.weights
+        grp_ptr, grp_indices = self.grp_ptr, self.grp_indices
+        n_grp = len(grp_ptr) - 1
+
+        sum_weighted_L2 = 0.
+        for g in range(n_grp):
+            grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]]
+            w_g = w[grp_g_indices]
+
+            sum_weighted_L2 += alpha * weights[g] * norm(w_g)
+
+        return sum_weighted_L2
+
+    def prox_1group(self, value, stepsize, g):
+        """Compute the proximal operator of group ``g``."""
+        return BST(value, self.alpha * stepsize * self.weights[g])
+
+    def subdiff_distance(self, w, grad, ws):
+        """Compute distance of negative gradient to the subdifferential at ``w``."""
+        alpha, weights = self.alpha, self.weights
+        grp_ptr, grp_indices = self.grp_ptr, self.grp_indices
+
+        scores = np.zeros(len(ws))
+        for idx, g in enumerate(ws):
+            grad_g = grad[idx]
+
+            grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]]
+            w_g = w[grp_g_indices]
+            norm_w_g = norm(w_g)
+
+            if norm_w_g == 0:
+                scores[idx] = max(0, norm(grad_g) - alpha * weights[g])
+            else:
+                subdiff = alpha * weights[g] * w_g / norm_w_g
+                scores[idx] = norm(grad_g - subdiff)
+
+        return scores
diff --git a/skglm/solvers/group_bcd_solver.py b/skglm/solvers/group_bcd_solver.py
@@ -0,0 +1,121 @@
+import numpy as np
+from numba import njit
+
+
+def bcd_solver(X, y, datafit, penalty, w_init=None,
+               max_iter=1000, max_epochs=100, tol=1e-7, verbose=False):
+    """Run a group BCD solver.
+
+    Parameters
+    ----------
+    X : array, shape (n_samples, n_features)
+        Design matrix.
+
+    y : array, shape (n_samples,)
+        Target vector.
+
+    datafit : instance of BaseDatafit
+        Datafit object.
+
+    penalty : instance of BasePenalty
+        Penalty object.
+
+    w_init : array, shape (n_features,), default None
+        Initial value of coefficients.
+        If set to None, a zero vector is used instead.
+
+    max_iter : int, default 1000
+        Maximum number of iterations.
+
+    max_epochs : int, default 100
+        Maximum number of epochs.
+
+    tol : float, default 1e-6
+        Tolerance for convergence.
+
+    verbose : bool, default False
+        Amount of verbosity. 0/False is silent.
+
+    Returns
+    -------
+    w : array, shape (n_features,)
+        Solution that minimizes the problem defined by datafit and penalty.
+
+    p_objs_out: array (max_iter,)
+        The objective values at every outer iteration.
+
+    stop_crit: float
+        The value of the stop criterion.
+    """
+    n_features = X.shape[1]
+    n_groups = len(penalty.grp_ptr) - 1
+
+    # init
+    w = np.zeros(n_features) if w_init is None else w_init
+    Xw = X @ w
+    datafit.initialize(X, y)
+    all_groups = np.arange(n_groups)
+    p_objs_out = np.zeros(max_iter)
+
+    for t in range(max_iter):
+        if t == 0:  # avoid computing p_obj twice
+            prev_p_obj = datafit.value(y, w, Xw) + penalty.value(w)
+
+        for epoch in range(max_epochs):
+            _bcd_epoch(X, y, w, Xw, datafit, penalty, all_groups)
+
+            if epoch % 10 == 0:
+                current_p_obj = datafit.value(y, w, Xw) + penalty.value(w)
+                stop_crit_in = prev_p_obj - current_p_obj
+
+                if max(verbose - 1, 0):
+                    print(
+                        f"Epoch {epoch+1}: {current_p_obj:.10f} "
+                        f"obj. variation: {stop_crit_in:.2e}"
+                    )
+
+                if stop_crit_in <= tol:
+                    print("Early exit")
+                    break
+                prev_p_obj = current_p_obj
+
+        current_p_obj = datafit.value(y, w, Xw) + penalty.value(w)
+        stop_crit = prev_p_obj - current_p_obj
+
+        if max(verbose, 0):
+            print(
+                f"Iteration {t+1}: {current_p_obj:.10f}, "
+                f"stopping crit: {stop_crit:.2f}"
+            )
+
+        if stop_crit <= tol:
+            print("Outer solver: Early exit")
+            break
+
+        prev_p_obj = current_p_obj
+        p_objs_out[t] = current_p_obj
+
+    return w, p_objs_out, stop_crit
+
+
+@njit
+def _bcd_epoch(X, y, w, Xw, datafit, penalty, ws):
+    """Perform a single BCD epoch on groups in ws."""
+    grp_ptr, grp_indices = penalty.grp_ptr, penalty.grp_indices
+
+    for g in ws:
+        grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]]
+        old_w_g = w[grp_g_indices].copy()
+
+        lipschitz_g = datafit.lipschitz[g]
+        grad_g = datafit.gradient_g(X, y, w, Xw, g)
+
+        w[grp_g_indices] = penalty.prox_1group(
+            old_w_g - grad_g / lipschitz_g,
+            1 / lipschitz_g, g
+        )
+
+        for idx, j in enumerate(grp_g_indices):
+            if old_w_g[idx] != w[j]:
+                Xw += (w[j] - old_w_g[idx]) * X[:, j]
+    return
diff --git a/skglm/tests/test_group.py b/skglm/tests/test_group.py
diff --git a/skglm/utils.py b/skglm/utils.py