Skip to content

Commit 700c280

Browse files
ENH add weights for Group Lasso (#223)
Co-authored-by: mathurinm <[email protected]>
1 parent d4d7b77 commit 700c280

File tree

7 files changed

+129
-55
lines changed

7 files changed

+129
-55
lines changed

.gitignore

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,20 @@
44
*.o
55
*.c
66
*.html
7-
celer.egg-info/*
8-
.eggs/*
7+
celer.egg-info
8+
.eggs
99

1010
# Python precompilation
1111
*pyc
12+
*pyd
13+
14+
# build
15+
build
1216

1317

1418
# cache
15-
.pytest_cache/*
19+
.pytest_cache
20+
__pycache__
1621

1722
doc/*
1823
coverage/*

celer/dropin_sklearn.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -711,9 +711,9 @@ class GroupLasso(Lasso_sklearn):
711711
712712
The optimization objective for the Group Lasso is::
713713
714-
(1 / (2 * n_samples)) * ||y - X w||^2_2 + alpha * \sum_g ||w_g||_2
714+
(1 / (2 * n_samples)) * ||y - X w||^2_2 + alpha * \sum_g weights_g ||w_g||_2
715715
716-
where `w_g` is the weight vector of group number `g`.
716+
where `w_g` are the regression coefficients of group number `g`.
717717
718718
Parameters
719719
----------
@@ -752,6 +752,10 @@ class GroupLasso(Lasso_sklearn):
752752
fit_intercept : bool, optional (default=True)
753753
Whether or not to fit an intercept.
754754
755+
weights : array, shape (n_groups,), optional (default=None)
756+
Strictly positive weights used in the L2 penalty part of the
757+
GroupLasso objective. If None, weights equal to 1 are used.
758+
755759
warm_start : bool, optional (default=False)
756760
When set to True, reuse the solution of the previous call to fit as
757761
initialization, otherwise, just erase the previous solution.
@@ -801,7 +805,7 @@ class GroupLasso(Lasso_sklearn):
801805

802806
def __init__(self, groups=1, alpha=1., max_iter=100,
803807
max_epochs=50000, p0=10, verbose=0, tol=1e-4, prune=True,
804-
fit_intercept=True, warm_start=False):
808+
fit_intercept=True, weights=None, warm_start=False):
805809
super(GroupLasso, self).__init__(
806810
alpha=alpha, tol=tol, max_iter=max_iter,
807811
fit_intercept=fit_intercept,
@@ -811,6 +815,7 @@ def __init__(self, groups=1, alpha=1., max_iter=100,
811815
self.max_epochs = max_epochs
812816
self.p0 = p0
813817
self.prune = prune
818+
self.weights = weights
814819

815820
def path(self, X, y, alphas, coef_init=None, return_n_iter=True,
816821
**kwargs):
@@ -820,7 +825,7 @@ def path(self, X, y, alphas, coef_init=None, return_n_iter=True,
820825
coef_init=coef_init, max_iter=self.max_iter,
821826
return_n_iter=return_n_iter, max_epochs=self.max_epochs,
822827
p0=self.p0, verbose=self.verbose, tol=self.tol, prune=self.prune,
823-
X_scale=kwargs.get('X_scale', None),
828+
weights=self.weights, X_scale=kwargs.get('X_scale', None),
824829
X_offset=kwargs.get('X_offset', None))
825830

826831
return results

celer/group_fast.pyx

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -25,18 +25,21 @@ cdef:
2525
@cython.cdivision(True)
2626
cpdef floating primal_grplasso(
2727
floating alpha, floating[:] R, int[::1] grp_ptr,
28-
int[::1] grp_indices, floating[:] w):
28+
int[::1] grp_indices, floating[:] w, floating[:] weights):
2929
cdef floating nrm = 0.
3030
cdef int j, k, g
3131
cdef int n_samples = R.shape[0]
3232
cdef int n_groups = grp_ptr.shape[0] - 1
3333
cdef floating p_obj = fnrm2(&n_samples, &R[0], &inc) ** 2 / (2 * n_samples)
34+
3435
for g in range(n_groups):
3536
nrm = 0.
37+
3638
for k in range(grp_ptr[g], grp_ptr[g + 1]):
3739
j = grp_indices[k]
3840
nrm += w[j] ** 2
39-
p_obj += alpha * sqrt(nrm)
41+
p_obj += alpha * sqrt(nrm) * weights[g]
42+
4043
return p_obj
4144

4245

@@ -47,7 +50,7 @@ cpdef floating dnorm_grp(
4750
bint is_sparse, floating[::1] theta, int[::1] grp_ptr,
4851
int[::1] grp_indices, floating[::1, :] X, floating[::1] X_data,
4952
int[::1] X_indices, int[::1] X_indptr, floating[::1] X_mean,
50-
int ws_size, int[:] C, bint center):
53+
floating[:] weights, int ws_size, int[:] C, bint center):
5154
"""Dual norm in the group case, i.e. L2/infty ofter groups."""
5255
cdef floating Xj_theta, tmp
5356
cdef floating scal = 0.
@@ -63,6 +66,9 @@ cpdef floating dnorm_grp(
6366

6467
if ws_size == n_groups: # max over all groups
6568
for g in range(n_groups):
69+
if weights[g] == INFINITY:
70+
continue
71+
6672
tmp = 0
6773
for k in range(grp_ptr[g], grp_ptr[g + 1]):
6874
j = grp_indices[k]
@@ -79,10 +85,13 @@ cpdef floating dnorm_grp(
7985
&inc)
8086
tmp += Xj_theta ** 2
8187

82-
scal = max(scal, sqrt(tmp))
88+
scal = max(scal, sqrt(tmp) / weights[g])
8389

8490
else: # scaling only with features in C
8591
for g_idx in range(ws_size):
92+
if weights[g] == INFINITY:
93+
continue
94+
8695
g = C[g_idx]
8796
tmp = 0
8897
for k in range(grp_ptr[g], grp_ptr[g + 1]):
@@ -100,7 +109,7 @@ cpdef floating dnorm_grp(
100109
&inc)
101110
tmp += Xj_theta ** 2
102111

103-
scal = max(scal, sqrt(tmp))
112+
scal = max(scal, sqrt(tmp) / weights[g])
104113
return scal
105114

106115

@@ -110,9 +119,9 @@ cpdef floating dnorm_grp(
110119
cdef void set_prios_grp(
111120
bint is_sparse, int pb, floating[::1] theta, floating[::1, :] X,
112121
floating[::1] X_data, int[::1] X_indices, int[::1] X_indptr,
113-
floating[::1] norms_X_grp, int[::1] grp_ptr, int[::1] grp_indices,
114-
floating[::1] prios, int[::1] screened, floating radius,
115-
int * n_screened):
122+
floating[:] weights, floating[::1] norms_X_grp, int[::1] grp_ptr,
123+
int[::1] grp_indices, floating[::1] prios, int[::1] screened,
124+
floating radius, int * n_screened):
116125
cdef int i, j, k, g, startptr, endptr
117126
cdef floating nrm_Xgtheta, Xj_theta
118127
cdef int n_groups = grp_ptr.shape[0] - 1
@@ -134,7 +143,7 @@ cdef void set_prios_grp(
134143
else:
135144
Xj_theta = fdot(&n_samples, &theta[0], &inc, &X[0, j], &inc)
136145
nrm_Xgtheta += Xj_theta ** 2
137-
nrm_Xgtheta = sqrt(nrm_Xgtheta)
146+
nrm_Xgtheta = sqrt(nrm_Xgtheta) / weights[g]
138147

139148
prios[g] = (1. - nrm_Xgtheta) / norms_X_grp[g]
140149

@@ -150,8 +159,8 @@ cpdef celer_grp(
150159
int[::1] grp_ptr, floating[::1] X_data, int[::1] X_indices,
151160
int[::1] X_indptr, floating[::1] X_mean, floating[:] y, floating alpha,
152161
floating[:] w, floating[:] R, floating[::1] theta,
153-
floating[::1] norms_X_grp, floating tol, int max_iter, int max_epochs,
154-
int gap_freq=10, floating tol_ratio_inner=0.3, int p0=100,
162+
floating[::1] norms_X_grp, floating tol, floating[:] weights, int max_iter,
163+
int max_epochs, int gap_freq=10, floating tol_ratio_inner=0.3, int p0=100,
155164
bint prune=1, bint use_accel=1,
156165
bint verbose=0):
157166

@@ -225,7 +234,7 @@ cpdef celer_grp(
225234

226235
scal = dnorm_grp(
227236
is_sparse, theta, grp_ptr, grp_indices, X, X_data, X_indices,
228-
X_indptr, X_mean, n_groups, dummy_C, center)
237+
X_indptr, X_mean, weights, n_groups, dummy_C, center)
229238

230239
if scal > 1. :
231240
tmp = 1. / scal
@@ -234,11 +243,10 @@ cpdef celer_grp(
234243
d_obj = dual(pb, n_samples, alpha, norm_y2, &theta[0], &y[0])
235244

236245
if t > 0:
237-
pass
238246
# also test dual point returned by inner solver after 1st iter:
239247
scal = dnorm_grp(
240248
is_sparse, theta_inner, grp_ptr, grp_indices, X, X_data,
241-
X_indices, X_indptr, X_mean, n_groups, dummy_C, center)
249+
X_indices, X_indptr, X_mean, weights, n_groups, dummy_C, center)
242250
if scal > 1.:
243251
tmp = 1. / scal
244252
fscal(&n_samples, &tmp, &theta_inner[0], &inc)
@@ -254,7 +262,7 @@ cpdef celer_grp(
254262
highest_d_obj = d_obj
255263
# TODO implement a best_theta
256264

257-
p_obj = primal_grplasso(alpha, R, grp_ptr, grp_indices, w)
265+
p_obj = primal_grplasso(alpha, R, grp_ptr, grp_indices, w, weights)
258266
gap = p_obj - highest_d_obj
259267
gaps[t] = gap
260268

@@ -272,8 +280,9 @@ cpdef celer_grp(
272280
# radius = sqrt(gap / 2.) / alpha
273281

274282
set_prios_grp(
275-
is_sparse, pb, theta, X, X_data, X_indices, X_indptr, lc_groups,
276-
grp_ptr, grp_indices, prios, screened, radius, &n_screened)
283+
is_sparse, pb, theta, X, X_data, X_indices, X_indptr,
284+
weights, lc_groups, grp_ptr, grp_indices, prios, screened,
285+
radius, &n_screened)
277286

278287
if prune:
279288
nnz = 0
@@ -327,7 +336,7 @@ cpdef celer_grp(
327336

328337
scal = dnorm_grp(
329338
is_sparse, theta_inner, grp_ptr, grp_indices, X, X_data,
330-
X_indices, X_indptr, X_mean, ws_size, C, center)
339+
X_indices, X_indptr, X_mean, weights, ws_size, C, center)
331340

332341
if scal > 1. :
333342
tmp = 1. / scal
@@ -348,8 +357,8 @@ cpdef celer_grp(
348357
if epoch // gap_freq >= K:
349358
scal = dnorm_grp(
350359
is_sparse, thetacc, grp_ptr, grp_indices, X,
351-
X_data, X_indices, X_indptr, X_mean, ws_size, C,
352-
center)
360+
X_data, X_indices, X_indptr, X_mean, weights,
361+
ws_size, C, center)
353362

354363
if scal > 1.:
355364
tmp = 1. / scal
@@ -365,7 +374,8 @@ cpdef celer_grp(
365374

366375
if d_obj_in > highest_d_obj_in:
367376
highest_d_obj_in = d_obj_in
368-
p_obj_in = primal_grplasso(alpha, R, grp_ptr, grp_indices, w)
377+
378+
p_obj_in = primal_grplasso(alpha, R, grp_ptr, grp_indices, w, weights)
369379
gap_in = p_obj_in - highest_d_obj_in
370380

371381
if verbose_in:
@@ -402,7 +412,7 @@ cpdef celer_grp(
402412
norm_wg += w[j] ** 2
403413
norm_wg = sqrt(norm_wg)
404414
bst_scal = max(0.,
405-
1. - alpha / lc_groups[g] * n_samples / norm_wg)
415+
1. - alpha * weights[g] / lc_groups[g] * n_samples / norm_wg)
406416

407417
for k in range(grp_ptr[g + 1] - grp_ptr[g]):
408418
j = grp_indices[grp_ptr[g] + k]

celer/homotopy.py

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,8 @@ def celer_path(X, y, pb, eps=1e-3, n_alphas=100, alphas=None,
146146

147147
if pb.lower() not in ("lasso", "logreg", "grouplasso"):
148148
raise ValueError("Unsupported problem %s" % pb)
149+
150+
n_groups = None # set n_groups to None for lasso and logreg
149151
if pb.lower() == "lasso":
150152
pb = LASSO
151153
elif pb.lower() == "logreg":
@@ -182,15 +184,7 @@ def celer_path(X, y, pb, eps=1e-3, n_alphas=100, alphas=None,
182184

183185
X_dense, X_data, X_indices, X_indptr = _sparse_and_dense(X)
184186

185-
if weights is None:
186-
weights = np.ones(n_features).astype(X.dtype)
187-
elif (weights <= 0).any():
188-
raise ValueError("0 or negative weights are not supported.")
189-
elif weights.shape[0] != X.shape[1]:
190-
raise ValueError(
191-
"As many weights as features must be passed. "
192-
f"Expected {X.shape[1]}, got {weights.shape[0]}."
193-
)
187+
weights = _check_weights(weights, pb, X, n_groups)
194188

195189
if alphas is None:
196190
if pb == LASSO:
@@ -204,7 +198,7 @@ def celer_path(X, y, pb, eps=1e-3, n_alphas=100, alphas=None,
204198
alpha_max = 0
205199
for g in range(n_groups):
206200
X_g = X[:, grp_indices[grp_ptr[g]:grp_ptr[g + 1]]]
207-
alpha_max = max(alpha_max, norm(X_g.T @ y, ord=2))
201+
alpha_max = max(alpha_max, norm(X_g.T @ y / weights[g], ord=2))
208202
alpha_max /= n_samples
209203

210204
alphas = alpha_max * np.geomspace(1, eps, n_alphas,
@@ -287,17 +281,17 @@ def celer_path(X, y, pb, eps=1e-3, n_alphas=100, alphas=None,
287281
scal = dnorm_grp(
288282
is_sparse, theta, grp_ptr, grp_indices, X_dense,
289283
X_data, X_indices, X_indptr, X_sparse_scaling,
290-
len(grp_ptr) - 1, np.zeros(1, dtype=np.int32),
284+
weights, len(grp_ptr) - 1, np.zeros(1, dtype=np.int32),
291285
X_sparse_scaling.any())
292286
theta /= scal
293287

294288
# celer modifies w, Xw, and theta in place:
295-
if pb == GRPLASSO: # TODO this if else scheme is complicated
289+
if pb == GRPLASSO:
290+
# TODO this if else scheme is complicated
296291
sol = celer_grp(
297292
is_sparse, LASSO, X_dense, grp_indices, grp_ptr, X_data,
298-
X_indices,
299-
X_indptr, X_sparse_scaling, y, alpha, w, Xw, theta,
300-
norms_X_grp, tol, max_iter, max_epochs, p0=p0,
293+
X_indices, X_indptr, X_sparse_scaling, y, alpha, w, Xw, theta,
294+
norms_X_grp, tol, weights, max_iter, max_epochs, p0=p0,
301295
prune=prune, verbose=verbose)
302296
elif pb == LASSO or (pb == LOGREG and not use_PN):
303297
sol = celer(
@@ -325,6 +319,26 @@ def celer_path(X, y, pb, eps=1e-3, n_alphas=100, alphas=None,
325319
return results
326320

327321

322+
def _check_weights(weights, pb, X, n_groups):
323+
"""Handle weights cases."""
324+
if weights is None:
325+
n_weights = n_groups if pb == GRPLASSO else X.shape[1]
326+
weights = np.ones(n_weights, dtype=X.dtype)
327+
elif (weights <= 0).any():
328+
raise ValueError("0 or negative weights are not supported.")
329+
else:
330+
expected_n_weights = n_groups if pb == GRPLASSO else X.shape[1]
331+
feat_or_grp = "groups" if pb == GRPLASSO else "features"
332+
333+
if weights.shape[0] != expected_n_weights:
334+
raise ValueError(
335+
f"As many weights as {feat_or_grp} must be passed. "
336+
f"Expected {expected_n_weights}, got {weights.shape[0]}."
337+
)
338+
339+
return weights
340+
341+
328342
def _sparse_and_dense(X):
329343
if sparse.issparse(X):
330344
X_dense = np.empty([1, 1], order='F', dtype=X.data.dtype)

celer/tests/test_docstring_parameters.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from pkgutil import walk_packages
99
from inspect import getsource
1010

11+
from numpydoc import docscrape
1112
import celer
1213

1314
# copied from sklearn.fixes
@@ -65,7 +66,6 @@ def get_name(func):
6566

6667
def check_parameters_match(func, doc=None):
6768
"""Check docstring, return list of incorrect results."""
68-
from numpydoc import docscrape
6969
incorrect = []
7070
name_ = get_name(func)
7171
if not name_.startswith('celer.'):
@@ -108,8 +108,6 @@ def check_parameters_match(func, doc=None):
108108
# @requires_numpydoc
109109
def test_docstring_parameters():
110110
"""Test module docstring formatting."""
111-
from numpydoc import docscrape
112-
113111
public_modules_ = public_modules[:]
114112

115113
incorrect = []

0 commit comments

Comments (0)