
Commit b6c664c

Merge branch 'main' of https://github.com/scikit-learn-contrib/skglm into fista
2 parents: cbc5418 + 1d4de0f

File tree

12 files changed (+249, -71 lines)


doc/api.rst

Lines changed: 1 addition & 0 deletions
@@ -55,6 +55,7 @@ Datafits
 
    Huber
    Logistic
+   LogisticGroup
    Quadratic
    QuadraticGroup
    QuadraticSVC

skglm/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-__version__ = '0.2dev'
+__version__ = '0.3.dev'
 
 from skglm.estimators import (  # noqa F401
     Lasso, WeightedLasso, ElasticNet, MCPRegression, MultiTaskLasso, LinearSVC,

skglm/datafits/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,12 +1,12 @@
 from .base import BaseDatafit, BaseMultitaskDatafit
 from .single_task import Quadratic, QuadraticSVC, Logistic, Huber, Poisson
 from .multi_task import QuadraticMultiTask
-from .group import QuadraticGroup
+from .group import QuadraticGroup, LogisticGroup
 
 
 __all__ = [
     BaseDatafit, BaseMultitaskDatafit,
     Quadratic, QuadraticSVC, Logistic, Huber, Poisson,
     QuadraticMultiTask,
-    QuadraticGroup
+    QuadraticGroup, LogisticGroup
 ]
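
With LogisticGroup re-exported here, both group datafits are part of the public skglm.datafits namespace. A quick smoke test (assuming an install that includes this commit):

# Both group datafits are now importable from the package namespace.
from skglm.datafits import QuadraticGroup, LogisticGroup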

skglm/datafits/group.py

Lines changed: 61 additions & 0 deletions
@@ -3,6 +3,7 @@
 from numba import int32, float64
 
 from skglm.datafits.base import BaseDatafit
+from skglm.datafits.single_task import Logistic
 
 
 class QuadraticGroup(BaseDatafit):
@@ -71,3 +72,63 @@ def gradient_scalar(self, X, y, w, Xw, j):
 
     def intercept_update_step(self, y, Xw):
         return np.mean(Xw - y)
+
+
+class LogisticGroup(Logistic):
+    r"""Logistic datafit used with group penalties.
+
+    The datafit reads::
+
+        (1 / n_samples) * \sum_i log(1 + exp(-y_i * Xw_i))
+
+    Attributes
+    ----------
+    grp_indices : array, shape (n_features,)
+        The group indices stacked contiguously
+        ([grp1_indices, grp2_indices, ...]).
+
+    grp_ptr : array, shape (n_groups + 1,)
+        The group pointers such that two consecutive elements delimit
+        the indices of a group in ``grp_indices``.
+
+    lipschitz : array, shape (n_groups,)
+        The lipschitz constants for each group.
+    """
+
+    def __init__(self, grp_ptr, grp_indices):
+        self.grp_ptr, self.grp_indices = grp_ptr, grp_indices
+
+    def get_spec(self):
+        spec = (
+            ('grp_ptr', int32[:]),
+            ('grp_indices', int32[:]),
+            ('lipschitz', float64[:])
+        )
+        return spec
+
+    def params_to_dict(self):
+        return dict(grp_ptr=self.grp_ptr,
+                    grp_indices=self.grp_indices)
+
+    def initialize(self, X, y):
+        grp_ptr, grp_indices = self.grp_ptr, self.grp_indices
+        n_groups = len(grp_ptr) - 1
+
+        lipschitz = np.zeros(n_groups)
+        for g in range(n_groups):
+            grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]]
+            X_g = X[:, grp_g_indices]
+            lipschitz[g] = norm(X_g, ord=2) ** 2 / (4 * len(y))
+
+        self.lipschitz = lipschitz
+
+    def gradient_g(self, X, y, w, Xw, g):
+        grp_ptr, grp_indices = self.grp_ptr, self.grp_indices
+        grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]]
+        raw_grad_val = self.raw_grad(y, Xw)
+
+        grad_g = np.zeros(len(grp_g_indices))
+        for idx, j in enumerate(grp_g_indices):
+            grad_g[idx] = X[:, j] @ raw_grad_val
+
+        return grad_g
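
The grp_ptr / grp_indices pair used here is a CSR-style encoding of feature groups: group g owns the features grp_indices[grp_ptr[g]:grp_ptr[g + 1]]. A minimal sketch of that layout and of the per-group Lipschitz constants computed in initialize (the data and group sizes are illustrative, and everything runs in plain NumPy rather than through skglm's numba compilation):

import numpy as np

# Three contiguous groups over 6 features: {0, 1}, {2, 3}, {4, 5}.
grp_indices = np.arange(6, dtype=np.int32)
grp_ptr = np.array([0, 2, 4, 6], dtype=np.int32)

rng = np.random.default_rng(0)
X = rng.standard_normal((50, 6))
y = np.sign(rng.standard_normal(50))

for g in range(len(grp_ptr) - 1):
    # Features of group g, exactly as LogisticGroup slices them.
    X_g = X[:, grp_indices[grp_ptr[g]:grp_ptr[g + 1]]]
    # Spectral norm squared over 4 * n_samples, mirroring initialize().
    print(g, np.linalg.norm(X_g, ord=2) ** 2 / (4 * len(y)))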

skglm/datafits/multi_task.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def value(self, Y, W, XW):
     def gradient_j(self, X, Y, W, XW, j):
         """Gradient with respect to j-th coordinate of W."""
         n_samples = X.shape[0]
-        return (X[:, j:j+1].T @ XW - self.XtY[j, :]) / n_samples
+        return (X[:, j] @ XW - self.XtY[j, :]) / n_samples
 
     def gradient_j_sparse(self, X_data, X_indptr, X_indices, Y, XW, j):
         """Gradient with respect to j-th coordinate of W when X is sparse."""

skglm/experimental/sqrt_lasso.py

Lines changed: 2 additions & 1 deletion
@@ -146,7 +146,8 @@ def path(self, X, y, alphas=None, eps=1e-3, n_alphas=10):
         """
         if not hasattr(self, "solver_"):
             self.solver_ = ProxNewton(
-                tol=self.tol, max_iter=self.max_iter, verbose=self.verbose)
+                tol=self.tol, max_iter=self.max_iter, verbose=self.verbose,
+                fit_intercept=False)
         # build path
         if alphas is None:
             alpha_max = norm(X.T @ y, ord=np.inf) / (np.sqrt(len(y)) * norm(y))
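
The alpha_max in the context line is the smallest regularization level at which w = 0 is already optimal for the square-root Lasso. A sketch of how a path grid could be built from it; the geometric spacing is an assumption here, since the actual grid construction falls outside this hunk:

import numpy as np
from numpy.linalg import norm

rng = np.random.default_rng(0)
X, y = rng.standard_normal((20, 10)), rng.standard_normal(20)

eps, n_alphas = 1e-3, 10  # defaults from the path() signature above
alpha_max = norm(X.T @ y, ord=np.inf) / (np.sqrt(len(y)) * norm(y))
alphas = alpha_max * np.geomspace(1, eps, n_alphas)  # assumed geometric grid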

skglm/penalties/block_separable.py

Lines changed: 3 additions & 3 deletions
@@ -286,8 +286,8 @@ def prox_1group(self, value, stepsize, g):
     def subdiff_distance(self, w, grad_ws, ws):
         """Compute distance to the subdifferential at ``w`` of negative gradient.
 
-        Note: ``grad_ws`` is a stacked array of ``-``gradients.
-        ([-grad_ws_1, -grad_ws_2, ...])
+        Note: ``grad_ws`` is a stacked array of gradients.
+        ([grad_ws_1, grad_ws_2, ...])
         """
         alpha, weights = self.alpha, self.weights
         grp_ptr, grp_indices = self.grp_ptr, self.grp_indices
@@ -307,7 +307,7 @@ def subdiff_distance(self, w, grad_ws, ws):
                 scores[idx] = max(0, norm(grad_g) - alpha * weights[g])
             else:
                 subdiff = alpha * weights[g] * w_g / norm_w_g
-                scores[idx] = norm(grad_g - subdiff)
+                scores[idx] = norm(grad_g + subdiff)
 
         return scores
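
Both edits switch the docstring and the distance to the convention that grad_ws stacks raw (un-negated) gradients. For an active group, stationarity of datafit plus group penalty reads grad_g = -alpha * weights[g] * w_g / ||w_g||, so the violation is ||grad_g + subdiff||, exactly as the fixed line computes. A toy check of that convention:

import numpy as np
from numpy.linalg import norm

alpha, weight_g = 0.5, 1.0
w_g = np.array([3.0, -4.0])                   # active group, ||w_g|| = 5
grad_g = -alpha * weight_g * w_g / norm(w_g)  # gradient at a stationary point

subdiff = alpha * weight_g * w_g / norm(w_g)
print(norm(grad_g + subdiff))  # 0.0: zero subdifferential distance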

skglm/solvers/group_bcd.py

Lines changed: 1 addition & 1 deletion
@@ -170,6 +170,6 @@ def _construct_grad(X, y, w, Xw, datafit, ws):
     grad_ptr = 0
     for g in ws:
         grad_g = datafit.gradient_g(X, y, w, Xw, g)
-        grads[grad_ptr: grad_ptr+len(grad_g)] = -grad_g
+        grads[grad_ptr: grad_ptr+len(grad_g)] = grad_g
         grad_ptr += len(grad_g)
     return grads
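
This is the solver-side half of the sign-convention change: _construct_grad now stacks the raw per-group gradients, which is what the updated subdiff_distance in block_separable.py expects, so the two changes together keep the optimality scores consistent.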

skglm/solvers/multitask_bcd.py

Lines changed: 2 additions & 2 deletions
@@ -369,8 +369,8 @@ def _bcd_epoch(X, Y, W, XW, datafit, penalty, ws):
             continue
         Xj = X[:, j]
         old_W_j = W[j, :].copy()  # copy is very important here
-        W[j:j+1, :] = penalty.prox_1feat(
-            W[j:j+1, :] - datafit.gradient_j(X, Y, W, XW, j) / lc[j],
+        W[j, :] = penalty.prox_1feat(
+            W[j, :] - datafit.gradient_j(X, Y, W, XW, j) / lc[j],
             1 / lc[j], j)
         if not np.all(W[j, :] == old_W_j):
             for k in range(n_tasks):
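
This mirrors the gradient_j shape fix in multi_task.py: with the gradient now 1-D, the update can read and write the row W[j, :] directly instead of going through the (1, n_tasks) view W[j:j+1, :]. A minimal sketch of such a 1-D row update, with block soft-thresholding standing in for prox_1feat (an assumption; the real prox depends on the penalty):

import numpy as np

W = np.zeros((5, 3))            # (n_features, n_tasks)
z = np.array([0.9, -0.2, 1.5])  # W[j, :] minus a 1-D gradient step
thresh = 0.5

# Block soft-thresholding of the row, as an L2,1 penalty's prox would do.
W[0, :] = max(0.0, 1 - thresh / np.linalg.norm(z)) * z
print(W[0, :])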
