FIX time overhead when fitting the Lasso estimator: compute ``global_lipschitz`` only in the FISTA solver (#129)

Badr-MOUFAD · web-flow · commit 9fe4bae4409e · 2022-11-30T16:54:05.000+01:00
diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py
@@ -17,7 +17,8 @@ class Quadratic(BaseDatafit):
     Attributes
     ----------
     Xty : array, shape (n_features,)
-        Pre-computed quantity used during the gradient evaluation. Equal to X.T @ y.
+        Pre-computed quantity used during the gradient evaluation.
+        Equal to ``X.T @ y``.
 
     lipschitz : array, shape (n_features,)
         The coordinatewise gradient Lipschitz constants. Equal to
@@ -50,7 +51,7 @@ def params_to_dict(self):
     def initialize(self, X, y):
         self.Xty = X.T @ y
         n_features = X.shape[1]
-        self.global_lipschitz = norm(X, ord=2) ** 2 / len(y)
+
         self.lipschitz = np.zeros(n_features, dtype=X.dtype)
         for j in range(n_features):
             self.lipschitz[j] = (X[:, j] ** 2).sum() / len(y)
@@ -59,9 +60,6 @@ def initialize_sparse(self, X_data, X_indptr, X_indices, y):
         n_features = len(X_indptr) - 1
         self.Xty = np.zeros(n_features, dtype=X_data.dtype)
 
-        self.global_lipschitz = spectral_norm(X_data, X_indptr, X_indices, len(y)) ** 2
-        self.global_lipschitz /= len(y)
-
         self.lipschitz = np.zeros(n_features, dtype=X_data.dtype)
         for j in range(n_features):
             nrm2 = 0.
@@ -73,6 +71,13 @@ def initialize_sparse(self, X_data, X_indptr, X_indices, y):
             self.lipschitz[j] = nrm2 / len(y)
             self.Xty[j] = xty
 
+    def init_global_lipschitz(self, X, y):
+        self.global_lipschitz = norm(X, ord=2) ** 2 / len(y)
+
+    def init_global_lipschitz_sparse(self, X_data, X_indptr, X_indices, y):
+        self.global_lipschitz = spectral_norm(
+            X_data, X_indptr, X_indices, len(y)) ** 2 / len(y)
+
     def value(self, y, w, Xw):
         return np.sum((y - Xw) ** 2) / (2 * len(Xw))
 
@@ -155,19 +160,22 @@ def raw_hessian(self, y, Xw):
 
     def initialize(self, X, y):
         self.lipschitz = (X ** 2).sum(axis=0) / (len(y) * 4)
-        self.global_lipschitz = norm(X, ord=2) ** 2 / (len(y) * 4)
 
     def initialize_sparse(self, X_data, X_indptr, X_indices, y):
         n_features = len(X_indptr) - 1
 
-        self.global_lipschitz = spectral_norm(X_data, X_indptr, X_indices, len(y)) ** 2
-        self.global_lipschitz /= 4 * len(y)
-
         self.lipschitz = np.zeros(n_features, dtype=X_data.dtype)
         for j in range(n_features):
             Xj = X_data[X_indptr[j]:X_indptr[j+1]]
             self.lipschitz[j] = (Xj ** 2).sum() / (len(y) * 4)
 
+    def init_global_lipschitz(self, X, y):
+        self.global_lipschitz = norm(X, ord=2) ** 2 / (4 * len(y))
+
+    def init_global_lipschitz_sparse(self, X_data, X_indptr, X_indices, y):
+        self.global_lipschitz = spectral_norm(
+            X_data, X_indptr, X_indices, len(y)) ** 2 / (4 * len(y))
+
     def value(self, y, w, Xw):
         return np.log(1. + np.exp(- y * Xw)).sum() / len(y)
 
@@ -235,23 +243,27 @@ def params_to_dict(self):
     def initialize(self, yXT, y):
         n_features = yXT.shape[1]
         self.lipschitz = np.zeros(n_features, dtype=yXT.dtype)
-        self.global_lipschitz = norm(yXT, ord=2) ** 2
+
         for j in range(n_features):
             self.lipschitz[j] = norm(yXT[:, j]) ** 2
 
     def initialize_sparse(self, yXT_data, yXT_indptr, yXT_indices, y):
         n_features = len(yXT_indptr) - 1
 
-        self.global_lipschitz = spectral_norm(
-            yXT_data, yXT_indptr, yXT_indices, max(yXT_indices)+1) ** 2
-
         self.lipschitz = np.zeros(n_features, dtype=yXT_data.dtype)
         for j in range(n_features):
             nrm2 = 0.
             for idx in range(yXT_indptr[j], yXT_indptr[j + 1]):
                 nrm2 += yXT_data[idx] ** 2
             self.lipschitz[j] = nrm2
 
+    def init_global_lipschitz(self, yXT, y):
+        self.global_lipschitz = norm(yXT, ord=2) ** 2
+
+    def init_global_lipschitz_sparse(self, yXT_data, yXT_indptr, yXT_indices, y):
+        self.global_lipschitz = spectral_norm(
+            yXT_data, yXT_indptr, yXT_indices, max(yXT_indices)+1) ** 2
+
     def value(self, y, w, yXTw):
         return (yXTw ** 2).sum() / 2 - np.sum(w)
 
@@ -328,24 +340,26 @@ def params_to_dict(self):
     def initialize(self, X, y):
         n_features = X.shape[1]
         self.lipschitz = np.zeros(n_features, dtype=X.dtype)
-        self.global_lipschitz = 0.
         for j in range(n_features):
             self.lipschitz[j] = (X[:, j] ** 2).sum() / len(y)
-            self.global_lipschitz += (X[:, j] ** 2).sum() / len(y)
 
     def initialize_sparse(self, X_data, X_indptr, X_indices, y):
         n_features = len(X_indptr) - 1
 
-        self.global_lipschitz = spectral_norm(X_data, X_indptr, X_indices, len(y)) ** 2
-        self.global_lipschitz /= len(y)
-
         self.lipschitz = np.zeros(n_features, dtype=X_data.dtype)
         for j in range(n_features):
             nrm2 = 0.
             for idx in range(X_indptr[j], X_indptr[j + 1]):
                 nrm2 += X_data[idx] ** 2
             self.lipschitz[j] = nrm2 / len(y)
 
+    def init_global_lipschitz(self, X, y):
+        self.global_lipschitz = norm(X, ord=2) ** 2 / len(y)
+
+    def init_global_lipschitz_sparse(self, X_data, X_indptr, X_indices, y):
+        self.global_lipschitz = spectral_norm(
+            X_data, X_indptr, X_indices, len(y)) ** 2 / len(y)
+
     def value(self, y, w, Xw):
         n_samples = len(y)
         res = 0.
diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py
@@ -39,23 +39,31 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
         p_objs_out = []
         n_samples, n_features = X.shape
         all_features = np.arange(n_features)
+        X_is_sparse = issparse(X)
         t_new = 1.
 
         w = w_init.copy() if w_init is not None else np.zeros(n_features)
         z = w_init.copy() if w_init is not None else np.zeros(n_features)
         Xw = Xw_init.copy() if Xw_init is not None else np.zeros(n_samples)
 
-        if hasattr(datafit, "global_lipschitz"):
-            lipschitz = datafit.global_lipschitz
-        else:
-            # TODO: OR line search
-            raise Exception("Line search is not yet implemented for FISTA solver.")
+        try:
+            if X_is_sparse:
+                datafit.init_global_lipschitz_sparse(X.data, X.indptr, X.indices, y)
+            else:
+                datafit.init_global_lipschitz(X, y)
+        except AttributeError:
+            sparse_suffix = '_sparse' if X_is_sparse else ''
+
+            raise Exception(
+                "Datafit is not compatible with FISTA solver.\n Datafit must "
+                f"implement `init_global_lipschitz{sparse_suffix}` method")
 
+        lipschitz = datafit.global_lipschitz
         for n_iter in range(self.max_iter):
             t_old = t_new
             t_new = (1 + np.sqrt(1 + 4 * t_old ** 2)) / 2
             w_old = w.copy()
-            if issparse(X):
+            if X_is_sparse:
                 grad = construct_grad_sparse(
                     X.data, X.indptr, X.indices, y, z, X @ z, datafit, all_features)
             else: