POC - skglm GPU support #149
Closed
52 commits (all by Badr-MOUFAD):

42f6736  FISTA CPU
116dda1  cupy solver
ff79040  unittest eval optimality condition
1385900  cleanups cpu solver
e63cdb7  jax solver
d49291c  add unittest jax solver
ec8c663  pass flake8
b703d25  numba solver layout
c82e352  numba cuda utils
8f57931  fix numba solver
68c0b71  unittest numba solver
0677519  numba solver example
cd686b3  move into solvers folder
dd7298a  add README to install
3b95086  fix conda env name
204c5e7  fix bug numba solver
b7ce4fe  update unittest & example
71819a0  fix bug init numba
b6372f9  base class for Fista solvers
97ceb4d  sparse matrix support CPU & CuPy
2ad2f2c  unittest sparse data
b8c753a  base quadratic and L1
872f9b3  refactor CPU solver
0aa15b3  test utils and fixes
14efe7b  unittest FISTA CPU
05a4f36  sparse data unittest
d23ac12  modular CuPy solver
601eb86  fix cupy verbose
761ab54  modular jax
6274e5f  unittest jax
3af102e  sparse matrices modular jax
715d3fb  modular Numba solver
34cc4a8  unittest numba && dev utils
b6f971c  comments && prob formula
a7d1375  Numba with shared memory
c786a12  Numba shared memory version
2c4cd63  kernels as static methods && Numba fix tests
e3ac70a  sparse Numba solver
ce4367d  fix bug numba gradient
c8fd8a1  fix bug numba sparse residual
a6df22a  n_samples instead of shape
cf5dc9e  Numba_solver: striding for scalable kernels
20f9274  Numba_L1: striding for scalable kernels
d8d7157  Numba sparse datafit: striding
4e4e6c1  Numba dense datafit: striding
1d07d9e  info comments Numba solver
ca9f694  update installation && normalize df and pen cupy
324cac5  pytorch solver [buggy]
c5c1dfe  fix grad bug pytorch solver && unittest
32f1014  pytorch solver sparse data
0caa9f9  set order between jax pytorch && xfail sparse and auto_diff false
545a27f  test on obj value
@@ -0,0 +1,34 @@
## Installation

1. checkout the branch
```shell
# add the remote if it doesn't exist (check with: git remote -v)
git remote add Badr-MOUFAD https://github.com/Badr-MOUFAD/skglm.git

git fetch Badr-MOUFAD skglm-gpu

git checkout skglm-gpu
```

2. create then activate a ``conda`` environment
```shell
# create
conda create -n skglm-gpu python=3.7

# activate env
conda activate skglm-gpu
```

3. install ``skglm`` in editable mode
```shell
pip install -e .
```

4. install dependencies
```shell
# cupy
conda install -c conda-forge cupy cudatoolkit=11.5

# jax
conda install jaxlib=*=*cuda* jax cuda-nvcc -c conda-forge -c nvidia
```
@@ -0,0 +1,6 @@
"""Solve Lasso problem using FISTA GPU-implementation.

Problem reads::

    min_w (1/2) * ||y - Xw||^2 + lmbd * ||w||_1
"""
@@ -0,0 +1,56 @@
import time

import numpy as np
from numpy.linalg import norm

from skglm.gpu.solvers import NumbaSolver, CPUSolver

from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit


random_state = 1265
n_samples, n_features = 10_000, 500
reg = 1e-2

# generate dummy data
rng = np.random.RandomState(random_state)
X = rng.randn(n_samples, n_features)
y = rng.randn(n_samples)


# set lambda
lmbd_max = norm(X.T @ y, ord=np.inf)
lmbd = reg * lmbd_max

# warm-up run (triggers kernel compilation before timing)
solver = NumbaSolver(verbose=0)
solver.max_iter = 10
solver.solve(X, y, lmbd)

# solve problem
start = time.perf_counter()
solver.max_iter = 1000
w_gpu = solver.solve(X, y, lmbd)
end = time.perf_counter()

print("gpu time: ", end - start)


solver_cpu = CPUSolver()
start = time.perf_counter()
w_cpu = solver_cpu.solve(X, y, lmbd)
end = time.perf_counter()
print("cpu time: ", end - start)


print(
    "Objective\n"
    f"gpu : {compute_obj(X, y, lmbd, w_gpu):.8f}\n"
    f"cpu : {compute_obj(X, y, lmbd, w_cpu):.8f}"
)


print(
    "Optimality condition\n"
    f"gpu : {eval_opt_crit(X, y, lmbd, w_gpu):.8f}\n"
    f"cpu : {eval_opt_crit(X, y, lmbd, w_cpu):.8f}"
)
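A natural follow-up check, not in the PR script (the tolerance here is a guess), is to compare the two solutions directly:

```python
# both solvers target the same problem, so the solutions should agree
# up to solver tolerance
print("max |w_gpu - w_cpu|:", np.max(np.abs(w_gpu - w_cpu)))
print("allclose:", np.allclose(w_gpu, w_cpu, atol=1e-6))
```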
@@ -0,0 +1,4 @@ | ||
from skglm.gpu.solvers.cpu_solver import CPUSolver # noqa | ||
from skglm.gpu.solvers.cupy_solver import CupySolver # noqa | ||
from skglm.gpu.solvers.jax_solver import JaxSolver # noqa | ||
from skglm.gpu.solvers.numba_solver import NumbaSolver # noqa |
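All four classes expose the same `solve(X, y, lmbd)` signature, so backends can be swapped or benchmarked in a loop. A minimal sketch, assuming the GPU backends are installed:

```python
import numpy as np
from skglm.gpu.solvers import CPUSolver, CupySolver, JaxSolver, NumbaSolver

rng = np.random.RandomState(0)
X, y = rng.randn(500, 100), rng.randn(500)
lmbd = 1e-2 * np.linalg.norm(X.T @ y, ord=np.inf)

# run every backend on the same problem and print the solution norm
for solver in (CPUSolver(), CupySolver(), JaxSolver(), NumbaSolver()):
    w = solver.solve(X, y, lmbd)
    print(f"{type(solver).__name__:12s} ||w|| = {np.linalg.norm(w):.6f}")
```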
@@ -0,0 +1,56 @@
import numpy as np

from skglm.utils.prox_funcs import ST_vec
from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit


class CPUSolver:

    def __init__(self, max_iter=1000, verbose=0):
        self.max_iter = max_iter
        self.verbose = verbose

    def solve(self, X, y, lmbd):
        n_samples, n_features = X.shape

        # compute step
        lipschitz = np.linalg.norm(X, ord=2) ** 2
        if lipschitz == 0.:
            return np.zeros(n_features)

        step = 1 / lipschitz

        # init vars
        w = np.zeros(n_features)
        old_w = np.zeros(n_features)
        mid_w = np.zeros(n_features)
        grad = np.zeros(n_features)

        t_old, t_new = 1, 1

        for it in range(self.max_iter):

            # compute grad
            grad = X.T @ (X @ mid_w - y)

            # forward / backward
            mid_w = mid_w - step * grad
            w = ST_vec(mid_w, step * lmbd)

            if self.verbose:
                p_obj = compute_obj(X, y, lmbd, w)
                opt_crit = eval_opt_crit(X, y, lmbd, w)

                print(
                    f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}"
                )

            # extrapolate
            mid_w = w + ((t_old - 1) / t_new) * (w - old_w)

            # update FISTA vars
            t_old = t_new
            t_new = (1 + np.sqrt(1 + 4 * t_old ** 2)) / 2
            old_w = np.copy(w)

        return w
@@ -0,0 +1,65 @@
import cupy as cp
import numpy as np

from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit


class CupySolver:

    def __init__(self, max_iter=1000, verbose=0):
        self.max_iter = max_iter
        self.verbose = verbose

    def solve(self, X, y, lmbd):
        n_samples, n_features = X.shape

        # compute step
        lipschitz = np.linalg.norm(X, ord=2) ** 2
        if lipschitz == 0.:
            return np.zeros(n_features)

        step = 1 / lipschitz

        # transfer to device
        X_gpu = cp.array(X)
        y_gpu = cp.array(y)

        # init vars in device
        w = cp.zeros(n_features)
        old_w = cp.zeros(n_features)
        mid_w = cp.zeros(n_features)
        grad = cp.zeros(n_features)

        t_old, t_new = 1, 1

        for it in range(self.max_iter):

            # compute grad
            cp.dot(X_gpu.T, X_gpu @ mid_w - y_gpu, out=grad)

            # forward / backward: w = ST(mid_w - step * grad, step * lmbd)
            mid_w = mid_w - step * grad
            w = cp.sign(mid_w) * cp.maximum(cp.abs(mid_w) - step * lmbd, 0.)

            if self.verbose:
                w_cpu = cp.asnumpy(w)

                p_obj = compute_obj(X, y, lmbd, w_cpu)
                opt_crit = eval_opt_crit(X, y, lmbd, w_cpu)

                print(
                    f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}"
                )

            # extrapolate
            mid_w = w + ((t_old - 1) / t_new) * (w - old_w)

            # update FISTA vars
            t_old = t_new
            t_new = (1 + cp.sqrt(1 + 4 * t_old ** 2)) / 2
            old_w = cp.copy(w)

        # transfer back to host
        w_cpu = cp.asnumpy(w)

        return w_cpu
@@ -0,0 +1,89 @@
# if not set, raises an error related to CUDA linking API.
# as recommended, setting the 'XLA_FLAGS' to bypass it.
# side-effect: (perhaps) slow compilation time.
import os
os.environ['XLA_FLAGS'] = '--xla_gpu_force_compilation_parallelism=1'  # noqa

import numpy as np  # noqa

import jax  # noqa
import jax.numpy as jnp  # noqa
# set float64 as default float type.
# if not, amplifies rounding errors.
jax.config.update("jax_enable_x64", True)  # noqa

from skglm.gpu.utils.host_utils import compute_obj, eval_opt_crit  # noqa


class JaxSolver:

    def __init__(self, max_iter=1000, use_auto_diff=True, verbose=0):
        self.max_iter = max_iter
        self.use_auto_diff = use_auto_diff
        self.verbose = verbose

    def solve(self, X, y, lmbd):
        n_samples, n_features = X.shape

        # compute step
        lipschitz = np.linalg.norm(X, ord=2) ** 2
        if lipschitz == 0.:
            return np.zeros(n_features)

        step = 1 / lipschitz

        # transfer to device
        X_gpu = jnp.asarray(X)
        y_gpu = jnp.asarray(y)

        # get grad func of datafit
        if self.use_auto_diff:
            grad_quad_loss = jax.grad(_quad_loss)

        # init vars in device
        w = jnp.zeros(n_features)
        old_w = jnp.zeros(n_features)
        mid_w = jnp.zeros(n_features)
        grad = jnp.zeros(n_features)

        t_old, t_new = 1, 1

        for it in range(self.max_iter):

            # compute grad
            if self.use_auto_diff:
                grad = grad_quad_loss(mid_w, X_gpu, y_gpu)
            else:
                grad = jnp.dot(X_gpu.T, jnp.dot(X_gpu, mid_w) - y_gpu)

            # forward / backward
            mid_w = mid_w - step * grad
            w = jnp.sign(mid_w) * jnp.maximum(jnp.abs(mid_w) - step * lmbd, 0.)

            if self.verbose:
                w_cpu = np.asarray(w, dtype=np.float64)

                p_obj = compute_obj(X, y, lmbd, w_cpu)
                opt_crit = eval_opt_crit(X, y, lmbd, w_cpu)

                print(
                    f"Iteration {it:4}: p_obj={p_obj:.8f}, opt crit={opt_crit:.4e}"
                )

            # extrapolate
            mid_w = w + ((t_old - 1) / t_new) * (w - old_w)

            # update FISTA vars
            t_old = t_new
            t_new = 0.5 * (1 + jnp.sqrt(1. + 4. * t_old ** 2))
            old_w = jnp.copy(w)

        # transfer back to host
        w_cpu = np.asarray(w, dtype=np.float64)

        return w_cpu


def _quad_loss(w, X_gpu, y_gpu):
    pred_y = jnp.dot(X_gpu, w)
    return 0.5 * jnp.sum((y_gpu - pred_y) ** 2)
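Not part of the PR, but a possible follow-up for the JAX backend: because the loop body above is plain Python, each iteration dispatches several small XLA ops. A sketch of jitting one FISTA step (hypothetical helper, mirroring the loop above) could reduce that overhead:

```python
import jax
import jax.numpy as jnp


@jax.jit
def _fista_step(mid_w, old_w, t_old, t_new, X_gpu, y_gpu, step, lmbd):
    # one FISTA iteration, same ordering as JaxSolver.solve above
    grad = jnp.dot(X_gpu.T, jnp.dot(X_gpu, mid_w) - y_gpu)
    z = mid_w - step * grad
    w = jnp.sign(z) * jnp.maximum(jnp.abs(z) - step * lmbd, 0.)

    mid_w = w + ((t_old - 1) / t_new) * (w - old_w)
    t_old, t_new = t_new, 0.5 * (1 + jnp.sqrt(1. + 4. * t_new ** 2))
    return mid_w, w, t_old, t_new

# inside the loop: mid_w, old_w, t_old, t_new = _fista_step(
#     mid_w, old_w, t_old, t_new, X_gpu, y_gpu, step, lmbd)
```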
Review comment: In order to test the design choices, can you call this FISTAJax and make it modular (i.e., pass an objective function directly)? And the same for the other solvers.
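A rough sketch of the requested interface (all names hypothetical, not code from this PR): the solver takes the datafit as a callable and the penalty's prox as another callable, and differentiates the datafit with jax.grad, so swapping the objective requires no change to the solver itself.

```python
import jax
import jax.numpy as jnp
import numpy as np

jax.config.update("jax_enable_x64", True)


class FistaJax:
    """FISTA with a pluggable datafit (callable) and penalty prox (callable)."""

    def __init__(self, datafit, prox, max_iter=1000):
        self.datafit = datafit    # w -> scalar loss
        self.prox = prox          # (u, step) -> prox of the penalty at u
        self.max_iter = max_iter

    def solve(self, w_init, step):
        grad_fn = jax.grad(self.datafit)

        w = old_w = mid_w = w_init
        t_old = t_new = 1.

        for _ in range(self.max_iter):
            # forward / backward
            w = self.prox(mid_w - step * grad_fn(mid_w), step)

            # FISTA extrapolation (same ordering as the solvers in this PR)
            mid_w = w + ((t_old - 1) / t_new) * (w - old_w)
            t_old, t_new = t_new, 0.5 * (1 + jnp.sqrt(1. + 4. * t_new ** 2))
            old_w = w

        return w


# usage on the Lasso: quadratic datafit + L1 prox (soft-thresholding)
rng = np.random.RandomState(0)
X = jnp.asarray(rng.randn(100, 20))
y = jnp.asarray(rng.randn(100))
lmbd = 1e-1 * jnp.max(jnp.abs(X.T @ y))


def datafit(w):
    return 0.5 * jnp.sum((y - X @ w) ** 2)


def prox_l1(u, step):
    return jnp.sign(u) * jnp.maximum(jnp.abs(u) - step * lmbd, 0.)


step = 1 / np.linalg.norm(np.asarray(X), ord=2) ** 2
w_hat = FistaJax(datafit, prox_l1, max_iter=500).solve(jnp.zeros(20), step)
```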