From aabbf9136f219102d17ee53430a484ea79221c4d Mon Sep 17 00:00:00 2001 From: Carrivain Pascal Date: Thu, 14 Dec 2023 10:31:00 +0100 Subject: [PATCH 01/52] fix add support for intercept in SqrtLasso --- skglm/experimental/sqrt_lasso.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/skglm/experimental/sqrt_lasso.py b/skglm/experimental/sqrt_lasso.py index 97c10105d..3175d5bd4 100644 --- a/skglm/experimental/sqrt_lasso.py +++ b/skglm/experimental/sqrt_lasso.py @@ -101,10 +101,13 @@ class SqrtLasso(LinearModel, RegressorMixin): verbose : bool, default False Amount of verbosity. 0/False is silent. + + fit_intercept: bool, default True + xxx """ def __init__(self, alpha=1., max_iter=100, max_pn_iter=100, p0=10, - tol=1e-4, verbose=0): + tol=1e-4, verbose=0, fit_intercept=True): super().__init__() self.alpha = alpha self.max_iter = max_iter @@ -113,6 +116,7 @@ def __init__(self, alpha=1., max_iter=100, max_pn_iter=100, p0=10, self.p0 = p0 self.tol = tol self.verbose = verbose + self.fit_intercept = fit_intercept def fit(self, X, y): """Fit the model according to the given training data. @@ -132,7 +136,10 @@ def fit(self, X, y): Fitted estimator. """ self.coef_ = self.path(X, y, alphas=[self.alpha])[1][0] - self.intercept_ = 0. # TODO handle fit_intercept + if self.fit_intercept: + self.intercept_ = self.coef_[-1] + else: + self.intercept_ = 0. return self def path(self, X, y, alphas=None, eps=1e-3, n_alphas=10): @@ -182,7 +189,7 @@ def path(self, X, y, alphas=None, eps=1e-3, n_alphas=10): sqrt_quadratic = compiled_clone(SqrtQuadratic()) l1_penalty = compiled_clone(L1(1.)) # alpha is set along the path - coefs = np.zeros((n_alphas, n_features)) + coefs = np.zeros((n_alphas, n_features + self.fit_intercept)) for i in range(n_alphas): if self.verbose: @@ -193,12 +200,17 @@ def path(self, X, y, alphas=None, eps=1e-3, n_alphas=10): l1_penalty.alpha = alphas[i] # no warm start for the first alpha - coef_init = coefs[i].copy() if i else np.zeros(n_features) + coef_init = coefs[i].copy() if i else np.zeros(n_features + self.fit_intercept) try: - coef, _, _ = self.solver_.solve( - X, y, sqrt_quadratic, l1_penalty, - w_init=coef_init, Xw_init=X @ coef_init) + if self.fit_intercept: + coef, _, _ = self.solver_.solve( + X, y, sqrt_quadratic, l1_penalty, + w_init=coef_init, Xw_init=X @ coef_init) + else: + coef, _, _ = self.solver_.solve( + X, y, sqrt_quadratic, l1_penalty, + w_init=coef_init, Xw_init=X @ coef_init[:-1] + coef_init[-1]) coefs[i] = coef except ValueError as val_exception: # make sure to catch residual error @@ -209,7 +221,10 @@ def path(self, X, y, alphas=None, eps=1e-3, n_alphas=10): # save coef despite not converging # coef_init holds a ref to coef coef = coef_init - res_norm = norm(y - X @ coef) + if self.fit_intercept: + res_norm = norm(y - X @ coef[:-1] - coef[-1]) + else: + res_norm = norm(y - X @ coef) warnings.warn( f"Small residuals prevented the solver from converging " f"at alpha={alphas[i]:.2e} (residuals' norm: {res_norm:.4e}). " From a44c11ff8b28ce6ccc90350291b718f2bf355c09 Mon Sep 17 00:00:00 2001 From: Carrivain Pascal Date: Thu, 14 Dec 2023 14:48:30 +0100 Subject: [PATCH 02/52] fix line too long (91 > 88 characters) --- skglm/experimental/sqrt_lasso.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skglm/experimental/sqrt_lasso.py b/skglm/experimental/sqrt_lasso.py index 3175d5bd4..4b5bc6d7b 100644 --- a/skglm/experimental/sqrt_lasso.py +++ b/skglm/experimental/sqrt_lasso.py @@ -200,7 +200,8 @@ def path(self, X, y, alphas=None, eps=1e-3, n_alphas=10): l1_penalty.alpha = alphas[i] # no warm start for the first alpha - coef_init = coefs[i].copy() if i else np.zeros(n_features + self.fit_intercept) + coef_init = coefs[i].copy() if i else np.zeros(n_features + + self.fit_intercept) try: if self.fit_intercept: From d4e69f22d7b2a3a82e32f667d94d87e198fa060d Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 15 Dec 2023 10:20:09 +0100 Subject: [PATCH 03/52] [CI trigger] From 1975371439ee78edfe7dd04c138ccb25f9abf7cc Mon Sep 17 00:00:00 2001 From: Badr-MOUFAD Date: Fri, 15 Dec 2023 10:29:02 +0100 Subject: [PATCH 04/52] set intercept to false in unittest --- skglm/experimental/tests/test_sqrt_lasso.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skglm/experimental/tests/test_sqrt_lasso.py b/skglm/experimental/tests/test_sqrt_lasso.py index 91722abea..e67883f99 100644 --- a/skglm/experimental/tests/test_sqrt_lasso.py +++ b/skglm/experimental/tests/test_sqrt_lasso.py @@ -31,7 +31,7 @@ def test_vs_statsmodels(): n_alphas = 3 alphas = alpha_max * np.geomspace(1, 1e-2, n_alphas+1)[1:] - sqrt_lasso = SqrtLasso(tol=1e-9) + sqrt_lasso = SqrtLasso(tol=1e-9, fit_intercept=False) coefs_skglm = sqrt_lasso.path(X, y, alphas)[1] coefs_statsmodels = np.zeros((len(alphas), n_features)) @@ -54,7 +54,7 @@ def test_prox_newton_cp(): alpha_max = norm(X.T @ y, ord=np.inf) / norm(y) alpha = alpha_max / 10 - clf = SqrtLasso(alpha=alpha, tol=1e-12).fit(X, y) + clf = SqrtLasso(alpha=alpha, fit_intercept=False, tol=1e-12).fit(X, y) w, _, _ = _chambolle_pock_sqrt(X, y, alpha, max_iter=1000) np.testing.assert_allclose(clf.coef_, w) @@ -70,7 +70,7 @@ def test_PDCD_WS(with_dual_init): dual_init = y / norm(y) if with_dual_init else None w = PDCD_WS(dual_init=dual_init).solve(X, y, SqrtQuadratic(), L1(alpha))[0] - clf = SqrtLasso(alpha=alpha, tol=1e-12).fit(X, y) + clf = SqrtLasso(alpha=alpha, fit_intercept=False, tol=1e-12).fit(X, y) np.testing.assert_allclose(clf.coef_, w, atol=1e-6) From 6f68666074a182dc9ad8115f8db6dc192980a389 Mon Sep 17 00:00:00 2001 From: Carrivain Pascal Date: Mon, 18 Dec 2023 13:20:41 +0100 Subject: [PATCH 05/52] fix inverting the cases (fit_intercept) --- skglm/experimental/sqrt_lasso.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/skglm/experimental/sqrt_lasso.py b/skglm/experimental/sqrt_lasso.py index 4b5bc6d7b..692909363 100644 --- a/skglm/experimental/sqrt_lasso.py +++ b/skglm/experimental/sqrt_lasso.py @@ -207,11 +207,11 @@ def path(self, X, y, alphas=None, eps=1e-3, n_alphas=10): if self.fit_intercept: coef, _, _ = self.solver_.solve( X, y, sqrt_quadratic, l1_penalty, - w_init=coef_init, Xw_init=X @ coef_init) + w_init=coef_init, Xw_init=X @ coef_init[:-1] + coef_init[-1]) else: coef, _, _ = self.solver_.solve( X, y, sqrt_quadratic, l1_penalty, - w_init=coef_init, Xw_init=X @ coef_init[:-1] + coef_init[-1]) + w_init=coef_init, Xw_init=X @ coef_init) coefs[i] = coef except ValueError as val_exception: # make sure to catch residual error @@ -222,10 +222,8 @@ def path(self, X, y, alphas=None, eps=1e-3, n_alphas=10): # save coef despite not converging # coef_init holds a ref to coef coef = coef_init - if self.fit_intercept: - res_norm = norm(y - X @ coef[:-1] - coef[-1]) - else: - res_norm = norm(y - X @ coef) + X_coef = X @ coef[:-1] + coef[-1] if self.fit_intercept else X @ coef + res_norm = norm(y - X_coeff) warnings.warn( f"Small residuals prevented the solver from converging " f"at alpha={alphas[i]:.2e} (residuals' norm: {res_norm:.4e}). " From cdc21ea6e33dc1b8d22c559cba988cf3af697bfb Mon Sep 17 00:00:00 2001 From: Carrivain Pascal Date: Tue, 23 Jan 2024 13:27:25 +0100 Subject: [PATCH 06/52] fix undefined name 'X_coeff' --- skglm/experimental/sqrt_lasso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skglm/experimental/sqrt_lasso.py b/skglm/experimental/sqrt_lasso.py index 692909363..4c8288069 100644 --- a/skglm/experimental/sqrt_lasso.py +++ b/skglm/experimental/sqrt_lasso.py @@ -223,7 +223,7 @@ def path(self, X, y, alphas=None, eps=1e-3, n_alphas=10): # coef_init holds a ref to coef coef = coef_init X_coef = X @ coef[:-1] + coef[-1] if self.fit_intercept else X @ coef - res_norm = norm(y - X_coeff) + res_norm = norm(y - X_coef) warnings.warn( f"Small residuals prevented the solver from converging " f"at alpha={alphas[i]:.2e} (residuals' norm: {res_norm:.4e}). " From c38c00d52bd1f7da33cabeff32623f20960ff0cf Mon Sep 17 00:00:00 2001 From: Carrivain Pascal Date: Thu, 22 Feb 2024 10:13:44 +0100 Subject: [PATCH 07/52] add fit_intercept to test_alpha_max and to self.solver_ --- skglm/experimental/sqrt_lasso.py | 2 +- skglm/experimental/tests/test_sqrt_lasso.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/skglm/experimental/sqrt_lasso.py b/skglm/experimental/sqrt_lasso.py index 4c8288069..ceee128f0 100644 --- a/skglm/experimental/sqrt_lasso.py +++ b/skglm/experimental/sqrt_lasso.py @@ -176,7 +176,7 @@ def path(self, X, y, alphas=None, eps=1e-3, n_alphas=10): if not hasattr(self, "solver_"): self.solver_ = ProxNewton( tol=self.tol, max_iter=self.max_iter, verbose=self.verbose, - fit_intercept=False) + fit_intercept=self.fit_intercept) # build path if alphas is None: alpha_max = norm(X.T @ y, ord=np.inf) / (np.sqrt(len(y)) * norm(y)) diff --git a/skglm/experimental/tests/test_sqrt_lasso.py b/skglm/experimental/tests/test_sqrt_lasso.py index e67883f99..ae0d77935 100644 --- a/skglm/experimental/tests/test_sqrt_lasso.py +++ b/skglm/experimental/tests/test_sqrt_lasso.py @@ -16,7 +16,10 @@ def test_alpha_max(): sqrt_lasso = SqrtLasso(alpha=alpha_max).fit(X, y) - np.testing.assert_equal(sqrt_lasso.coef_, 0) + if sqrt_lasso.fit_intercept: + np.testing.assert_equal(sqrt_lasso.coef_[:-1], 0) + else: + np.testing.assert_equal(sqrt_lasso.coef_, 0) def test_vs_statsmodels(): From ad47b19fac5c726845740f1fe409e324c0802613 Mon Sep 17 00:00:00 2001 From: Carrivain Pascal Date: Fri, 23 Feb 2024 13:09:40 +0100 Subject: [PATCH 08/52] add fit_intercept docstring, factorized duplicate code --- skglm/experimental/sqrt_lasso.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/skglm/experimental/sqrt_lasso.py b/skglm/experimental/sqrt_lasso.py index ceee128f0..55ba908ae 100644 --- a/skglm/experimental/sqrt_lasso.py +++ b/skglm/experimental/sqrt_lasso.py @@ -102,8 +102,8 @@ class SqrtLasso(LinearModel, RegressorMixin): verbose : bool, default False Amount of verbosity. 0/False is silent. - fit_intercept: bool, default True - xxx + fit_intercept: bool, optional (default=True) + Whether or not to fit an intercept. """ def __init__(self, alpha=1., max_iter=100, max_pn_iter=100, p0=10, @@ -204,14 +204,10 @@ def path(self, X, y, alphas=None, eps=1e-3, n_alphas=10): + self.fit_intercept) try: - if self.fit_intercept: - coef, _, _ = self.solver_.solve( - X, y, sqrt_quadratic, l1_penalty, - w_init=coef_init, Xw_init=X @ coef_init[:-1] + coef_init[-1]) - else: - coef, _, _ = self.solver_.solve( - X, y, sqrt_quadratic, l1_penalty, - w_init=coef_init, Xw_init=X @ coef_init) + coef, _, _ = self.solver_.solve( + X, y, sqrt_quadratic, l1_penalty, + w_init=coef_init, Xw_init=X @ coef_init[:-1] + coef_init[-1] + if self.fit_intercept else X @ coef_init) coefs[i] = coef except ValueError as val_exception: # make sure to catch residual error From e90add15506b8263869f7df49625b543c5b2108b Mon Sep 17 00:00:00 2001 From: Pascal Carrivain <39987524+PascalCarrivain@users.noreply.github.com> Date: Mon, 11 Dec 2023 10:59:02 +0100 Subject: [PATCH 09/52] Fix imports in README (#211) Co-authored-by: Carrivain Pascal --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6e46f098b..444446038 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ Once you installed ``skglm``, you can run the following code snippet to fit a MC # import model to fit from skglm.estimators import MCPRegression # import util to create a toy dataset -from skglm.utils import make_correlated_data +from skglm.utils.data import make_correlated_data # generate a toy dataset X, y, _ = make_correlated_data(n_samples=10, n_features=100) @@ -83,12 +83,14 @@ from skglm.penalties import MCPenalty from skglm.estimators import GeneralizedLinearEstimator from skglm.utils import make_correlated_data +from skglm.solvers import AndersonCD X, y, _ = make_correlated_data(n_samples=10, n_features=100) # create and fit GLM estimator with Huber loss and MCP penalty estimator = GeneralizedLinearEstimator( datafit=Huber(delta=1.), penalty=MCPenalty(alpha=1e-2, gamma=3), + solver=AndersonCD() ) estimator.fit(X, y) ``` From 4a7d466ffdffb650ad246e868a70f15d14848cf3 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 11 Dec 2023 11:20:49 +0100 Subject: [PATCH 10/52] DOC - Add installation instructions for conda-forge (#210) Signed-off-by: Julien Jerphanion --- README.md | 4 ++-- doc/getting_started.rst | 6 +++--- doc/index.rst | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 444446038..51fa06153 100644 --- a/README.md +++ b/README.md @@ -45,10 +45,10 @@ There are several reasons to opt for ``skglm`` among which: pip install -U skglm ``` -It is also available on Conda _(not yet, but very soon...)_ and can be installed via the command +It is also available on conda-forge and can be installed using, for instance: ```shell -conda install skglm +conda install -c conda-forge skglm ``` ## First steps with ``skglm`` diff --git a/doc/getting_started.rst b/doc/getting_started.rst index b2121f636..f26472720 100644 --- a/doc/getting_started.rst +++ b/doc/getting_started.rst @@ -18,11 +18,11 @@ Beforehand, make sure that you have already installed ``skglm`` .. code-block:: shell - # using pip + # Installing from PyPI using pip pip install -U skglm - # using conda - conda install skglm + # Installing from conda-forge using conda + conda install -c conda-forge skglm ------------------------- diff --git a/doc/index.rst b/doc/index.rst index 9cd10bcce..194a2e4b7 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -48,11 +48,11 @@ Installing ``skglm`` $ pip install -U skglm -It is also available on Conda and can be installed via the command +It is also available on conda-forge and can be installed using, for instance: .. code-block:: shell - $ conda install skglm + $ conda install -c conda-forge skglm With ``skglm`` being installed, Get the first steps with the package via the :ref:`Getting started section `. Other advanced topics and uses-cases are covered in :ref:`Tutorials `. From a1507d50696385d70624ad22a154fc5df6cd6c17 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD <65614794+Badr-MOUFAD@users.noreply.github.com> Date: Tue, 12 Dec 2023 13:52:44 +0100 Subject: [PATCH 11/52] FIX - Upload documentation only when merging to ``main`` (#209) * skip upload to gh-pages if it branch not main --- .circleci/config.yml | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index c18cd28e7..65730f522 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -67,31 +67,27 @@ jobs: cd ..; - # Deploy docs + # Add stable doc + - run: + name: add stable doc + command: | + set -e + mkdir -p ~/.ssh + echo -e "Host *\nStrictHostKeyChecking no" > ~/.ssh/config + chmod og= ~/.ssh/config + cd doc; + make add-stable-doc; + + + # upload to gh-pages - run: name: deploy command: | if [[ ${CIRCLE_BRANCH} == "main" ]]; then - set -e - mkdir -p ~/.ssh - echo -e "Host *\nStrictHostKeyChecking no" > ~/.ssh/config - chmod og= ~/.ssh/config - cd doc; pip install ghp-import; - make add-stable-doc make install fi - # Add stable documentation - - run: - name: stable_doc - command: | - set -e - mkdir -p ~/.ssh - echo -e "Host *\nStrictHostKeyChecking no" > ~/.ssh/config - chmod og= ~/.ssh/config - cd doc; - make add-stable-doc # Save the outputs - store_artifacts: From c1d6a159a928880d3e7fc434ddb68ff0a03abcd9 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD <65614794+Badr-MOUFAD@users.noreply.github.com> Date: Tue, 12 Dec 2023 14:16:38 +0100 Subject: [PATCH 12/52] cd to doc before (#212) --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 65730f522..fe5ac1f18 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -84,6 +84,7 @@ jobs: name: deploy command: | if [[ ${CIRCLE_BRANCH} == "main" ]]; then + cd doc; pip install ghp-import; make install fi From 564fa61fc8ab4c8e130b947ea4b962893c23b1a8 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD <65614794+Badr-MOUFAD@users.noreply.github.com> Date: Tue, 12 Dec 2023 15:24:01 +0100 Subject: [PATCH 13/52] DOC - Hide table of contents in documentation home page (#213) --- README.md | 2 +- doc/index.rst | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 51fa06153..e845dfeec 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ from skglm.datafits import Huber from skglm.penalties import MCPenalty from skglm.estimators import GeneralizedLinearEstimator -from skglm.utils import make_correlated_data +from skglm.utils.data import make_correlated_data from skglm.solvers import AndersonCD X, y, _ = make_correlated_data(n_samples=10, n_features=100) diff --git a/doc/index.rst b/doc/index.rst index 194a2e4b7..a8b17ce0f 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -79,11 +79,13 @@ You are free to use it and if you do so, please cite year = {2022}, -Explore the documentation -------------------------- +.. it is mandatory to keep the toctree here although it doesn't show up in the page +.. when adding/modifying pages, don't forget to update the toctree .. toctree:: :maxdepth: 1 + :hidden: + :includehidden: getting_started.rst tutorials/tutorials.rst From a36c05c929e19e0d9001f48335207a984e0b46aa Mon Sep 17 00:00:00 2001 From: Badr MOUFAD <65614794+Badr-MOUFAD@users.noreply.github.com> Date: Sat, 23 Dec 2023 22:46:40 +0100 Subject: [PATCH 14/52] Add Code of Conduct (#217) * add code of conduct * include code of conduct in doc * build doc options --- CODE_OF_CONDUCT.md | 3 +++ doc/conf.py | 1 + doc/contribute.rst | 29 ++++++++++++++++++++++------- pyproject.toml | 1 + 4 files changed, 27 insertions(+), 7 deletions(-) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..85128cfb7 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,3 @@ +# Code of Conduct + +As part of the [scikit-learn-contrib](https://github.com/scikit-learn-contrib) GitHub organization, we adopt the scikit-learn [code of conduct](https://github.com/scikit-learn/scikit-learn/blob/main/CODE_OF_CONDUCT.md). diff --git a/doc/conf.py b/doc/conf.py index afa64bdef..4a5b89f7a 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -59,6 +59,7 @@ 'sphinx_gallery.gen_gallery', 'sphinx.ext.autosectionlabel', 'sphinx_copybutton', + 'sphinx_design', 'numpydoc', 'sphinx.ext.linkcode', 'gh_substitutions', diff --git a/doc/contribute.rst b/doc/contribute.rst index b97258013..659289fd2 100644 --- a/doc/contribute.rst +++ b/doc/contribute.rst @@ -23,10 +23,12 @@ Your contribution is welcome and highly valuable. It can be You can submit a `pull request `_ to integrate your changes and we will reach out to you shortly. + +As part of the `scikit-learn-contrib `_ GitHub organization, we adopt the scikit-learn `code of conduct `_. + .. note:: - - If you are willing to contribute with code to ``skglm``, check the section below to learn how to install - the development version of ``skglm`` + If you are willing to contribute with code to ``skglm``, check the section below to learn how to install the development version of ``skglm`` @@ -51,10 +53,23 @@ contribute with code or documentation. $ pip install -e . -3. To run the gallery of examples and build the documentation, run +3. To build the documentation, run -.. code-block:: shell - $ cd doc - $ pip install .[doc] - $ make html +.. tab-set:: + + .. tab-item:: with the gallery of example + + .. code-block:: shell + + $ cd doc + $ pip install .[doc] + $ make html + + .. tab-item:: without the gallery of example + + .. code-block:: shell + + $ cd doc + $ pip install .[doc] + $ make html-noplot diff --git a/pyproject.toml b/pyproject.toml index b94f99e77..34ec1a49f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ doc = [ "sphinx-bootstrap-theme", "sphinx_copybutton", "sphinx-gallery", + "sphinx-design", "pytest", "furo", "lifelines", From 240f11a3cec2e255bef887d1f5f5b5217fb580ab Mon Sep 17 00:00:00 2001 From: mathurinm Date: Sat, 23 Dec 2023 22:47:45 +0100 Subject: [PATCH 15/52] MNT - bump to version 0.3.2 (#218) * release 0.3.1 * update what's new * bump version to ``0.3.2dev`` * v0.3 ---> v0.4 * add release date --------- Co-authored-by: Badr-MOUFAD --- doc/changes/0.4.rst | 4 ++++ skglm/__init__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/changes/0.4.rst b/doc/changes/0.4.rst index e649956a9..4dd673d98 100644 --- a/doc/changes/0.4.rst +++ b/doc/changes/0.4.rst @@ -2,6 +2,10 @@ Version 0.4 (in progress) ------------------------- + + +Version 0.3.1 (2023/12/21) +-------------------------- - Add support for weights and positive coefficients to :ref:`MCPRegression Estimator ` (PR: :gh:`184`) - Move solver specific computations from ``Datafit.initialize()`` to separate ``Datafit`` methods to ease ``Solver`` - ``Datafit`` compatibility check (PR: :gh:`192`) - Add :ref:`LogSumPenalty ` (PR: :gh:`#127`) diff --git a/skglm/__init__.py b/skglm/__init__.py index 2f22debe8..ed6bb05f7 100644 --- a/skglm/__init__.py +++ b/skglm/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.3.1dev' +__version__ = '0.3.2dev' from skglm.estimators import ( # noqa F401 Lasso, WeightedLasso, ElasticNet, MCPRegression, MultiTaskLasso, LinearSVC, From a1662bb15b5c2a395e8ca001690ddd99b5e0a539 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD <65614794+Badr-MOUFAD@users.noreply.github.com> Date: Wed, 10 Jan 2024 11:02:46 +0100 Subject: [PATCH 16/52] DOC - Fix link to stable documentation (#219) --- doc/_templates/sidebar/version_toggler.html | 6 +++--- doc/conf.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/_templates/sidebar/version_toggler.html b/doc/_templates/sidebar/version_toggler.html index afd2a3e1f..1ea55a486 100644 --- a/doc/_templates/sidebar/version_toggler.html +++ b/doc/_templates/sidebar/version_toggler.html @@ -3,15 +3,15 @@ - + \ No newline at end of file diff --git a/doc/conf.py b/doc/conf.py index 4a5b89f7a..a976fefcc 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -269,7 +269,8 @@ "announcement": ( "You are viewing the documentation of the dev version of " "skglm which contains WIP features. " - "View stable version." + "View " + "stable version." ) } From 360b41a439074f608b2178534cdeee96e12b523c Mon Sep 17 00:00:00 2001 From: Quentin Bertrand Date: Tue, 23 Jan 2024 10:16:12 -0500 Subject: [PATCH 17/52] Add Group Lasso with positive weigths (#221) --- doc/changes/0.4.rst | 1 + doc/tutorials/prox_nn_group_lasso.rst | 139 ++++++++++++++++++++++++++ doc/tutorials/tutorials.rst | 5 + skglm/penalties/block_separable.py | 25 ++++- skglm/tests/test_group.py | 8 +- skglm/utils/prox_funcs.py | 32 +++++- 6 files changed, 201 insertions(+), 9 deletions(-) create mode 100644 doc/tutorials/prox_nn_group_lasso.rst diff --git a/doc/changes/0.4.rst b/doc/changes/0.4.rst index 4dd673d98..9fa7ca815 100644 --- a/doc/changes/0.4.rst +++ b/doc/changes/0.4.rst @@ -11,3 +11,4 @@ Version 0.3.1 (2023/12/21) - Add :ref:`LogSumPenalty ` (PR: :gh:`#127`) - Remove abstract methods in ``BaseDatafit`` and ``BasePenalty`` to make solver/penalty/datafit compatibility check easier (PR :gh:`#205`) - Add fixed-point distance to build working sets in :ref:`ProxNewton ` solver (:gh:`138`) +- Add support and tutorial for positive coefficients to :ref:`Group Lasso Penalty ` (PR: :gh:`221`) diff --git a/doc/tutorials/prox_nn_group_lasso.rst b/doc/tutorials/prox_nn_group_lasso.rst new file mode 100644 index 000000000..a862e69e1 --- /dev/null +++ b/doc/tutorials/prox_nn_group_lasso.rst @@ -0,0 +1,139 @@ +.. _prox_nn_group_lasso: + +==================================================== +Details on the proximity operator of the group Lasso +==================================================== + +This tutorial presents how to derive the proximity operator of the :math:`l_2`-penalty, and the :math:`l_2`-penalty with nonnegative constraints. + + +Proximity operator of the group Lasso +===================================== + +Let + +.. math:: + g:x \mapsto \norm{x}_2 + , + +then + +.. math:: + :label: fenchel + + g^{\star}:x \mapsto i_{\norm{x}_2 \leq 1} + , + +and for all :math:`x \in \mathbb{R}^p` + +.. math:: + :label: prox_projection + + \text{prox}_{g^{\star}}(x) + = + \text{proj}_{\mathcal{B}_2}(x) = \frac{x}{\max(\norm{x}_2, 1)} + . + +Using the Moreau decomposition, Equations :eq:`fenchel` and :eq:`prox_projection`, one has + + +.. math:: + + \text{prox}_{\lambda g}(x) + = + x + - \lambda \text{prox}_{\lambda g^\star}(x/\lambda) + +.. math:: + + = x + - \lambda \text{prox}_{g^\star}(x/\lambda) + +.. math:: + + = x + - \lambda \frac{x/\lambda}{\max(\norm{x/\lambda}_2, 1)} + +.. math:: + + = x + - \frac{\lambda x}{\max(\norm{x}_2, \lambda)} + +.. math:: + + = (1 - \frac{\lambda}{\norm{x}})_{+} x + . + +A similar formula can be derived for the group Lasso with nonnegative constraints. + + +Proximity operator of the group Lasso with positivity constraints +================================================================= + + +Let + +.. math:: + h:x \mapsto \norm{x}_2 + + i_{x \geq 0} + . + +Let :math:`x \in \mathbb{R}^p` and :math:`S = \{ j \in 1, ..., p | x_j > 0 \} \in \mathbb{R}^p`, then + + +.. math:: + :label: fenchel_nn + + h^{\star} :x \mapsto i_{\norm{x_S}_2 \leq 1} + , + +and + +.. math:: + :label: prox_projection_nn_Sc + + \text{prox}_{h^{\star}}(x)_{S^c} + = + x_{S^c} + + +.. math:: + :label: prox_projection_nn_S + + \text{prox}_{h^{\star}}(x)_S + = + \text{proj}_{\mathcal{B}_2}(x_S) = \frac{x_S}{\max(\norm{x_S}_2, 1)} + . + +As before, using the Moreau decomposition and Equation :eq:`fenchel_nn` yields + + +.. math:: + + \text{prox}_{\lambda h}(x) + = + x + - \lambda \text{prox}_{\lambda h^\star}(x/\lambda) + +.. math:: + + = x + - \lambda \text{prox}_{h^\star}(x/\lambda) + , + +and thus, combined with Equations :eq:`prox_projection_nn_Sc` and :eq:`prox_projection_nn_S` it leads to + +.. math:: + + \text{prox}_{\lambda h}(x)_{S^c} = 0 + +.. math:: + + \text{prox}_{\lambda h}(x)_{S} + = + (1 - \frac{\lambda}{\norm{x_S}})_{+} x_S + . + + +References +========== diff --git a/doc/tutorials/tutorials.rst b/doc/tutorials/tutorials.rst index ab54a574c..77c2ce166 100644 --- a/doc/tutorials/tutorials.rst +++ b/doc/tutorials/tutorials.rst @@ -28,3 +28,8 @@ Explore how ``skglm`` fits an unpenalized intercept. ----------------------------------------------------------------- Get details about Cox datafit equations. + +:ref:`Details on the group Lasso ` +----------------------------------------------------------------- + +Get details about how to compute the prox of the group Lasso with and without nonnegativity constraints. diff --git a/skglm/penalties/block_separable.py b/skglm/penalties/block_separable.py index 5a2f8f421..880f01470 100644 --- a/skglm/penalties/block_separable.py +++ b/skglm/penalties/block_separable.py @@ -2,6 +2,7 @@ from numpy.linalg import norm from numba import float64, int32 +from numba.types import bool_ from skglm.penalties.base import BasePenalty from skglm.utils.prox_funcs import ( @@ -254,11 +255,15 @@ class WeightedGroupL2(BasePenalty): grp_ptr : array, shape (n_groups + 1,) The group pointers such that two consecutive elements delimit the indices of a group in ``grp_indices``. + + positive : bool, optional + When set to ``True``, forces the coefficient vector to be positive. """ - def __init__(self, alpha, weights, grp_ptr, grp_indices): + def __init__(self, alpha, weights, grp_ptr, grp_indices, positive=False): self.alpha, self.weights = alpha, weights self.grp_ptr, self.grp_indices = grp_ptr, grp_indices + self.positive = positive def get_spec(self): spec = ( @@ -266,15 +271,19 @@ def get_spec(self): ('weights', float64[:]), ('grp_ptr', int32[:]), ('grp_indices', int32[:]), + ('positive', bool_), ) return spec def params_to_dict(self): return dict(alpha=self.alpha, weights=self.weights, - grp_ptr=self.grp_ptr, grp_indices=self.grp_indices) + grp_ptr=self.grp_ptr, grp_indices=self.grp_indices, + positive=self.positive) def value(self, w): """Value of penalty at vector ``w``.""" + if self.positive and np.any(w < 0): + return np.inf alpha, weights = self.alpha, self.weights grp_ptr, grp_indices = self.grp_ptr, self.grp_indices n_grp = len(grp_ptr) - 1 @@ -290,7 +299,8 @@ def value(self, w): def prox_1group(self, value, stepsize, g): """Compute the proximal operator of group ``g``.""" - return BST(value, self.alpha * stepsize * self.weights[g]) + return BST( + value, self.alpha * stepsize * self.weights[g], positive=self.positive) def subdiff_distance(self, w, grad_ws, ws): """Compute distance to the subdifferential at ``w`` of negative gradient. @@ -312,9 +322,16 @@ def subdiff_distance(self, w, grad_ws, ws): w_g = w[grp_g_indices] norm_w_g = norm(w_g) - if norm_w_g == 0: + if self.positive and np.any(w_g < 0): + scores[idx] = np.inf + if self.positive and norm_w_g == 0: + # distance of -norm(grad_g) to )-infty, alpha * weights[g]] + scores[idx] = max(0, - norm(grad_g) - self.alpha * weights[g]) + if (not self.positive) and norm_w_g == 0: + # distance of -norm(grad_g) to weights[g] * [-alpha, alpha] scores[idx] = max(0, norm(grad_g) - alpha * weights[g]) else: + # distance of -grad_g to the subdiff (here a singleton) subdiff = alpha * weights[g] * w_g / norm_w_g scores[idx] = norm(grad_g + subdiff) diff --git a/skglm/tests/test_group.py b/skglm/tests/test_group.py index a9b5ac732..25e657ba8 100644 --- a/skglm/tests/test_group.py +++ b/skglm/tests/test_group.py @@ -76,7 +76,8 @@ def test_alpha_max(n_groups, n_features, shuffle): np.testing.assert_allclose(norm(w), 0, atol=1e-14) -def test_equivalence_lasso(): +@pytest.mark.parametrize('positive', [False, True]) +def test_equivalence_lasso(positive): n_samples, n_features = 30, 50 rnd = np.random.RandomState(1123) X, y, _ = make_correlated_data(n_samples, n_features, random_state=rnd) @@ -90,7 +91,7 @@ def test_equivalence_lasso(): quad_group = QuadraticGroup(grp_ptr=grp_ptr, grp_indices=grp_indices) group_penalty = WeightedGroupL2( alpha=alpha, grp_ptr=grp_ptr, - grp_indices=grp_indices, weights=weights) + grp_indices=grp_indices, weights=weights, positive=positive) # compile classes quad_group = compiled_clone(quad_group, to_float32=X.dtype == np.float32) @@ -98,7 +99,8 @@ def test_equivalence_lasso(): w = GroupBCD(tol=1e-12).solve(X, y, quad_group, group_penalty)[0] celer_lasso = Lasso( - alpha=alpha, fit_intercept=False, tol=1e-12, weights=weights).fit(X, y) + alpha=alpha, fit_intercept=False, tol=1e-12, weights=weights, + positive=positive).fit(X, y) np.testing.assert_allclose(celer_lasso.coef_, w) diff --git a/skglm/utils/prox_funcs.py b/skglm/utils/prox_funcs.py index e029b6861..9ff99ff2d 100644 --- a/skglm/utils/prox_funcs.py +++ b/skglm/utils/prox_funcs.py @@ -30,8 +30,36 @@ def proj_L2ball(u): @njit -def BST(x, u): - """Block soft-thresholding of vector x at level u.""" +def BST(x, u, positive=False): + """Block soft-thresholding of vector x at level u. + + If positive=False: + + .. math:: + "BST"(x, u) = max(1 - u / ||x||, 0) x , + + the proof can be found in [1]. + + If positive=True, let :math:`S = {j in 1, ..., p | x_j > 0}` + + .. math:: + "BST"(x, u)_S = max(1 - u / ||x_S||, 0) x_S + + .. math:: + "BST"(x, u)_{S^c} = 0 , + + the proof can be adapted from [1]. + + References + ---------- + [1] https://math.stackexchange.com/questions/1681658/ + closed-form-solution-of-arg-min-x-left-x-y-right-22-lamb + """ + if positive: + result = np.zeros_like(x) + positive_entries = x > 0 + result[positive_entries] = BST(x[positive_entries], u, positive=False) + return result norm_x = norm(x) if norm_x < u: return np.zeros_like(x) From c1d16a2b60d051cb50061fc8b6bf8a5eef64ed45 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Mon, 5 Feb 2024 13:13:55 +0100 Subject: [PATCH 18/52] Add prox vec L05 (#222) --- skglm/penalties/separable.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/skglm/penalties/separable.py b/skglm/penalties/separable.py index ee45eca88..c7d8d0bd7 100644 --- a/skglm/penalties/separable.py +++ b/skglm/penalties/separable.py @@ -527,6 +527,12 @@ def prox_1d(self, value, stepsize, j): """Compute the proximal operator of L0_5.""" return prox_05(value, self.alpha * stepsize) + def prox_vec(self, x, stepsize): + res = np.zeros_like(x) + for j in range(x.shape[0]): + res[j] = self.prox_1d(x[j], stepsize, j) + return res + def subdiff_distance(self, w, grad, ws): """Compute distance of negative gradient to the subdifferential at w.""" subdiff_dist = np.zeros_like(grad) From e17ffc6bb92f7dfb84794b958cb5b822a700c2aa Mon Sep 17 00:00:00 2001 From: mathurinm Date: Tue, 20 Feb 2024 09:16:40 +0100 Subject: [PATCH 19/52] DOC update contribution section (#224) Co-authored-by: Badr-MOUFAD --- doc/contribute.rst | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/doc/contribute.rst b/doc/contribute.rst index 659289fd2..dd0629f47 100644 --- a/doc/contribute.rst +++ b/doc/contribute.rst @@ -2,16 +2,14 @@ Contribute to ``skglm`` ======================= ------------------------ - -``skglm`` is a continuous endeavour that relies on the community efforts to last and evolve. -Your contribution is welcome and highly valuable. It can be +``skglm`` is a continuous endeavour that relies on community efforts to last and evolve. +Your contribution is welcome and highly valuable. You can help with **bug report** - ``skglm`` runs continuously unit tests on the code base to prevent bugs. - Help us tighten these tests by reporting any bug that you encountered while using ``skglm``. - To do so, use the `issue section `_ ``skglm`` repository. + ``skglm`` runs unit tests on the codebase to prevent bugs. + Help us tighten these tests by reporting any bug that you encounter. + To do so, use the `issue section `_. **feature request** We are constantly improving ``skglm`` and we would like to align that with the user needs. @@ -19,30 +17,30 @@ Your contribution is welcome and highly valuable. It can be You can use the `the issue section `_ to make suggestions. **pull request** - you may have fixed a bug, added a feature, or even fixed a small typo in the documentation, ... + You may have fixed a bug, added a feature, or even fixed a small typo in the documentation... You can submit a `pull request `_ to integrate your changes and we will reach out to you shortly. - + If this is your first pull request, you can refer to `this scikit-learn guide `_. As part of the `scikit-learn-contrib `_ GitHub organization, we adopt the scikit-learn `code of conduct `_. .. note:: - If you are willing to contribute with code to ``skglm``, check the section below to learn how to install the development version of ``skglm`` + If you are willing to contribute with code to ``skglm``, check the section below to learn how to install the development version. Setup ``skglm`` on your local machine --------------------------------------- -Here are key steps to help you setup ``skglm`` on your local machine in case you wanted to +Here are the key steps to help you setup ``skglm`` on your machine in case you want to contribute with code or documentation. -1. Fork the repository and run the following command to clone it on your local machine +1. `Fork the repository `_ and run the following command to clone it on your local machine, make sure to replace ``{YOUR_GITHUB_USERNAME}`` with your GitHub username .. code-block:: shell - $ git clone https://github.com/{YOUR_GITHUB_USERNAME}/skglm.git + $ git clone https://github.com/{YOUR_GITHUB_USERNAME}/skglm 2. ``cd`` to ``skglm`` directory and install it in edit mode by running @@ -53,12 +51,11 @@ contribute with code or documentation. $ pip install -e . -3. To build the documentation, run - +3. To build the documentation locally, run .. tab-set:: - .. tab-item:: with the gallery of example + .. tab-item:: with plots in the example gallery .. code-block:: shell @@ -66,7 +63,7 @@ contribute with code or documentation. $ pip install .[doc] $ make html - .. tab-item:: without the gallery of example + .. tab-item:: without plots in the example gallery .. code-block:: shell From 3fe0cc62f684d40213d7d824b3ad9dc24cd3586b Mon Sep 17 00:00:00 2001 From: Badr MOUFAD <65614794+Badr-MOUFAD@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:55:58 +0100 Subject: [PATCH 20/52] FIX - computation of ``subdiff_distance`` in ``WeightedGroupL2`` penalty (#225) Co-authored-by: mathurinm --- doc/tutorials/prox_nn_group_lasso.rst | 75 ++++++++++++++++++++++++--- doc/tutorials/tutorials.rst | 2 +- skglm/penalties/block_separable.py | 26 +++++++--- skglm/tests/test_group.py | 4 +- skglm/utils/prox_funcs.py | 2 +- 5 files changed, 91 insertions(+), 18 deletions(-) diff --git a/doc/tutorials/prox_nn_group_lasso.rst b/doc/tutorials/prox_nn_group_lasso.rst index a862e69e1..44537f90e 100644 --- a/doc/tutorials/prox_nn_group_lasso.rst +++ b/doc/tutorials/prox_nn_group_lasso.rst @@ -1,10 +1,10 @@ .. _prox_nn_group_lasso: -==================================================== -Details on the proximity operator of the group Lasso -==================================================== +=================================== +Details on the Positive Group Lasso +=================================== -This tutorial presents how to derive the proximity operator of the :math:`l_2`-penalty, and the :math:`l_2`-penalty with nonnegative constraints. +This tutorial presents how to derive the proximity operator and subdifferential of the :math:`l_2`-penalty, and the :math:`l_2`-penalty with nonnegative constraints. Proximity operator of the group Lasso @@ -16,7 +16,7 @@ Let g:x \mapsto \norm{x}_2 , -then +then its Fenchel-Legendre conjugate is .. math:: :label: fenchel @@ -42,7 +42,7 @@ Using the Moreau decomposition, Equations :eq:`fenchel` and :eq:`prox_projection \text{prox}_{\lambda g}(x) = x - - \lambda \text{prox}_{\lambda g^\star}(x/\lambda) + - \lambda \text{prox}_{g^\star/\lambda }(x/\lambda) .. math:: @@ -70,7 +70,6 @@ A similar formula can be derived for the group Lasso with nonnegative constraint Proximity operator of the group Lasso with positivity constraints ================================================================= - Let .. math:: @@ -113,7 +112,7 @@ As before, using the Moreau decomposition and Equation :eq:`fenchel_nn` yields \text{prox}_{\lambda h}(x) = x - - \lambda \text{prox}_{\lambda h^\star}(x/\lambda) + - \lambda \text{prox}_{h^\star / \lambda }(x/\lambda) .. math:: @@ -135,5 +134,65 @@ and thus, combined with Equations :eq:`prox_projection_nn_Sc` and :eq:`prox_proj . + +.. _subdiff_positive_group_lasso: +Subdifferential of the positive Group Lasso penalty +=================================================== + +For the ``subdiff_diff`` working set strategy, we compute the distance :math:`D(v)` for some :math:`v` to the subdifferential of the :math:`h` penalty at a point :math:`w`. +Since the penalty is group-separable, we consider a block of variables, in :math:`\mathbb{R}^g`. + +Case :math:`w` has a strictly negative coordinate +------------------------------------------------- + +If any component of :math:`w` is strictly negative, the subdifferential is empty, and the distance is :math:`+ \infty`. + +.. math:: + + D(v) = + \infty, \quad \forall v \in \mathbb{R}^g + . + + +Case :math:`w` is strictly positive +----------------------------------- + +At a non zero point with strictly positive entries, the penalty is differentiable hence its subgradient is the singleton :math:`w / {|| w ||}`. + +.. math:: + + D(v) = || v - w / {|| w ||} ||, \quad \forall v \in \mathbb{R}^g + . + +Case :math:`w = 0` +------------------ + +At :math:`w = 0`, the subdifferential is: + +.. math:: + + \lambda \partial || \cdot ||_2 + \partial \iota_{x \geq 0} = \lambda \mathcal{B}_2 + \mathbb{R}_-^g + , + +where :math:`\mathcal{B}_2` is the unit ball. + +Therefore, the distance to the subdifferential writes + +.. math:: + + D(v) = \min_{u \in \lambda \mathcal{B}_2, n \in \mathbb{R}_{-}^g} \ || u + n - v || + . + +Minimizing over :math:`n` then over :math:`u`, thanks to [`1 `_], yields + +.. math:: + + D(v) = \max(0, ||v^+|| - \lambda) + , + +Where :math:`v^+` is :math:`v` restricted to its positive coordinates. + + References ========== + +[1] ``_ \ No newline at end of file diff --git a/doc/tutorials/tutorials.rst b/doc/tutorials/tutorials.rst index 77c2ce166..24652a871 100644 --- a/doc/tutorials/tutorials.rst +++ b/doc/tutorials/tutorials.rst @@ -32,4 +32,4 @@ Get details about Cox datafit equations. :ref:`Details on the group Lasso ` ----------------------------------------------------------------- -Get details about how to compute the prox of the group Lasso with and without nonnegativity constraints. +Mathematical details about the group Lasso, in particular with nonnegativity constraints. diff --git a/skglm/penalties/block_separable.py b/skglm/penalties/block_separable.py index 880f01470..00fe7c30d 100644 --- a/skglm/penalties/block_separable.py +++ b/skglm/penalties/block_separable.py @@ -240,6 +240,16 @@ class WeightedGroupL2(BasePenalty): with :math:`w_{[g]}` being the coefficients of the g-th group. + When ``positive=True``, it reads + + .. math:: + sum_{g=1}^{n_"groups"} "weights"_g xx ||w_{[g]}|| + i_{w_{[g]} \geq 0} + + Where :math:`i_{w_{[g]} \geq 0}` is the indicator function of the positive orthant. + + Refer to :ref:`prox_nn_group_lasso` for details on the derivation of the proximal + operator and the distance to subdifferential. + Attributes ---------- alpha : float @@ -305,8 +315,11 @@ def prox_1group(self, value, stepsize, g): def subdiff_distance(self, w, grad_ws, ws): """Compute distance to the subdifferential at ``w`` of negative gradient. - Note: ``grad_ws`` is a stacked array of gradients. - ([grad_ws_1, grad_ws_2, ...]) + Refer to :ref:`subdiff_positive_group_lasso` for details of the derivation. + + Note: + ---- + ``grad_ws`` is a stacked array of gradients ``[grad_ws_1, grad_ws_2, ...]``. """ alpha, weights = self.alpha, self.weights grp_ptr, grp_indices = self.grp_ptr, self.grp_indices @@ -324,10 +337,11 @@ def subdiff_distance(self, w, grad_ws, ws): if self.positive and np.any(w_g < 0): scores[idx] = np.inf - if self.positive and norm_w_g == 0: - # distance of -norm(grad_g) to )-infty, alpha * weights[g]] - scores[idx] = max(0, - norm(grad_g) - self.alpha * weights[g]) - if (not self.positive) and norm_w_g == 0: + elif self.positive and norm_w_g == 0: + # distance of -norm(neg_grad_g) to weights[g] * [-alpha, alpha] + neg_grad_g = grad_g[grad_g < 0.] + scores[idx] = max(0, norm(neg_grad_g) - self.alpha * weights[g]) + elif (not self.positive) and norm_w_g == 0: # distance of -norm(grad_g) to weights[g] * [-alpha, alpha] scores[idx] = max(0, norm(grad_g) - alpha * weights[g]) else: diff --git a/skglm/tests/test_group.py b/skglm/tests/test_group.py index 25e657ba8..990ba5aec 100644 --- a/skglm/tests/test_group.py +++ b/skglm/tests/test_group.py @@ -79,14 +79,14 @@ def test_alpha_max(n_groups, n_features, shuffle): @pytest.mark.parametrize('positive', [False, True]) def test_equivalence_lasso(positive): n_samples, n_features = 30, 50 - rnd = np.random.RandomState(1123) + rnd = np.random.RandomState(112) X, y, _ = make_correlated_data(n_samples, n_features, random_state=rnd) grp_indices, grp_ptr = grp_converter(1, n_features) weights = abs(rnd.randn(n_features)) alpha_max = norm(X.T @ y / weights, ord=np.inf) / n_samples - alpha = alpha_max / 10. + alpha = alpha_max / 100. quad_group = QuadraticGroup(grp_ptr=grp_ptr, grp_indices=grp_indices) group_penalty = WeightedGroupL2( diff --git a/skglm/utils/prox_funcs.py b/skglm/utils/prox_funcs.py index 9ff99ff2d..b3389d0c9 100644 --- a/skglm/utils/prox_funcs.py +++ b/skglm/utils/prox_funcs.py @@ -48,7 +48,7 @@ def BST(x, u, positive=False): .. math:: "BST"(x, u)_{S^c} = 0 , - the proof can be adapted from [1]. + the proof can be adapted from [1]; see the details in the documentation. References ---------- From fe27eb72f18767220e4e76bd146046fe87bd7a02 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Thu, 28 Mar 2024 12:45:46 +0100 Subject: [PATCH 21/52] API change use_acc default value to False in GreedyCD (#236) --- skglm/solvers/gram_cd.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index 2f82a2778..d18b165b3 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -34,8 +34,9 @@ class GramCD(BaseSolver): Initial value of coefficients. If set to ``None``, a zero vector is used instead. - use_acc : bool, default True + use_acc : bool, default False Extrapolate the iterates based on the past 5 iterates if set to ``True``. + Can only be used when ``greedy_cd`` is ``False``. greedy_cd : bool, default True Use a greedy strategy to select features to update in coordinate descent epochs @@ -48,7 +49,7 @@ class GramCD(BaseSolver): Amount of verbosity. 0/False is silent. """ - def __init__(self, max_iter=100, use_acc=True, greedy_cd=True, tol=1e-4, + def __init__(self, max_iter=100, use_acc=False, greedy_cd=True, tol=1e-4, fit_intercept=True, warm_start=False, verbose=0): self.max_iter = max_iter self.use_acc = use_acc From c560dd1d586b6b325c11728abfbfa45b600cadf2 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Thu, 28 Mar 2024 12:47:21 +0100 Subject: [PATCH 22/52] MNT change solver quantile regressor sklearn in test (#235) --- skglm/experimental/tests/test_quantile_regression.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skglm/experimental/tests/test_quantile_regression.py b/skglm/experimental/tests/test_quantile_regression.py index bc982f35d..509b7079c 100644 --- a/skglm/experimental/tests/test_quantile_regression.py +++ b/skglm/experimental/tests/test_quantile_regression.py @@ -28,7 +28,8 @@ def test_PDCD_WS(quantile_level): clf = QuantileRegressor( quantile=quantile_level, alpha=alpha/n_samples, - fit_intercept=False + fit_intercept=False, + solver='highs', ).fit(X, y) np.testing.assert_allclose(w, clf.coef_, atol=1e-5) From b738b3fd71b490e638782347bb1cf96509f73f99 Mon Sep 17 00:00:00 2001 From: Tomasz Kacprzak Date: Wed, 3 Apr 2024 21:18:48 +0200 Subject: [PATCH 23/52] ENH add GroupLasso estimator with sparse X support (#228) Co-authored-by: mathurinm Co-authored-by: Badr-MOUFAD --- doc/api.rst | 1 + doc/changes/0.4.rst | 3 +- doc/tutorials/prox_nn_group_lasso.rst | 60 ++++++++--- skglm/__init__.py | 2 +- skglm/datafits/group.py | 33 +++++++ skglm/estimators.py | 137 +++++++++++++++++++++++++- skglm/penalties/block_separable.py | 36 ++++--- skglm/solvers/group_bcd.py | 73 ++++++++++++-- skglm/tests/test_estimators.py | 94 +++++++++++++++--- skglm/tests/test_fista.py | 15 --- skglm/tests/test_sparse_ops.py | 36 +++++++ skglm/utils/sparse_ops.py | 44 +++++++++ 12 files changed, 465 insertions(+), 69 deletions(-) create mode 100644 skglm/tests/test_sparse_ops.py diff --git a/doc/api.rst b/doc/api.rst index b980d775a..f925a89f6 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -17,6 +17,7 @@ Estimators GeneralizedLinearEstimator CoxEstimator ElasticNet + GroupLasso Lasso LinearSVC SparseLogisticRegression diff --git a/doc/changes/0.4.rst b/doc/changes/0.4.rst index 9fa7ca815..98d14d30a 100644 --- a/doc/changes/0.4.rst +++ b/doc/changes/0.4.rst @@ -2,6 +2,8 @@ Version 0.4 (in progress) ------------------------- +- Add :ref:`GroupLasso Estimator ` (PR: :gh:`228`) +- Add support and tutorial for positive coefficients to :ref:`Group Lasso Penalty ` (PR: :gh:`221`) Version 0.3.1 (2023/12/21) @@ -11,4 +13,3 @@ Version 0.3.1 (2023/12/21) - Add :ref:`LogSumPenalty ` (PR: :gh:`#127`) - Remove abstract methods in ``BaseDatafit`` and ``BasePenalty`` to make solver/penalty/datafit compatibility check easier (PR :gh:`#205`) - Add fixed-point distance to build working sets in :ref:`ProxNewton ` solver (:gh:`138`) -- Add support and tutorial for positive coefficients to :ref:`Group Lasso Penalty ` (PR: :gh:`221`) diff --git a/doc/tutorials/prox_nn_group_lasso.rst b/doc/tutorials/prox_nn_group_lasso.rst index 44537f90e..af0a6c18e 100644 --- a/doc/tutorials/prox_nn_group_lasso.rst +++ b/doc/tutorials/prox_nn_group_lasso.rst @@ -136,14 +136,15 @@ and thus, combined with Equations :eq:`prox_projection_nn_Sc` and :eq:`prox_proj .. _subdiff_positive_group_lasso: + Subdifferential of the positive Group Lasso penalty =================================================== For the ``subdiff_diff`` working set strategy, we compute the distance :math:`D(v)` for some :math:`v` to the subdifferential of the :math:`h` penalty at a point :math:`w`. -Since the penalty is group-separable, we consider a block of variables, in :math:`\mathbb{R}^g`. +Since the penalty is group-separable, we reduce the case where :math:`w` is a block of variables in :math:`\mathbb{R}^g`. -Case :math:`w` has a strictly negative coordinate -------------------------------------------------- +Case :math:`w \notin \mathbb{R}_+^g` +------------------------------------ If any component of :math:`w` is strictly negative, the subdifferential is empty, and the distance is :math:`+ \infty`. @@ -152,17 +153,6 @@ If any component of :math:`w` is strictly negative, the subdifferential is empty D(v) = + \infty, \quad \forall v \in \mathbb{R}^g . - -Case :math:`w` is strictly positive ------------------------------------ - -At a non zero point with strictly positive entries, the penalty is differentiable hence its subgradient is the singleton :math:`w / {|| w ||}`. - -.. math:: - - D(v) = || v - w / {|| w ||} ||, \quad \forall v \in \mathbb{R}^g - . - Case :math:`w = 0` ------------------ @@ -189,10 +179,48 @@ Minimizing over :math:`n` then over :math:`u`, thanks to [`1 0`, taking a non zero :math:`n_i` will only increase the quantity that :math:`u_i` needs to bring closer to 0. + +For a rigorous derivation of this, introduce the Lagrangian on a squared objective + +.. math:: + + \mathcal{L}(u, n, \nu, \mu) = + \frac{1}{2}\norm{u + n - v}^2 + \nu(\frac{1}{2} \norm{u}^2 - \lambda^2 / 2) + \langle \mu, n \rangle + , + +and write down the optimality condition with respect to :math:`u` and :math:`n`. +Treat the case :math:`nu = 0` separately; in the other case show that :\math:`u` must be positive, and that :math:`v = (1 + \nu) u + n`, together with :math:`u = \mu / \nu` and complementary slackness, to reach the conclusion. + +Case :math:`|| w || \ne 0` +--------------------------- +The subdifferential in that case is :math:`\lambda w / {|| w ||} + C_1 \times \ldots \times C_g` where :math:`C_j = {0}` if :math:`w_j > 0` and :math:`C_j = mathbb{R}_-` otherwise (:math:`w_j =0`). + +By letting :math:`p` denotes the projection of :math:`v` onto this set, +one has + +.. math:: + + p_j = \lambda \frac{w_j}{||w||} \text{ if } w_j > 0 + +and + +.. math:: + + p_j = \min(v_j, 0) \text{ otherwise}. + +The distance to the subdifferential is then: + +.. math:: + + D(v) = || v - p || = \sqrt{\sum_{j, w_j > 0} (v_j - \lambda \frac{w_j}{||w||})^2 + \sum_{j, w_j=0} \max(0, v_j)^2 + +since :math:`v_j - \min(v_j, 0) = v_j + \max(-v_j, 0) = \max(0, v_j)`. + References ========== -[1] ``_ \ No newline at end of file +[1] ``_ diff --git a/skglm/__init__.py b/skglm/__init__.py index ed6bb05f7..c134c98f2 100644 --- a/skglm/__init__.py +++ b/skglm/__init__.py @@ -2,5 +2,5 @@ from skglm.estimators import ( # noqa F401 Lasso, WeightedLasso, ElasticNet, MCPRegression, MultiTaskLasso, LinearSVC, - SparseLogisticRegression, GeneralizedLinearEstimator, CoxEstimator + SparseLogisticRegression, GeneralizedLinearEstimator, CoxEstimator, GroupLasso, ) diff --git a/skglm/datafits/group.py b/skglm/datafits/group.py index 383d3206f..487df4a30 100644 --- a/skglm/datafits/group.py +++ b/skglm/datafits/group.py @@ -4,6 +4,7 @@ from skglm.datafits.base import BaseDatafit from skglm.datafits.single_task import Logistic +from skglm.utils.sparse_ops import spectral_norm, sparse_columns_slice class QuadraticGroup(BaseDatafit): @@ -50,6 +51,20 @@ def get_lipschitz(self, X, y): return lipschitz + def get_lipschitz_sparse(self, X_data, X_indptr, X_indices, y): + grp_ptr, grp_indices = self.grp_ptr, self.grp_indices + n_groups = len(grp_ptr) - 1 + + lipschitz = np.zeros(n_groups, dtype=X_data.dtype) + for g in range(n_groups): + grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]] + X_data_g, X_indptr_g, X_indices_g = sparse_columns_slice( + grp_g_indices, X_data, X_indptr, X_indices) + lipschitz[g] = spectral_norm( + X_data_g, X_indptr_g, X_indices_g, len(y)) ** 2 / len(y) + + return lipschitz + def value(self, y, w, Xw): return norm(y - Xw) ** 2 / (2 * len(y)) @@ -63,6 +78,24 @@ def gradient_g(self, X, y, w, Xw, g): return grad_g + def gradient_g_sparse(self, X_data, X_indptr, X_indices, y, w, Xw, g): + grp_ptr, grp_indices = self.grp_ptr, self.grp_indices + grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]] + + grad_g = np.zeros(len(grp_g_indices)) + for idx, j in enumerate(grp_g_indices): + grad_g[idx] = self.gradient_scalar_sparse( + X_data, X_indptr, X_indices, y, w, Xw, j) + + return grad_g + + def gradient_scalar_sparse(self, X_data, X_indptr, X_indices, y, w, Xw, j): + grad_j = 0. + for i in range(X_indptr[j], X_indptr[j+1]): + grad_j += X_data[i] * (Xw[X_indices[i]] - y[X_indices[i]]) + + return grad_j / len(y) + def gradient_scalar(self, X, y, w, Xw, j): return X[:, j] @ (Xw - y) / len(y) diff --git a/skglm/estimators.py b/skglm/estimators.py index 37c5f6ad8..ed46e7934 100644 --- a/skglm/estimators.py +++ b/skglm/estimators.py @@ -19,10 +19,12 @@ from sklearn.multiclass import OneVsRestClassifier, check_classification_targets from skglm.utils.jit_compilation import compiled_clone -from skglm.solvers import AndersonCD, MultiTaskBCD -from skglm.datafits import Cox, Quadratic, Logistic, QuadraticSVC, QuadraticMultiTask -from skglm.penalties import (L1, WeightedL1, L1_plus_L2, L2, +from skglm.solvers import AndersonCD, MultiTaskBCD, GroupBCD +from skglm.datafits import (Cox, Quadratic, Logistic, QuadraticSVC, + QuadraticMultiTask, QuadraticGroup) +from skglm.penalties import (L1, WeightedL1, L1_plus_L2, L2, WeightedGroupL2, MCPenalty, WeightedMCPenalty, IndicatorBox, L2_1) +from skglm.utils.data import grp_converter def _glm_fit(X, y, model, datafit, penalty, solver): @@ -1537,3 +1539,132 @@ def path(self, X, Y, alphas, coef_init=None, return_n_iter=False, **params): ws_strategy=self.ws_strategy, fit_intercept=self.fit_intercept, warm_start=self.warm_start, verbose=self.verbose) return solver.path(X, Y, datafit, penalty, alphas, coef_init, return_n_iter) + + +class GroupLasso(LinearModel, RegressorMixin): + r"""GroupLasso estimator based on Celer solver and primal extrapolation. + + The optimization objective for GroupLasso is: + + .. math:: + 1 / (2 xx n_"samples") \sum_g ||y - X_{[g]} w_{[g]}||_2 ^ 2 + alpha \sum_g + weights_g ||w_{[g]}||_2 + + with :math:`w_{[g]}` (respectively :math:`X_{[g]}`) being the coefficients + (respectively the columns) of the g-th group. + + Parameters + ---------- + groups : int | list of ints | list of lists of ints + Partition of features used in the penalty on ``w``. + If an int is passed, groups are contiguous blocks of features, of size + ``groups``. + If a list of ints is passed, groups are assumed to be contiguous, + group number ``g`` being of size ``groups[g]``. + If a list of lists of ints is passed, ``groups[g]`` contains the + feature indices of the group number ``g``. + + alpha : float, optional + Penalty strength. + + weights : array, shape (n_groups,), optional (default=None) + Positive weights used in the L1 penalty part of the Lasso + objective. If ``None``, weights equal to 1 are used. + + max_iter : int, optional (default=50) + The maximum number of iterations (subproblem definitions). + + max_epochs : int, optional (default=50_000) + Maximum number of CD epochs on each subproblem. + + p0 : int, optional (default=10) + First working set size. + + verbose : bool or int, optional (default=0) + Amount of verbosity. + + tol : float, optional (default=1e-4) + Stopping criterion for the optimization. + + positive : bool, optional (defautl=False) + When set to ``True``, forces the coefficient vector to be positive. + + fit_intercept : bool, optional (default=True) + Whether or not to fit an intercept. + + warm_start : bool, optional (default=False) + When set to ``True``, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + + ws_strategy : str, optional (default="subdiff") + The score used to build the working set. Can be ``fixpoint`` or ``subdiff``. + + Attributes + ---------- + coef_ : array, shape (n_features,) + parameter vector (:math:`w` in the cost function formula) + + intercept_ : float + constant term in decision function. + + n_iter_ : int + Number of subproblems solved to reach the specified tolerance. + + Notes + ----- + Supports weights equal to ``0``, i.e. unpenalized features. + """ + + def __init__(self, groups, alpha=1., weights=None, max_iter=50, max_epochs=50_000, + p0=10, verbose=0, tol=1e-4, positive=False, fit_intercept=True, + warm_start=False, ws_strategy="subdiff"): + super().__init__() + self.alpha = alpha + self.groups = groups + self.weights = weights + self.tol = tol + self.max_iter = max_iter + self.max_epochs = max_epochs + self.p0 = p0 + self.ws_strategy = ws_strategy + self.positive = positive + self.fit_intercept = fit_intercept + self.warm_start = warm_start + self.verbose = verbose + + def fit(self, X, y): + """Fit the model according to the given training data. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data, where ``n_samples`` is the number of samples and + n_features is the number of features. + y : array-like, shape (n_samples,) + Target vector relative to ``X``. + + Returns + ------- + self : Instance of GroupLasso + Fitted estimator. + """ + grp_indices, grp_ptr = grp_converter(self.groups, X.shape[1]) + group_sizes = np.diff(grp_ptr) + + n_features = np.sum(group_sizes) + if X.shape[1] != n_features: + raise ValueError( + "The total number of group members must equal the number of features. " + f"Got {n_features}, expected {X.shape[1]}.") + + weights = np.ones(len(group_sizes)) if self.weights is None else self.weights + group_penalty = WeightedGroupL2(alpha=self.alpha, grp_ptr=grp_ptr, + grp_indices=grp_indices, weights=weights, + positive=self.positive) + quad_group = QuadraticGroup(grp_ptr=grp_ptr, grp_indices=grp_indices) + solver = GroupBCD( + self.max_iter, self.max_epochs, self.p0, tol=self.tol, + fit_intercept=self.fit_intercept, warm_start=self.warm_start, + verbose=self.verbose) + + return _glm_fit(X, y, self, quad_group, group_penalty, solver) diff --git a/skglm/penalties/block_separable.py b/skglm/penalties/block_separable.py index 00fe7c30d..a96ac8260 100644 --- a/skglm/penalties/block_separable.py +++ b/skglm/penalties/block_separable.py @@ -335,19 +335,31 @@ def subdiff_distance(self, w, grad_ws, ws): w_g = w[grp_g_indices] norm_w_g = norm(w_g) - if self.positive and np.any(w_g < 0): - scores[idx] = np.inf - elif self.positive and norm_w_g == 0: - # distance of -norm(neg_grad_g) to weights[g] * [-alpha, alpha] - neg_grad_g = grad_g[grad_g < 0.] - scores[idx] = max(0, norm(neg_grad_g) - self.alpha * weights[g]) - elif (not self.positive) and norm_w_g == 0: - # distance of -norm(grad_g) to weights[g] * [-alpha, alpha] - scores[idx] = max(0, norm(grad_g) - alpha * weights[g]) + if self.positive: + if norm_w_g == 0: + # distance of -neg_grad_g to weights[g] * [-alpha, alpha] + neg_grad_g = grad_g[grad_g < 0.] + scores[idx] = max(0, + norm(neg_grad_g) - self.alpha * weights[g]) + elif np.any(w_g < 0): + scores[idx] = np.inf + else: + res = np.zeros_like(grad_g) + for j in range(len(w_g)): + thresh = alpha * weights[g] * w_g[j] / norm_w_g + if w_g[j] > 0: + res[j] = -grad_g[j] - thresh + else: + # thresh is 0, we simplify the expression + res[j] = max(-grad_g[j], 0) + scores[idx] = norm(res) else: - # distance of -grad_g to the subdiff (here a singleton) - subdiff = alpha * weights[g] * w_g / norm_w_g - scores[idx] = norm(grad_g + subdiff) + if norm_w_g == 0: + scores[idx] = max(0, norm(grad_g) - alpha * weights[g]) + else: + # distance of -grad_g to the subdiff (here a singleton) + subdiff = alpha * weights[g] * w_g / norm_w_g + scores[idx] = norm(grad_g + subdiff) return scores diff --git a/skglm/solvers/group_bcd.py b/skglm/solvers/group_bcd.py index e005c8276..fcadbbe05 100644 --- a/skglm/solvers/group_bcd.py +++ b/skglm/solvers/group_bcd.py @@ -1,5 +1,6 @@ import numpy as np from numba import njit +from scipy.sparse import issparse from skglm.solvers.base import BaseSolver from skglm.utils.anderson import AndersonAcceleration @@ -66,8 +67,13 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): f"expected {n_features}, got {len(w)}.") raise ValueError(val_error_message) - datafit.initialize(X, y) - lipschitz = datafit.get_lipschitz(X, y) + is_sparse = issparse(X) + if is_sparse: + datafit.initialize_sparse(X.data, X.indptr, X.indices, y) + lipschitz = datafit.get_lipschitz_sparse(X.data, X.indptr, X.indices, y) + else: + datafit.initialize(X, y) + lipschitz = datafit.get_lipschitz(X, y) all_groups = np.arange(n_groups) p_objs_out = np.zeros(self.max_iter) @@ -75,7 +81,11 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): accelerator = AndersonAcceleration(K=5) for t in range(self.max_iter): - grad = _construct_grad(X, y, w, Xw, datafit, all_groups) + if is_sparse: + grad = _construct_grad_sparse( + X.data, X.indptr, X.indices, y, w, Xw, datafit, all_groups) + else: + grad = _construct_grad(X, y, w, Xw, datafit, all_groups) opt = penalty.subdiff_distance(w, grad, all_groups) if self.fit_intercept: @@ -102,7 +112,13 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): for epoch in range(self.max_epochs): # inplace update of w and Xw - _bcd_epoch(X, y, w[:n_features], Xw, lipschitz, datafit, penalty, ws) + if is_sparse: + _bcd_epoch_sparse(X.data, X.indptr, X.indices, y, w[:n_features], + Xw, lipschitz, datafit, penalty, ws) + + else: + _bcd_epoch(X, y, w[:n_features], Xw, + lipschitz, datafit, penalty, ws) # update intercept if self.fit_intercept: @@ -122,7 +138,12 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): # check sub-optimality every 10 epochs if epoch % 10 == 0: - grad_ws = _construct_grad(X, y, w, Xw, datafit, ws) + if is_sparse: + grad_ws = _construct_grad_sparse( + X.data, X.indptr, X.indices, y, w, Xw, datafit, ws) + else: + grad_ws = _construct_grad(X, y, w, Xw, datafit, ws) + opt_in = penalty.subdiff_distance(w, grad_ws, ws) stop_crit_in = np.max(opt_in) @@ -154,15 +175,35 @@ def _bcd_epoch(X, y, w, Xw, lipschitz, datafit, penalty, ws): grad_g = datafit.gradient_g(X, y, w, Xw, g) w[grp_g_indices] = penalty.prox_1group( - old_w_g - grad_g / lipschitz_g, - 1 / lipschitz_g, g - ) + old_w_g - grad_g / lipschitz_g, 1 / lipschitz_g, g) for idx, j in enumerate(grp_g_indices): if old_w_g[idx] != w[j]: Xw += (w[j] - old_w_g[idx]) * X[:, j] +@njit +def _bcd_epoch_sparse( + X_data, X_indptr, X_indices, y, w, Xw, lipschitz, datafit, penalty, ws): + # perform a single BCD epoch on groups in ws + grp_ptr, grp_indices = penalty.grp_ptr, penalty.grp_indices + + for g in ws: + grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]] + old_w_g = w[grp_g_indices].copy() + + lipschitz_g = lipschitz[g] + grad_g = datafit.gradient_g_sparse(X_data, X_indptr, X_indices, y, w, Xw, g) + + w[grp_g_indices] = penalty.prox_1group( + old_w_g - grad_g / lipschitz_g, 1 / lipschitz_g, g) + + for idx, j in enumerate(grp_g_indices): + if old_w_g[idx] != w[j]: + for i in range(X_indptr[j], X_indptr[j+1]): + Xw[X_indices[i]] += (w[j] - old_w_g[idx]) * X_data[i] + + @njit def _construct_grad(X, y, w, Xw, datafit, ws): # compute the -gradient according to each group in ws @@ -177,3 +218,19 @@ def _construct_grad(X, y, w, Xw, datafit, ws): grads[grad_ptr: grad_ptr+len(grad_g)] = grad_g grad_ptr += len(grad_g) return grads + + +@njit +def _construct_grad_sparse(X_data, X_indptr, X_indices, y, w, Xw, datafit, ws): + # compute the -gradient according to each group in ws + # note: -gradients are stacked in a 1d array ([-grad_ws_1, -grad_ws_2, ...]) + grp_ptr = datafit.grp_ptr + n_features_ws = sum([grp_ptr[g+1] - grp_ptr[g] for g in ws]) + + grads = np.zeros(n_features_ws) + grad_ptr = 0 + for g in ws: + grad_g = datafit.gradient_g_sparse(X_data, X_indptr, X_indices, y, w, Xw, g) + grads[grad_ptr: grad_ptr+len(grad_g)] = grad_g + grad_ptr += len(grad_g) + return grads diff --git a/skglm/tests/test_estimators.py b/skglm/tests/test_estimators.py index 37faf9416..0998cfbe5 100644 --- a/skglm/tests/test_estimators.py +++ b/skglm/tests/test_estimators.py @@ -3,7 +3,11 @@ import pytest import numpy as np +import pandas as pd +import scipy.optimize from numpy.linalg import norm +from scipy.sparse import csc_matrix, issparse +from celer import GroupLasso as GroupLasso_celer from sklearn.base import clone from sklearn.linear_model import Lasso as Lasso_sklearn @@ -14,22 +18,15 @@ from sklearn.svm import LinearSVC as LinearSVC_sklearn from sklearn.utils.estimator_checks import check_estimator -import scipy.optimize -from scipy.sparse import csc_matrix, issparse - -from skglm.utils.data import make_correlated_data, make_dummy_survival_data +from skglm.utils.data import (make_correlated_data, make_dummy_survival_data, + _alpha_max_group_lasso, grp_converter) from skglm.estimators import ( GeneralizedLinearEstimator, Lasso, MultiTaskLasso, WeightedLasso, ElasticNet, - MCPRegression, SparseLogisticRegression, LinearSVC) + MCPRegression, SparseLogisticRegression, LinearSVC, GroupLasso, CoxEstimator) from skglm.datafits import Logistic, Quadratic, QuadraticSVC, QuadraticMultiTask, Cox from skglm.penalties import L1, IndicatorBox, L1_plus_L2, MCPenalty, WeightedL1, SLOPE -from skglm.solvers import AndersonCD, FISTA - -import pandas as pd -from skglm.solvers import ProxNewton +from skglm.solvers import AndersonCD, FISTA, ProxNewton from skglm.utils.jit_compilation import compiled_clone -from skglm.estimators import CoxEstimator - n_samples = 50 n_tasks = 9 @@ -50,6 +47,7 @@ C = 1 / alpha tol = 1e-10 l1_ratio = 0.3 +groups = [20, 30, 10] dict_estimators_sk = {} dict_estimators_ours = {} @@ -119,7 +117,7 @@ def test_estimator(estimator_name, X, fit_intercept, positive): if fit_intercept and estimator_name == "SVC": pytest.xfail("Intercept is not supported for SVC.") if positive and estimator_name not in ( - "Lasso", "ElasticNet", "wLasso", "MCP", "wMCP"): + "Lasso", "ElasticNet", "wLasso", "MCP", "wMCP", "GroupLasso"): pytest.xfail("`positive` option is only supported by L1, L1_plus_L2 and wL1.") estimator_sk = clone(dict_estimators_sk[estimator_name]) @@ -368,7 +366,7 @@ def test_equivalence_cox_SLOPE_cox_L1(use_efron, issparse): method = 'efron' if use_efron else 'breslow' estimator = CoxEstimator(alpha, l1_ratio=1., method=method, tol=1e-9).fit(X, y) - np.testing.assert_allclose(w, estimator.coef_, atol=1e-6) + np.testing.assert_allclose(w, estimator.coef_, atol=1e-5) @pytest.mark.parametrize("use_efron", [True, False]) @@ -532,5 +530,75 @@ def test_warm_start(estimator_name): np.testing.assert_equal(0, model.n_iter_) +@pytest.mark.parametrize("fit_intercept, issparse", + product([False, True], [False, True])) +def test_GroupLasso_estimator(fit_intercept, issparse): + reg = 1e-1 + grp_indices, grp_ptr = grp_converter(groups, X.shape[1]) + n_groups = len(grp_ptr) - 1 + weights = np.abs(np.random.randn(n_groups)) + alpha = reg * _alpha_max_group_lasso(X, y, grp_indices, grp_ptr, weights) + + estimator_ours = GroupLasso(groups=groups, alpha=alpha, tol=tol, + weights=weights, fit_intercept=fit_intercept) + estimator_celer = GroupLasso_celer(groups=groups, alpha=alpha, tol=tol, + weights=weights, fit_intercept=fit_intercept) + + X_ = csc_matrix(X) if issparse else X + + estimator_celer.fit(X_, y) + estimator_ours.fit(X_, y) + coef_celer = estimator_celer.coef_ + coef_ours = estimator_ours.coef_ + + np.testing.assert_allclose(coef_ours, coef_celer, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(estimator_celer.intercept_, + estimator_ours.intercept_, atol=1e-5, rtol=1e-4) + + +@pytest.mark.parametrize("fit_intercept, issparse", + product([False, True], [True, False])) +def test_GroupLasso_estimator_positive(fit_intercept, issparse): + reg = 1e-1 + grp_indices, grp_ptr = grp_converter(groups, X.shape[1]) + n_groups = len(grp_ptr) - 1 + weights = np.abs(np.random.randn(n_groups)) + alpha = reg * _alpha_max_group_lasso(X, y, grp_indices, grp_ptr, weights) + + estimator_ours = GroupLasso(groups=groups, alpha=alpha, tol=tol, + weights=weights, fit_intercept=fit_intercept, + positive=True) + + X_ = csc_matrix(X) if issparse else X + estimator_ours.fit(X_, y) + + # check all coefs are positive + coef_ = estimator_ours.coef_ + np.testing.assert_equal(len(coef_[coef_ < 0]), 0) + # check optimality + np.testing.assert_array_less(estimator_ours.stop_crit_, tol) + + +@pytest.mark.parametrize("positive", [False, True]) +def test_GroupLasso_estimator_sparse_vs_dense(positive): + reg = 1e-1 + grp_indices, grp_ptr = grp_converter(groups, X.shape[1]) + n_groups = len(grp_ptr) - 1 + weights = np.abs(np.random.randn(n_groups)) + alpha = reg * _alpha_max_group_lasso(X, y, grp_indices, grp_ptr, weights) + + glasso = GroupLasso(groups=groups, alpha=alpha, tol=1e-8, + weights=weights, positive=positive) + + glasso.fit(X, y) + coef_dense = glasso.coef_ + + X_sparse = csc_matrix(X) + glasso.fit(X_sparse, y) + coef_sparse = glasso.coef_ + + np.testing.assert_allclose(coef_sparse, coef_dense, atol=1e-7, rtol=1e-5) + + if __name__ == "__main__": pass diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py index a1f907821..04f9c1ea8 100644 --- a/skglm/tests/test_fista.py +++ b/skglm/tests/test_fista.py @@ -3,15 +3,12 @@ import numpy as np from numpy.linalg import norm -import scipy.sparse -import scipy.sparse.linalg from scipy.sparse import csc_matrix, issparse from skglm.penalties import L1, IndicatorBox from skglm.solvers import FISTA, AndersonCD from skglm.datafits import Quadratic, Logistic, QuadraticSVC -from skglm.utils.sparse_ops import spectral_norm from skglm.utils.data import make_correlated_data from skglm.utils.jit_compilation import compiled_clone @@ -56,17 +53,5 @@ def test_fista_solver(X, Datafit, Penalty): np.testing.assert_allclose(w_fista, w_cd, atol=1e-7) -def test_spectral_norm(): - n_samples, n_features = 50, 60 - A_sparse = scipy.sparse.random(n_samples, n_features, density=0.7, format='csc', - random_state=random_state) - - A_bundles = (A_sparse.data, A_sparse.indptr, A_sparse.indices) - spectral_norm_our = spectral_norm(*A_bundles, n_samples=len(y)) - spectral_norm_sp = scipy.sparse.linalg.svds(A_sparse, k=1)[1] - - np.testing.assert_allclose(spectral_norm_our, spectral_norm_sp) - - if __name__ == '__main__': pass diff --git a/skglm/tests/test_sparse_ops.py b/skglm/tests/test_sparse_ops.py new file mode 100644 index 000000000..9af93bea1 --- /dev/null +++ b/skglm/tests/test_sparse_ops.py @@ -0,0 +1,36 @@ +import numpy as np +import scipy.sparse + +from skglm.utils.sparse_ops import spectral_norm, sparse_columns_slice + + +def test_spectral_norm(): + n_samples, n_features = 50, 60 + A_sparse = scipy.sparse.random( + n_samples, n_features, density=0.7, format='csc', random_state=37) + + A_bundles = (A_sparse.data, A_sparse.indptr, A_sparse.indices) + spectral_norm_our = spectral_norm(*A_bundles, n_samples=A_sparse.shape[0]) + spectral_norm_sp = scipy.sparse.linalg.svds(A_sparse, k=1)[1] + + np.testing.assert_allclose(spectral_norm_our, spectral_norm_sp) + + +def test_slice_cols_sparse(): + n_samples, n_features = 20, 50 + rng = np.random.RandomState(546) + + M = scipy.sparse.random( + n_samples, n_features, density=0.9, format="csc", random_state=rng) + cols = rng.choice(n_features, size=n_features // 10, replace=False) + + sub_M_data, sub_M_indptr, sub_M_indices = sparse_columns_slice( + cols, M.data, M.indptr, M.indices) + sub_M = scipy.sparse.csc_matrix( + (sub_M_data, sub_M_indices, sub_M_indptr), shape=(n_samples, len(cols))) + + np.testing.assert_array_equal(sub_M.toarray(), M.toarray()[:, cols]) + + +if __name__ == "__main__": + pass diff --git a/skglm/utils/sparse_ops.py b/skglm/utils/sparse_ops.py index abef3267b..dab1ee8b0 100644 --- a/skglm/utils/sparse_ops.py +++ b/skglm/utils/sparse_ops.py @@ -58,6 +58,50 @@ def spectral_norm(X_data, X_indptr, X_indices, n_samples, return np.sqrt(eigenvalue) +@njit +def sparse_columns_slice(cols, X_data, X_indptr, X_indices): + """Select a sub matrix from CSC sparse matrix. + + Similar to ``X[:, cols]`` but for ``X`` a CSC sparse matrix. + + Parameters + ---------- + cols : array of int + Columns to select in matrix ``X``. + + X_data : array, shape (n_elements,) + ``data`` attribute of the sparse CSC matrix ``X``. + + X_indptr : array, shape (n_features + 1,) + ``indptr`` attribute of the sparse CSC matrix ``X``. + + X_indices : array, shape (n_elements,) + ``indices`` attribute of the sparse CSC matrix ``X``. + + Returns + ------- + sub_X_data, sub_X_indptr, sub_X_indices + The ``data``, ``indptr``, and ``indices`` attributes of the sub matrix. + """ + nnz = sum([X_indptr[j+1] - X_indptr[j] for j in cols]) + + sub_X_indptr = np.zeros(len(cols) + 1, dtype=cols.dtype) + sub_X_indices = np.zeros(nnz, dtype=X_indices.dtype) + sub_X_data = np.zeros(nnz, dtype=X_data.dtype) + + for idx, j in enumerate(cols): + n_elements = X_indptr[j+1] - X_indptr[j] + sub_X_indptr[idx + 1] = sub_X_indptr[idx] + n_elements + + col_j_slice = slice(X_indptr[j], X_indptr[j+1]) + col_idx_slice = slice(sub_X_indptr[idx], sub_X_indptr[idx+1]) + + sub_X_indices[col_idx_slice] = X_indices[col_j_slice] + sub_X_data[col_idx_slice] = X_data[col_j_slice] + + return sub_X_data, sub_X_indptr, sub_X_indices + + @njit def _XXT_dot_vec(X_data, X_indptr, X_indices, vec, n_samples): # computes X @ X.T @ vec, with X csc encoded From dbcc2073654e79b2d375dc4e05a6832069a8a742 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Thu, 4 Apr 2024 02:46:42 +0200 Subject: [PATCH 24/52] DOC add ucurve example (#239) Co-authored-by: Badr-MOUFAD --- examples/plot_ucurve.py | 56 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 examples/plot_ucurve.py diff --git a/examples/plot_ucurve.py b/examples/plot_ucurve.py new file mode 100644 index 000000000..16ad2ee98 --- /dev/null +++ b/examples/plot_ucurve.py @@ -0,0 +1,56 @@ +""" +============================== +Show U-curve of regularization +============================== +Illustrate the sweet spot of regularization: not too much, not too little. +We showcase that for the Lasso estimator on the ``rcv1.binary`` dataset. +""" + +import numpy as np +from numpy.linalg import norm +import matplotlib.pyplot as plt +from libsvmdata import fetch_libsvm + +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error + +from skglm import Lasso + +# %% +# First, we load the dataset and keep 2000 features. +# We also retrain 2000 samples in training dataset. +X, y = fetch_libsvm("rcv1.binary") + +X = X[:, :2000] +X_train, X_test, y_train, y_test = train_test_split(X, y) +X_train, y_train = X_train[:2000], y_train[:2000] + +# %% +# Next, we define the regularization path. +# For Lasso, it is well know that there is an ``alpha_max`` above which the optimal solution is the zero vector. +alpha_max = norm(X_train.T @ y_train, ord=np.inf) / len(y_train) +alphas = alpha_max * np.geomspace(1, 1e-4) + +# %% +# Let's train the estimator along the regularization path and then compute the MSE on train and test data. +mse_train = [] +mse_test = [] + +clf = Lasso(fit_intercept=False, tol=1e-8, warm_start=True) +for idx, alpha in enumerate(alphas): + clf.alpha = alpha + clf.fit(X_train, y_train) + + mse_train.append(mean_squared_error(y_train, clf.predict(X_train))) + mse_test.append(mean_squared_error(y_test, clf.predict(X_test))) + +# %% +# Finally, we can plot the train and test MSE. +# Notice the "sweet spot" at around ``1e-4``, which sits at the boundary between underfitting and overfitting. +plt.close('all') +plt.semilogx(alphas, mse_train, label='train MSE') +plt.semilogx(alphas, mse_test, label='test MSE') +plt.legend() +plt.title("Mean squared error") +plt.xlabel(r"Lasso regularization strength $\lambda$") +plt.show(block=False) From 891f75c3eb74b9dac746f98b8c1de7e0bbdf45cb Mon Sep 17 00:00:00 2001 From: mathurinm Date: Thu, 4 Apr 2024 14:44:31 +0200 Subject: [PATCH 25/52] FIX objective function in docstring ``GroupLasso`` (#241) --- skglm/estimators.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/skglm/estimators.py b/skglm/estimators.py index ed46e7934..cc488a422 100644 --- a/skglm/estimators.py +++ b/skglm/estimators.py @@ -1547,11 +1547,10 @@ class GroupLasso(LinearModel, RegressorMixin): The optimization objective for GroupLasso is: .. math:: - 1 / (2 xx n_"samples") \sum_g ||y - X_{[g]} w_{[g]}||_2 ^ 2 + alpha \sum_g + 1 / (2 xx n_"samples") ||y - X w||_2 ^ 2 + alpha \sum_g weights_g ||w_{[g]}||_2 - with :math:`w_{[g]}` (respectively :math:`X_{[g]}`) being the coefficients - (respectively the columns) of the g-th group. + with :math:`w_{[g]}` the coefficients of the g-th group. Parameters ---------- From e7c25b227fae094a6432bb48955d8b4ab52e57dc Mon Sep 17 00:00:00 2001 From: Badr MOUFAD <65614794+Badr-MOUFAD@users.noreply.github.com> Date: Mon, 8 Apr 2024 11:15:07 +0200 Subject: [PATCH 26/52] DOC - update documentation (#242) --- CITATION.bib | 6 ++++++ doc/conf.py | 3 ++- doc/index.rst | 12 ++++-------- 3 files changed, 12 insertions(+), 9 deletions(-) create mode 100644 CITATION.bib diff --git a/CITATION.bib b/CITATION.bib new file mode 100644 index 000000000..ed5cb1e3d --- /dev/null +++ b/CITATION.bib @@ -0,0 +1,6 @@ +@inproceedings{skglm, + title = {Beyond L1: Faster and better sparse models with skglm}, + author = {Q. Bertrand and Q. Klopfenstein and P.-A. Bannier and G. Gidel and M. Massias}, + booktitle = {NeurIPS}, + year = {2022}, +} diff --git a/doc/conf.py b/doc/conf.py index a976fefcc..18b78cc5d 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -15,6 +15,7 @@ import os import sys import warnings +from datetime import date import sphinx_gallery # noqa import sphinx_bootstrap_theme # noqa @@ -95,7 +96,7 @@ # General information about the project. project = u'skglm' -copyright = u'2022, skglm developers' +copyright = f'2022-{date.today().year}, skglm developers' author = u'skglm developers' # The version info for the project you're documenting, acts as replacement for diff --git a/doc/index.rst b/doc/index.rst index a8b17ce0f..f95e5c3e4 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -6,12 +6,12 @@ ========= ``skglm`` ========= -*— A fast and modular scikit-learn replacement for sparse GLMs —* +*— A fast and modular scikit-learn replacement for regularized GLMs —* -------- -``skglm`` is a Python package that offers **fast estimators** for sparse Generalized Linear Models (GLMs) +``skglm`` is a Python package that offers **fast estimators** for regularized Generalized Linear Models (GLMs) that are **100% compatible with** ``scikit-learn``. It is **highly flexible** and supports a wide range of GLMs. You get to choose from ``skglm``'s already-made estimators or **customize your own** by combining the available datafits and penalties. @@ -21,7 +21,7 @@ Get a hands-on glimpse on ``skglm`` through the :ref:`Getting started page `. Other advanced topics and uses-cases are covered in :ref:`Tutorials `. -.. note:: - - - Currently, ``skglm`` is unavailable on Conda but will be released very soon... - Cite ---- @@ -77,6 +73,7 @@ You are free to use it and if you do so, please cite and G. Gidel and M. Massias}, booktitle = {NeurIPS}, year = {2022}, + } .. it is mandatory to keep the toctree here although it doesn't show up in the page @@ -93,4 +90,3 @@ You are free to use it and if you do so, please cite api.rst contribute.rst changes/whats_new.rst - From 27dec4b5127fb0858bc2d939c6145f3521476d69 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Fri, 26 Apr 2024 15:27:39 +0200 Subject: [PATCH 27/52] DOC add group logistic regression example (#246) --- examples/plot_group_logistic_regression.py | 44 ++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 examples/plot_group_logistic_regression.py diff --git a/examples/plot_group_logistic_regression.py b/examples/plot_group_logistic_regression.py new file mode 100644 index 000000000..7e722f02f --- /dev/null +++ b/examples/plot_group_logistic_regression.py @@ -0,0 +1,44 @@ +""" +=================================== +Group Logistic regression in python +=================================== +Scikit-learn is missing a Group Logistic regression estimator. We show how to implement +one with ``skglm``. +""" + +# Author: Mathurin Massias + +import numpy as np + +from skglm import GeneralizedLinearEstimator +from skglm.datafits import LogisticGroup +from skglm.penalties import WeightedGroupL2 +from skglm.solvers import GroupProxNewton +from skglm.utils.data import make_correlated_data, grp_converter + +n_features = 30 +X, y, _ = make_correlated_data( + n_samples=10, n_features=30, random_state=0) +y = np.sign(y) + + +# %% +# Classifier creation: combination of penalty, datafit and solver. +# +grp_size = 3 # groups are made of groups of 3 consecutive features +n_groups = n_features // grp_size +grp_indices, grp_ptr = grp_converter(grp_size, n_features=n_features) +alpha = 0.01 +weights = np.ones(n_groups) +penalty = WeightedGroupL2(alpha, weights, grp_ptr, grp_indices) +datafit = LogisticGroup(grp_ptr, grp_indices) +solver = GroupProxNewton(verbose=2) + +# %% +# Train the model +clf = GeneralizedLinearEstimator(datafit, penalty, solver) +clf.fit(X, y) + +# %% +# Fit check that groups are either all 0 or all non zero +print(clf.coef_.reshape(-1, grp_size)) \ No newline at end of file From e0c28d7ef0b32d53c03dc2f97369c51543f67859 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Fri, 31 May 2024 17:01:41 +0200 Subject: [PATCH 28/52] ENH implement gradient and allow `y_i = 0` in `Poisson` datafit (#253) Co-authored-by: Badr-MOUFAD --- skglm/datafits/single_task.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 7e8888591..132024d2e 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -431,15 +431,15 @@ def params_to_dict(self): return dict() def initialize(self, X, y): - if np.any(y <= 0): + if np.any(y < 0): raise ValueError( - "Target vector `y` should only take positive values " + + "Target vector `y` should only take positive values " "when fitting a Poisson model.") def initialize_sparse(self, X_data, X_indptr, X_indices, y): - if np.any(y <= 0): + if np.any(y < 0): raise ValueError( - "Target vector `y` should only take positive values " + + "Target vector `y` should only take positive values " "when fitting a Poisson model.") def raw_grad(self, y, Xw): @@ -453,6 +453,9 @@ def raw_hessian(self, y, Xw): def value(self, y, w, Xw): return np.sum(np.exp(Xw) - y * Xw) / len(y) + def gradient(self, X, y, Xw): + return X.T @ self.raw_grad(y, Xw) + def gradient_scalar(self, X, y, w, Xw, j): return (X[:, j] @ (np.exp(Xw) - y)) / len(y) From 840e8b247d981d7da14f61ff980fa82ede7b44e9 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Fri, 31 May 2024 17:52:10 +0200 Subject: [PATCH 29/52] ENH gradient, raw grad and hessian for Quadratic (#257) --- skglm/datafits/single_task.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 132024d2e..127cacaf4 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -92,6 +92,17 @@ def gradient_scalar_sparse(self, X_data, X_indptr, X_indices, y, Xw, j): XjTXw += X_data[i] * Xw[X_indices[i]] return (XjTXw - self.Xty[j]) / len(Xw) + def gradient(self, X, y, Xw): + return X.T @ (Xw - y) / len(y) + + def raw_grad(self, y, Xw): + """Compute gradient of datafit w.r.t ``Xw``.""" + return (Xw - y) / len(y) + + def raw_hessian(self, y, Xw): + """Compute Hessian of datafit w.r.t ``Xw``.""" + return np.ones(len(y)) / len(y) + def full_grad_sparse( self, X_data, X_indptr, X_indices, y, Xw): n_features = X_indptr.shape[0] - 1 From e60ca6a436af90d0c133d4c16baca36e7c2194ba Mon Sep 17 00:00:00 2001 From: mathurinm Date: Mon, 3 Jun 2024 07:46:57 +0200 Subject: [PATCH 30/52] FIX ProxNewton solver with fixpoint strategy (#259) Co-authored-by: Badr-MOUFAD --- skglm/solvers/anderson_cd.py | 2 +- skglm/solvers/common.py | 12 ++++++------ skglm/solvers/multitask_bcd.py | 20 +++++++++++--------- skglm/solvers/prox_newton.py | 29 ++++++++++++++++------------- 4 files changed, 34 insertions(+), 29 deletions(-) diff --git a/skglm/solvers/anderson_cd.py b/skglm/solvers/anderson_cd.py index cc4efbc3b..8b3437de4 100644 --- a/skglm/solvers/anderson_cd.py +++ b/skglm/solvers/anderson_cd.py @@ -184,7 +184,7 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): opt_ws = penalty.subdiff_distance(w[:n_features], grad_ws, ws) elif self.ws_strategy == "fixpoint": opt_ws = dist_fix_point_cd( - w[:n_features], grad_ws, lipschitz, datafit, penalty, ws + w[:n_features], grad_ws, lipschitz[ws], datafit, penalty, ws ) stop_crit_in = np.max(opt_ws) diff --git a/skglm/solvers/common.py b/skglm/solvers/common.py index a8a3f4ec3..a5e7216f2 100644 --- a/skglm/solvers/common.py +++ b/skglm/solvers/common.py @@ -3,7 +3,7 @@ @njit -def dist_fix_point_cd(w, grad_ws, lipschitz, datafit, penalty, ws): +def dist_fix_point_cd(w, grad_ws, lipschitz_ws, datafit, penalty, ws): """Compute the violation of the fixed point iterate scheme. Parameters @@ -14,8 +14,8 @@ def dist_fix_point_cd(w, grad_ws, lipschitz, datafit, penalty, ws): grad_ws : array, shape (ws_size,) Gradient restricted to the working set. - lipschitz : array, shape (n_features,) - Coordinatewise gradient Lipschitz constants. + lipschitz_ws : array, shape (len(ws),) + Coordinatewise gradient Lipschitz constants, restricted to working set. datafit: instance of BaseDatafit Datafit. @@ -23,7 +23,7 @@ def dist_fix_point_cd(w, grad_ws, lipschitz, datafit, penalty, ws): penalty: instance of BasePenalty Penalty. - ws : array, shape (ws_size,) + ws : array, shape (len(ws),) The working set. Returns @@ -34,10 +34,10 @@ def dist_fix_point_cd(w, grad_ws, lipschitz, datafit, penalty, ws): dist = np.zeros(ws.shape[0], dtype=w.dtype) for idx, j in enumerate(ws): - if lipschitz[j] == 0.: + if lipschitz_ws[idx] == 0.: continue - step_j = 1 / lipschitz[j] + step_j = 1 / lipschitz_ws[idx] dist[idx] = np.abs( w[j] - penalty.prox_1d(w[j] - step_j * grad_ws[idx], step_j, j) ) diff --git a/skglm/solvers/multitask_bcd.py b/skglm/solvers/multitask_bcd.py index 5a68d0c5e..16301ac4a 100644 --- a/skglm/solvers/multitask_bcd.py +++ b/skglm/solvers/multitask_bcd.py @@ -66,7 +66,9 @@ def solve(self, X, Y, datafit, penalty, W_init=None, XW_init=None): if self.ws_strategy == "subdiff": opt = penalty.subdiff_distance(W, grad, all_feats) elif self.ws_strategy == "fixpoint": - opt = dist_fix_point_bcd(W, grad, datafit, penalty, all_feats) + opt = dist_fix_point_bcd( + W, grad, lipschitz, datafit, penalty, all_feats + ) stop_crit = np.max(opt) if self.verbose: print(f"Stopping criterion max violation: {stop_crit:.2e}") @@ -151,7 +153,7 @@ def solve(self, X, Y, datafit, penalty, W_init=None, XW_init=None): opt_ws = penalty.subdiff_distance(W, grad_ws, ws) elif self.ws_strategy == "fixpoint": opt_ws = dist_fix_point_bcd( - W, grad_ws, lipschitz, datafit, penalty, ws + W, grad_ws, lipschitz[ws], datafit, penalty, ws ) stop_crit_in = np.max(opt_ws) @@ -231,7 +233,7 @@ def path(self, X, Y, datafit, penalty, alphas, W_init=None, return_n_iter=False) @njit -def dist_fix_point_bcd(W, grad_ws, lipschitz, datafit, penalty, ws): +def dist_fix_point_bcd(W, grad_ws, lipschitz_ws, datafit, penalty, ws): """Compute the violation of the fixed point iterate schema. Parameters @@ -239,19 +241,19 @@ def dist_fix_point_bcd(W, grad_ws, lipschitz, datafit, penalty, ws): W : array, shape (n_features, n_tasks) Coefficient matrix. - grad_ws : array, shape (ws_size, n_tasks) + grad_ws : array, shape (len(ws), n_tasks) Gradient restricted to the working set. datafit: instance of BaseMultiTaskDatafit Datafit. - lipschitz : array, shape (n_features,) - Blockwise gradient Lipschitz constants. + lipschitz_ws : array, shape (len(ws),) + Blockwise gradient Lipschitz constants, restricted to working set. penalty: instance of BasePenalty Penalty. - ws : array, shape (ws_size,) + ws : array, shape (len(ws),) The working set. Returns @@ -262,10 +264,10 @@ def dist_fix_point_bcd(W, grad_ws, lipschitz, datafit, penalty, ws): dist = np.zeros(ws.shape[0]) for idx, j in enumerate(ws): - if lipschitz[j] == 0.: + if lipschitz_ws[idx] == 0.: continue - step_j = 1 / lipschitz[j] + step_j = 1 / lipschitz_ws[idx] dist[idx] = norm( W[j] - penalty.prox_1feat(W[j] - step_j * grad_ws[idx], step_j, j) ) diff --git a/skglm/solvers/prox_newton.py b/skglm/solvers/prox_newton.py index b077ea567..4b8e0aaf7 100644 --- a/skglm/solvers/prox_newton.py +++ b/skglm/solvers/prox_newton.py @@ -65,6 +65,9 @@ def __init__(self, p0=10, max_iter=20, max_pn_iter=1000, tol=1e-4, self.verbose = verbose def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): + if self.ws_strategy not in ("subdiff", "fixpoint"): + raise ValueError("ws_strategy must be `subdiff` or `fixpoint`, " + f"got {self.ws_strategy}.") dtype = X.dtype n_samples, n_features = X.shape fit_intercept = self.fit_intercept @@ -206,9 +209,9 @@ def _descent_direction(X, y, w_epoch, Xw_epoch, fit_intercept, grad_ws, datafit, dtype = X.dtype raw_hess = datafit.raw_hessian(y, Xw_epoch) - lipschitz = np.zeros(len(ws), dtype) + lipschitz_ws = np.zeros(len(ws), dtype) for idx, j in enumerate(ws): - lipschitz[idx] = raw_hess @ X[:, j] ** 2 + lipschitz_ws[idx] = raw_hess @ X[:, j] ** 2 # for a less costly stopping criterion, we do not compute the exact gradient, # but store each coordinate-wise gradient every time we update one coordinate @@ -224,12 +227,12 @@ def _descent_direction(X, y, w_epoch, Xw_epoch, fit_intercept, grad_ws, datafit, for cd_iter in range(MAX_CD_ITER): for idx, j in enumerate(ws): # skip when X[:, j] == 0 - if lipschitz[idx] == 0: + if lipschitz_ws[idx] == 0: continue past_grads[idx] = grad_ws[idx] + X[:, j] @ (raw_hess * X_delta_w_ws) old_w_idx = w_ws[idx] - stepsize = 1 / lipschitz[idx] + stepsize = 1 / lipschitz_ws[idx] w_ws[idx] = penalty.prox_1d( old_w_idx - stepsize * past_grads[idx], stepsize, j) @@ -253,7 +256,7 @@ def _descent_direction(X, y, w_epoch, Xw_epoch, fit_intercept, grad_ws, datafit, opt = penalty.subdiff_distance(current_w, past_grads, ws) elif ws_strategy == "fixpoint": opt = dist_fix_point_cd( - current_w, past_grads, lipschitz, datafit, penalty, ws + current_w, past_grads, lipschitz_ws, datafit, penalty, ws ) stop_crit = np.max(opt) @@ -264,7 +267,7 @@ def _descent_direction(X, y, w_epoch, Xw_epoch, fit_intercept, grad_ws, datafit, break # descent direction - return w_ws - w_epoch[ws_intercept], X_delta_w_ws, lipschitz + return w_ws - w_epoch[ws_intercept], X_delta_w_ws, lipschitz_ws # sparse version of _descent_direction @@ -275,10 +278,10 @@ def _descent_direction_s(X_data, X_indptr, X_indices, y, w_epoch, dtype = X_data.dtype raw_hess = datafit.raw_hessian(y, Xw_epoch) - lipschitz = np.zeros(len(ws), dtype) + lipschitz_ws = np.zeros(len(ws), dtype) for idx, j in enumerate(ws): - # equivalent to: lipschitz[idx] += raw_hess * X[:, j] ** 2 - lipschitz[idx] = _sparse_squared_weighted_norm( + # equivalent to: lipschitz_ws[idx] += raw_hess * X[:, j] ** 2 + lipschitz_ws[idx] = _sparse_squared_weighted_norm( X_data, X_indptr, X_indices, j, raw_hess) # see _descent_direction() comment @@ -294,7 +297,7 @@ def _descent_direction_s(X_data, X_indptr, X_indices, y, w_epoch, for cd_iter in range(MAX_CD_ITER): for idx, j in enumerate(ws): # skip when X[:, j] == 0 - if lipschitz[idx] == 0: + if lipschitz_ws[idx] == 0: continue past_grads[idx] = grad_ws[idx] @@ -303,7 +306,7 @@ def _descent_direction_s(X_data, X_indptr, X_indices, y, w_epoch, X_data, X_indptr, X_indices, j, X_delta_w_ws, raw_hess) old_w_idx = w_ws[idx] - stepsize = 1 / lipschitz[idx] + stepsize = 1 / lipschitz_ws[idx] w_ws[idx] = penalty.prox_1d( old_w_idx - stepsize * past_grads[idx], stepsize, j) @@ -328,7 +331,7 @@ def _descent_direction_s(X_data, X_indptr, X_indices, y, w_epoch, opt = penalty.subdiff_distance(current_w, past_grads, ws) elif ws_strategy == "fixpoint": opt = dist_fix_point_cd( - current_w, past_grads, lipschitz, datafit, penalty, ws + current_w, past_grads, lipschitz_ws, datafit, penalty, ws ) stop_crit = np.max(opt) @@ -339,7 +342,7 @@ def _descent_direction_s(X_data, X_indptr, X_indices, y, w_epoch, break # descent direction - return w_ws - w_epoch[ws_intercept], X_delta_w_ws, lipschitz + return w_ws - w_epoch[ws_intercept], X_delta_w_ws, lipschitz_ws @njit From 553c3d68b4d54bd9557470cf46e71bab8e2b77e5 Mon Sep 17 00:00:00 2001 From: SujayP Date: Mon, 24 Jun 2024 10:12:00 -0400 Subject: [PATCH 31/52] FEAT add WeightedQuadratic datafit to allow sample weights (#258) Co-authored-by: mathurinm Co-authored-by: QB3 --- doc/api.rst | 1 + doc/changes/0.4.rst | 1 + examples/plot_survival_analysis.py | 2 +- skglm/datafits/__init__.py | 5 +- skglm/datafits/single_task.py | 120 +++++++++++++++++++++++++++++ skglm/tests/test_datafits.py | 52 ++++++++++++- 6 files changed, 177 insertions(+), 4 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index f925a89f6..6781ac9df 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -69,6 +69,7 @@ Datafits Quadratic QuadraticGroup QuadraticSVC + WeightedQuadratic Solvers diff --git a/doc/changes/0.4.rst b/doc/changes/0.4.rst index 98d14d30a..c131143b6 100644 --- a/doc/changes/0.4.rst +++ b/doc/changes/0.4.rst @@ -4,6 +4,7 @@ Version 0.4 (in progress) ------------------------- - Add :ref:`GroupLasso Estimator ` (PR: :gh:`228`) - Add support and tutorial for positive coefficients to :ref:`Group Lasso Penalty ` (PR: :gh:`221`) +- Add support to weight samples in the quadratic datafit :ref:`Weighted Quadratic Datafit ` (PR: :gh:`258`) Version 0.3.1 (2023/12/21) diff --git a/examples/plot_survival_analysis.py b/examples/plot_survival_analysis.py index 3124b9a82..dca110680 100644 --- a/examples/plot_survival_analysis.py +++ b/examples/plot_survival_analysis.py @@ -53,7 +53,7 @@ # %% # Fitting the Cox Estimator -# ----------------- +# ------------------------- # # After generating the synthetic data, we can now fit a L1-regularized Cox estimator. # Todo so, we need to combine a Cox datafit and a :math:`\ell_1` penalty diff --git a/skglm/datafits/__init__.py b/skglm/datafits/__init__.py index 856b65247..b8515e543 100644 --- a/skglm/datafits/__init__.py +++ b/skglm/datafits/__init__.py @@ -1,5 +1,6 @@ from .base import BaseDatafit, BaseMultitaskDatafit -from .single_task import Quadratic, QuadraticSVC, Logistic, Huber, Poisson, Gamma, Cox +from .single_task import (Quadratic, QuadraticSVC, Logistic, Huber, Poisson, Gamma, + Cox, WeightedQuadratic,) from .multi_task import QuadraticMultiTask from .group import QuadraticGroup, LogisticGroup @@ -8,5 +9,5 @@ BaseDatafit, BaseMultitaskDatafit, Quadratic, QuadraticSVC, Logistic, Huber, Poisson, Gamma, Cox, QuadraticMultiTask, - QuadraticGroup, LogisticGroup + QuadraticGroup, LogisticGroup, WeightedQuadratic ] diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 127cacaf4..5750ea295 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -119,6 +119,126 @@ def intercept_update_step(self, y, Xw): return np.mean(Xw - y) +class WeightedQuadratic(BaseDatafit): + r"""Weighted Quadratic datafit to handle sample weights. + + The datafit reads: + + .. math:: 1 / (2 xx \sum_(i=1)^(n_"samples") weights_i) + \sum_(i=1)^(n_"samples") weights_i (y_i - (Xw)_i)^ 2 + + Attributes + ---------- + Xtwy : array, shape (n_features,) + Pre-computed quantity used during the gradient evaluation. + Equal to ``X.T @ (samples_weights * y)``. + sample_weights : array, shape (n_samples,) + Weights for each sample. + + Note + ---- + The class is jit compiled at fit time using Numba compiler. + This allows for faster computations. + """ + + def __init__(self, sample_weights): + self.sample_weights = sample_weights + + def get_spec(self): + spec = ( + ('Xtwy', float64[:]), + ('sample_weights', float64[:]), + ) + return spec + + def params_to_dict(self): + return {'sample_weights': self.sample_weights} + + def get_lipschitz(self, X, y): + n_features = X.shape[1] + lipschitz = np.zeros(n_features, dtype=X.dtype) + w_sum = self.sample_weights.sum() + + for j in range(n_features): + lipschitz[j] = (self.sample_weights * X[:, j] ** 2).sum() / w_sum + + return lipschitz + + def get_lipschitz_sparse(self, X_data, X_indptr, X_indices, y): + n_features = len(X_indptr) - 1 + lipschitz = np.zeros(n_features, dtype=X_data.dtype) + w_sum = self.sample_weights.sum() + + for j in range(n_features): + nrm2 = 0. + for idx in range(X_indptr[j], X_indptr[j + 1]): + nrm2 += self.sample_weights[X_indices[idx]] * X_data[idx] ** 2 + + lipschitz[j] = nrm2 / w_sum + + return lipschitz + + def initialize(self, X, y): + self.Xtwy = X.T @ (self.sample_weights * y) + + def initialize_sparse(self, X_data, X_indptr, X_indices, y): + n_features = len(X_indptr) - 1 + self.Xty = np.zeros(n_features, dtype=X_data.dtype) + + for j in range(n_features): + xty = 0 + for idx in range(X_indptr[j], X_indptr[j + 1]): + xty += (X_data[idx] * self.sample_weights[X_indices[idx]] + * y[X_indices[idx]]) + self.Xty[j] = xty + + def get_global_lipschitz(self, X, y): + w_sum = self.sample_weights.sum() + return norm(X.T @ np.sqrt(self.sample_weights), ord=2) ** 2 / w_sum + + def get_global_lipschitz_sparse(self, X_data, X_indptr, X_indices, y): + return spectral_norm( + X_data * np.sqrt(self.sample_weights[X_indices]), + X_indptr, X_indices, len(y)) ** 2 / self.sample_weights.sum() + + def value(self, y, w, Xw): + w_sum = self.sample_weights.sum() + return np.sum(self.sample_weights * (y - Xw) ** 2) / (2 * w_sum) + + def gradient_scalar(self, X, y, w, Xw, j): + return (X[:, j] @ (self.sample_weights * (Xw - y))) / self.sample_weights.sum() + + def gradient_scalar_sparse(self, X_data, X_indptr, X_indices, y, Xw, j): + XjTXw = 0. + for i in range(X_indptr[j], X_indptr[j + 1]): + XjTXw += X_data[i] * self.sample_weights[X_indices[i]] * Xw[X_indices[i]] + return (XjTXw - self.Xty[j]) / self.sample_weights.sum() + + def gradient(self, X, y, Xw): + return X.T @ (self.sample_weights * (Xw - y)) / self.sample_weights.sum() + + def raw_grad(self, y, Xw): + return (self.sample_weights * (Xw - y)) / self.sample_weights.sum() + + def raw_hessian(self, y, Xw): + return self.sample_weights / self.sample_weights.sum() + + def full_grad_sparse(self, X_data, X_indptr, X_indices, y, Xw): + n_features = X_indptr.shape[0] - 1 + grad = np.zeros(n_features, dtype=Xw.dtype) + + for j in range(n_features): + XjTXw = 0. + for i in range(X_indptr[j], X_indptr[j + 1]): + XjTXw += (X_data[i] * self.sample_weights[X_indices[i]] + * Xw[X_indices[i]]) + grad[j] = (XjTXw - self.Xty[j]) / self.sample_weights.sum() + return grad + + def intercept_update_step(self, y, Xw): + return np.sum(self.sample_weights * (Xw - y)) / self.sample_weights.sum() + + @njit def sigmoid(x): """Vectorwise sigmoid.""" diff --git a/skglm/tests/test_datafits.py b/skglm/tests/test_datafits.py index 827a70d64..a6068b2ec 100644 --- a/skglm/tests/test_datafits.py +++ b/skglm/tests/test_datafits.py @@ -5,7 +5,8 @@ from sklearn.linear_model import HuberRegressor from numpy.testing import assert_allclose, assert_array_less -from skglm.datafits import Huber, Logistic, Poisson, Gamma, Cox +from skglm.datafits import (Huber, Logistic, Poisson, Gamma, Cox, WeightedQuadratic, + Quadratic,) from skglm.penalties import L1, WeightedL1 from skglm.solvers import AndersonCD, ProxNewton from skglm import GeneralizedLinearEstimator @@ -169,5 +170,54 @@ def test_cox(use_efron): np.testing.assert_allclose(positive_eig, 0., atol=1e-6) +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_sample_weights(fit_intercept): + """Test that integers sample weights give same result as duplicating rows.""" + + rng = np.random.RandomState(0) + + n_samples = 20 + n_features = 100 + X, y, _ = make_correlated_data( + n_samples=n_samples, n_features=n_features, random_state=0) + + indices = rng.choice(n_samples, 3 * n_samples) + + sample_weights = np.zeros(n_samples) + for i in indices: + sample_weights[i] += 1 + + X_overs, y_overs = X[indices], y[indices] + + df_weight = WeightedQuadratic(sample_weights=sample_weights) + df_overs = Quadratic() + + # same df value + w = np.random.randn(n_features) + val_overs = df_overs.value(y_overs, X_overs, X_overs @ w) + val_weight = df_weight.value(y, X, X @ w) + np.testing.assert_allclose(val_overs, val_weight) + + pen = L1(alpha=1) + alpha_max = pen.alpha_max(df_weight.gradient(X, y, np.zeros(X.shape[0]))) + pen.alpha = alpha_max / 10 + solver = AndersonCD(tol=1e-12, verbose=10, fit_intercept=fit_intercept) + + model_weight = GeneralizedLinearEstimator(df_weight, pen, solver) + model_weight.fit(X, y) + print("#" * 80) + res = model_weight.coef_ + model = GeneralizedLinearEstimator(df_overs, pen, solver) + model.fit(X_overs, y_overs) + res_overs = model.coef_ + + np.testing.assert_allclose(res, res_overs) + # n_iter = model.n_iter_ + # n_iter_overs = model.n_iter_ + # due to numerical errors the assert fails, but (inspecting the verbose output) + # everything matches up to numerical precision errors in tol: + # np.testing.assert_equal(n_iter, n_iter_overs) + + if __name__ == '__main__': pass From cac34ae1a50b44d5e5faadf4eb60b1e99565b0ca Mon Sep 17 00:00:00 2001 From: Wassim MAZOUZ <165368548+wassimmazouz@users.noreply.github.com> Date: Tue, 2 Jul 2024 17:41:45 +0200 Subject: [PATCH 32/52] DOC L1-regularization parameter tutorial (#264) Co-authored-by: mathurinm Co-authored-by: Badr-MOUFAD --- doc/tutorials/alpha_max.rst | 98 +++++++++++++++++++++++++++++++++++++ doc/tutorials/tutorials.rst | 9 +++- 2 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 doc/tutorials/alpha_max.rst diff --git a/doc/tutorials/alpha_max.rst b/doc/tutorials/alpha_max.rst new file mode 100644 index 000000000..8c105f87d --- /dev/null +++ b/doc/tutorials/alpha_max.rst @@ -0,0 +1,98 @@ +.. _alpha_max: + +========================================================== +Critical regularization strength above which solution is 0 +========================================================== + +This tutorial shows that for :math:`\lambda \geq \lambda_{\text{max}} = || \nabla f(0) ||_{\infty}`, the solution to +:math:`\min f(x) + \lambda || x ||_1` is 0. + +In skglm, we thus frequently use + +.. code-block:: + + alpha_max = np.max(np.abs(gradient0)) + +and choose for the regularization strength :\math:`\alpha` a fraction of this critical value, e.g. ``alpha = 0.01 * alpha_max``. + +Problem setup +============= + +Consider the optimization problem: + +.. math:: + \min_x f(x) + \lambda || x||_1 + +where: + +- :math:`f: \mathbb{R}^d \to \mathbb{R}` is a convex differentiable function, +- :math:`|| x ||_1` is the L1 norm of :math:`x`, +- :math:`\lambda > 0` is the regularization parameter. + +We aim to determine the conditions under which the solution to this problem is :math:`x = 0`. + +Theoretical background +====================== + + +Let + +.. math:: + + g(x) = f(x) + \lambda || x||_1 + +According to Fermat's rule, 0 is the minimizer of :math:`g` if and only if 0 is in the subdifferential of :math:`g` at 0. +The subdifferential of :math:`|| x ||_1` at 0 is the L-infinity unit ball: + +.. math:: + \partial || \cdot ||_1 (0) = \{ u \in \mathbb{R}^d : ||u||_{\infty} \leq 1 \} + +Thus, + +.. math:: + :nowrap: + + \begin{equation} + \begin{aligned} + 0 \in \text{argmin} ~ g(x) + &\Leftrightarrow 0 \in \partial g(0) \\ + &\Leftrightarrow + 0 \in \nabla f(0) + \lambda \partial || \cdot ||_1 (0) \\ + &\Leftrightarrow - \nabla f(0) \in \lambda \{ u \in \mathbb{R}^d : ||u||_{\infty} \leq 1 \} \\ + &\Leftrightarrow || \nabla f(0) ||_\infty \leq \lambda + \end{aligned} + \end{equation} + + +We have just shown that the minimizer of :math:`g = f + \lambda || \cdot ||_1` is 0 if and only if :math:`\lambda \geq ||\nabla f(0)||_{\infty}`. + +Example +======= + +Consider the loss function for Ordinary Least Squares :math:`f(x) = \frac{1}{2n} ||Ax - b||_2^2`, where :math:`n` is the number of samples. We have: + +.. math:: + \nabla f(x) = \frac{1}{n}A^T (Ax - b) + +At :math:`x=0`: + +.. math:: + \nabla f(0) = -\frac{1}{n}A^T b + +The infinity norm of the gradient at 0 is: + +.. math:: + ||\nabla f(0)||_{\infty} = \frac{1}{n}||A^T b||_{\infty} + +For :math:`\lambda \geq \frac{1}{n}||A^T b||_{\infty}`, the solution to :math:`\min_x \frac{1}{2n} ||Ax - b||_2^2 + \lambda || x||_1` is :math:`x=0`. + + + +References +========== + +Refer to Section 3.1 and Proposition 4 in particular of [1] for more details. + +.. _1: + +[1] Eugene Ndiaye, Olivier Fercoq, Alexandre Gramfort, and Joseph Salmon. 2017. Gap safe screening rules for sparsity enforcing penalties. J. Mach. Learn. Res. 18, 1 (January 2017), 4671–4703. diff --git a/doc/tutorials/tutorials.rst b/doc/tutorials/tutorials.rst index 24652a871..d86b58840 100644 --- a/doc/tutorials/tutorials.rst +++ b/doc/tutorials/tutorials.rst @@ -25,11 +25,16 @@ Explore how ``skglm`` fits an unpenalized intercept. :ref:`Mathematics behind Cox datafit ` ------------------------------------------------------------------ +--------------------------------------------------------- Get details about Cox datafit equations. :ref:`Details on the group Lasso ` ------------------------------------------------------------------ +------------------------------------------------------- Mathematical details about the group Lasso, in particular with nonnegativity constraints. + +:ref:`Critical regularization strength above which solution is 0 ` +----------------------------------------------------------------------------- + +How to choose the regularization strength in L1-regularization? From 3260ebd81b8236c96142b5fccd6ecb1441533a25 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Sun, 14 Jul 2024 20:37:30 +0200 Subject: [PATCH 33/52] FEAT implement sparse group lasso penalty and ws_strategy="fixpoint" for BCD (#267) --- examples/plot_group_logistic_regression.py | 2 +- examples/plot_logreg_various_penalties.py | 3 - examples/plot_sparse_group_lasso.py | 61 ++++++++++++ skglm/penalties/__init__.py | 4 +- skglm/penalties/block_separable.py | 108 ++++++++++++++++++++- skglm/solvers/anderson_cd.py | 2 + skglm/solvers/common.py | 57 ++++++++++- skglm/solvers/group_bcd.py | 30 +++++- skglm/tests/test_group.py | 62 +++++++++++- 9 files changed, 315 insertions(+), 14 deletions(-) create mode 100644 examples/plot_sparse_group_lasso.py diff --git a/examples/plot_group_logistic_regression.py b/examples/plot_group_logistic_regression.py index 7e722f02f..5c1e8f712 100644 --- a/examples/plot_group_logistic_regression.py +++ b/examples/plot_group_logistic_regression.py @@ -41,4 +41,4 @@ # %% # Fit check that groups are either all 0 or all non zero -print(clf.coef_.reshape(-1, grp_size)) \ No newline at end of file +print(clf.coef_.reshape(-1, grp_size)) diff --git a/examples/plot_logreg_various_penalties.py b/examples/plot_logreg_various_penalties.py index 0f898b316..1a14c4cca 100644 --- a/examples/plot_logreg_various_penalties.py +++ b/examples/plot_logreg_various_penalties.py @@ -55,7 +55,6 @@ clf_enet.coef_[clf_enet.coef_ != 0], markerfmt="x", label="Elastic net coefficients", - use_line_collection=True, ) plt.setp([m, s], color="#2ca02c") m, s, _ = plt.stem( @@ -63,7 +62,6 @@ clf_mcp.coef_[clf_mcp.coef_ != 0], markerfmt="x", label="MCP coefficients", - use_line_collection=True, ) plt.setp([m, s], color="#ff7f0e") plt.stem( @@ -71,7 +69,6 @@ w_star[w_star != 0], label="true coefficients", markerfmt="bx", - use_line_collection=True, ) plt.legend(loc="best") diff --git a/examples/plot_sparse_group_lasso.py b/examples/plot_sparse_group_lasso.py new file mode 100644 index 000000000..48f2cd640 --- /dev/null +++ b/examples/plot_sparse_group_lasso.py @@ -0,0 +1,61 @@ +""" +================================= +Fast Sparse Group Lasso in python +================================= +Scikit-learn is missing a Sparse Group Lasso regression estimator. We show how to +implement one with ``skglm``. +""" + +# Author: Mathurin Massias + +# %% +import numpy as np +import matplotlib.pyplot as plt + +from skglm.solvers import GroupBCD +from skglm.datafits import QuadraticGroup +from skglm import GeneralizedLinearEstimator +from skglm.penalties import WeightedL1GroupL2 +from skglm.utils.data import make_correlated_data, grp_converter + +n_features = 30 +X, y, _ = make_correlated_data( + n_samples=10, n_features=30, random_state=0) + + +# %% +# Model creation: combination of penalty, datafit and solver. +# +# penalty: +grp_size = 10 # take groups of 10 consecutive features +n_groups = n_features // grp_size +grp_indices, grp_ptr = grp_converter(grp_size, n_features) +n_groups = len(grp_ptr) - 1 +weights_g = np.ones(n_groups, dtype=np.float64) +weights_f = 0.5 * np.ones(n_features) +penalty = WeightedL1GroupL2( + alpha=0.5, weights_groups=weights_g, + weights_features=weights_f, grp_indices=grp_indices, grp_ptr=grp_ptr) + +# %% Datafit and solver +datafit = QuadraticGroup(grp_ptr, grp_indices) +solver = GroupBCD(ws_strategy="fixpoint", verbose=1, fit_intercept=False, tol=1e-10) + +model = GeneralizedLinearEstimator(datafit, penalty, solver=solver) + +# %% +# Train the model +clf = GeneralizedLinearEstimator(datafit, penalty, solver) +clf.fit(X, y) + +# %% +# Some groups are fully 0, and inside non zero groups, +# some values are 0 too +plt.imshow(clf.coef_.reshape(-1, grp_size) != 0, cmap='Greys') +plt.title("Non zero values (in black) in model coefficients") +plt.ylabel('Group index') +plt.xlabel('Feature index inside group') +plt.xticks(np.arange(grp_size)) +plt.yticks(np.arange(n_groups)); + +# %% diff --git a/skglm/penalties/__init__.py b/skglm/penalties/__init__.py index eab796032..2d4d7a912 100644 --- a/skglm/penalties/__init__.py +++ b/skglm/penalties/__init__.py @@ -4,7 +4,7 @@ WeightedL1, IndicatorBox, PositiveConstraint, LogSumPenalty ) from .block_separable import ( - L2_05, L2_1, BlockMCPenalty, BlockSCAD, WeightedGroupL2 + L2_05, L2_1, BlockMCPenalty, BlockSCAD, WeightedGroupL2, WeightedL1GroupL2 ) from .non_separable import SLOPE @@ -14,5 +14,5 @@ BasePenalty, L1_plus_L2, L0_5, L1, L2, L2_3, MCPenalty, WeightedMCPenalty, SCAD, WeightedL1, IndicatorBox, PositiveConstraint, L2_05, L2_1, BlockMCPenalty, BlockSCAD, - WeightedGroupL2, SLOPE, LogSumPenalty + WeightedGroupL2, WeightedL1GroupL2, SLOPE, LogSumPenalty ] diff --git a/skglm/penalties/block_separable.py b/skglm/penalties/block_separable.py index a96ac8260..47161080e 100644 --- a/skglm/penalties/block_separable.py +++ b/skglm/penalties/block_separable.py @@ -6,7 +6,7 @@ from skglm.penalties.base import BasePenalty from skglm.utils.prox_funcs import ( - BST, prox_block_2_05, prox_SCAD, value_SCAD, prox_MCP, value_MCP) + BST, ST_vec, prox_block_2_05, prox_SCAD, value_SCAD, prox_MCP, value_MCP) class L2_1(BasePenalty): @@ -382,3 +382,109 @@ def generalized_support(self, w): gsupp[g] = True return gsupp + + +class WeightedL1GroupL2(BasePenalty): + r"""Weighted Group L2 penalty, aka sparse group Lasso. + + The penalty reads + + .. math:: + sum_{g=1}^{n_"groups"} "weights"^1_g ||w_{[g]}|| + + sum_{j=1}^{n_"features"} "weights"^2_j ||w_{j}|| + + with :math:`w_{[g]}` being the coefficients of the g-th group and + + Attributes + ---------- + alpha : float + The regularization parameter. + + weights_groups : array, shape (n_groups,) + The penalization weights of the groups. + + weights_features : array, shape (n_features,) + The penalization weights of the features. + + grp_indices : array, shape (n_features,) + The group indices stacked contiguously + ([grp1_indices, grp2_indices, ...]). + + grp_ptr : array, shape (n_groups + 1,) + The group pointers such that two consecutive elements delimit + the indices of a group in ``grp_indices``. + + """ + + def __init__( + self, alpha, weights_groups, weights_features, grp_ptr, grp_indices): + self.alpha = alpha + self.grp_ptr, self.grp_indices = grp_ptr, grp_indices + self.weights_groups = weights_groups + self.weights_features = weights_features + + def get_spec(self): + spec = ( + ('alpha', float64), + ('weights_groups', float64[:]), + ('weights_features', float64[:]), + ('grp_ptr', int32[:]), + ('grp_indices', int32[:]), + ) + return spec + + def params_to_dict(self): + return dict(alpha=self.alpha, weights_features=self.weights_features, + weights_groups=self.weights_groups, grp_ptr=self.grp_ptr, + grp_indices=self.grp_indices) + + def value(self, w): + """Value of penalty at vector ``w``.""" + grp_ptr, grp_indices = self.grp_ptr, self.grp_indices + n_grp = len(grp_ptr) - 1 + + sum_penalty = 0. + for g in range(n_grp): + grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]] + w_g = w[grp_g_indices] + + sum_penalty += self.weights_groups[g] * norm(w_g) + sum_penalty += np.sum(self.weights_features * np.abs(w)) + + return self.alpha * sum_penalty + + def prox_1group(self, value, stepsize, g): + """Compute the proximal operator of group ``g``.""" + res = ST_vec(value, self.alpha * stepsize * self.weights_features[g]) + return BST(res, self.alpha * stepsize * self.weights_groups[g]) + + def subdiff_distance(self, w, grad_ws, ws): + """Compute distance to the subdifferential at ``w`` of negative gradient. + + Refer to :ref:`subdiff_positive_group_lasso` for details of the derivation. + + Note: + ---- + ``grad_ws`` is a stacked array of gradients ``[grad_ws_1, grad_ws_2, ...]``. + """ + raise NotImplementedError("Too hard for now") + + def is_penalized(self, n_groups): + return np.ones(n_groups, dtype=np.bool_) + + def generalized_support(self, w): + grp_indices, grp_ptr = self.grp_indices, self.grp_ptr + n_groups = len(grp_ptr) - 1 + is_penalized = self.is_penalized(n_groups) + + gsupp = np.zeros(n_groups, dtype=np.bool_) + for g in range(n_groups): + if not is_penalized[g]: + gsupp[g] = True + continue + + grp_g_indices = grp_indices[grp_ptr[g]: grp_ptr[g+1]] + if np.any(w[grp_g_indices]): + gsupp[g] = True + + return gsupp diff --git a/skglm/solvers/anderson_cd.py b/skglm/solvers/anderson_cd.py index 8b3437de4..87bd92008 100644 --- a/skglm/solvers/anderson_cd.py +++ b/skglm/solvers/anderson_cd.py @@ -103,6 +103,8 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): # The intercept is not taken into account in the optimality conditions since # the derivative w.r.t. to the intercept may be very large. It is not likely # to change significantly the optimality conditions. + # TODO: MM I don't understand the comment above: the intercept is + # taken into account intercept_opt 6 lines below if self.ws_strategy == "subdiff": opt = penalty.subdiff_distance(w[:n_features], grad, all_feats) elif self.ws_strategy == "fixpoint": diff --git a/skglm/solvers/common.py b/skglm/solvers/common.py index a5e7216f2..cbdb58537 100644 --- a/skglm/solvers/common.py +++ b/skglm/solvers/common.py @@ -1,10 +1,11 @@ import numpy as np from numba import njit +from numpy.linalg import norm @njit def dist_fix_point_cd(w, grad_ws, lipschitz_ws, datafit, penalty, ws): - """Compute the violation of the fixed point iterate scheme. + """Compute the violation of the fixed point iterate scheme for CD. Parameters ---------- @@ -44,6 +45,60 @@ def dist_fix_point_cd(w, grad_ws, lipschitz_ws, datafit, penalty, ws): return dist +@njit +def dist_fix_point_bcd( + w, grad_ws, lipschitz_ws, datafit, penalty, ws): + """Compute the violation of the fixed point iterate scheme for BCD. + + Parameters + ---------- + w : array, shape (n_features,) + Coefficient vector. + + grad_ws : array, shape (ws_size,) + Gradient restricted to the working set. + + lipschitz_ws : array, shape (len(ws),) + Coordinatewise gradient Lipschitz constants, restricted to working set. + + datafit: instance of BaseDatafit + Datafit. + + penalty: instance of BasePenalty + Penalty. + + ws : array, shape (len(ws),) + The working set. + + Returns + ------- + dist : array, shape (n_groups,) + Violation score for every group. + + Note: + ---- + ``grad_ws`` is a stacked array of gradients ``[grad_ws_1, grad_ws_2, ...]``. + """ + n_groups = len(penalty.grp_ptr) - 1 + dist = np.zeros(n_groups, dtype=w.dtype) + + grad_ptr = 0 + for idx, g in enumerate(ws): + if lipschitz_ws[idx] == 0.: + continue + grp_g_indices = penalty.grp_indices[penalty.grp_ptr[g]: penalty.grp_ptr[g+1]] + + grad_g = grad_ws[grad_ptr: grad_ptr + len(grp_g_indices)] + grad_ptr += len(grp_g_indices) + + step_g = 1 / lipschitz_ws[idx] + w_g = w[grp_g_indices] + dist[idx] = norm( + w_g - penalty.prox_1group(w_g - grad_g * step_g, step_g, g) + ) + return dist + + @njit def construct_grad(X, y, w, Xw, datafit, ws): """Compute the gradient of the datafit restricted to the working set. diff --git a/skglm/solvers/group_bcd.py b/skglm/solvers/group_bcd.py index fcadbbe05..b2163a160 100644 --- a/skglm/solvers/group_bcd.py +++ b/skglm/solvers/group_bcd.py @@ -5,6 +5,7 @@ from skglm.solvers.base import BaseSolver from skglm.utils.anderson import AndersonAcceleration from skglm.utils.validation import check_group_compatible +from skglm.solvers.common import dist_fix_point_bcd class GroupBCD(BaseSolver): @@ -36,17 +37,22 @@ class GroupBCD(BaseSolver): Amount of verbosity. 0/False is silent. """ - def __init__(self, max_iter=1000, max_epochs=100, p0=10, tol=1e-4, - fit_intercept=False, warm_start=False, verbose=0): + def __init__( + self, max_iter=1000, max_epochs=100, p0=10, tol=1e-4, fit_intercept=False, + warm_start=False, ws_strategy="subdiff", verbose=0): self.max_iter = max_iter self.max_epochs = max_epochs self.p0 = p0 self.tol = tol self.fit_intercept = fit_intercept self.warm_start = warm_start + self.ws_strategy = ws_strategy self.verbose = verbose def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): + if self.ws_strategy not in ("subdiff", "fixpoint"): + raise ValueError( + 'Unsupported value for self.ws_strategy:', self.ws_strategy) check_group_compatible(datafit) check_group_compatible(penalty) @@ -86,7 +92,14 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): X.data, X.indptr, X.indices, y, w, Xw, datafit, all_groups) else: grad = _construct_grad(X, y, w, Xw, datafit, all_groups) - opt = penalty.subdiff_distance(w, grad, all_groups) + + if self.ws_strategy == "subdiff": + # MM TODO: AndersonCD passes w[:n_features] here + opt = penalty.subdiff_distance(w, grad, all_groups) + elif self.ws_strategy == "fixpoint": + opt = dist_fix_point_bcd( + w[:n_features], grad, lipschitz, datafit, penalty, all_groups + ) if self.fit_intercept: intercept_opt = np.abs(datafit.intercept_update_step(y, Xw)) @@ -144,8 +157,15 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): else: grad_ws = _construct_grad(X, y, w, Xw, datafit, ws) - opt_in = penalty.subdiff_distance(w, grad_ws, ws) - stop_crit_in = np.max(opt_in) + if self.ws_strategy == "subdiff": + # TODO MM: AndersonCD uses w[:n_features] here + opt_ws = penalty.subdiff_distance(w, grad_ws, ws) + elif self.ws_strategy == "fixpoint": + opt_ws = dist_fix_point_bcd( + w, grad_ws, lipschitz[ws], datafit, penalty, ws + ) + + stop_crit_in = np.max(opt_ws) if max(self.verbose - 1, 0): p_obj = datafit.value(y, w, Xw) + penalty.value(w) diff --git a/skglm/tests/test_group.py b/skglm/tests/test_group.py index 990ba5aec..6ec839466 100644 --- a/skglm/tests/test_group.py +++ b/skglm/tests/test_group.py @@ -6,7 +6,10 @@ from skglm.penalties import L1 from skglm.datafits import Quadratic -from skglm.penalties.block_separable import WeightedGroupL2 +from skglm import GeneralizedLinearEstimator +from skglm.penalties.block_separable import ( + WeightedL1GroupL2, WeightedGroupL2 +) from skglm.datafits.group import QuadraticGroup, LogisticGroup from skglm.solvers import GroupBCD, GroupProxNewton @@ -135,6 +138,63 @@ def test_vs_celer_grouplasso(n_groups, n_features, shuffle): np.testing.assert_allclose(model.coef_, w, atol=1e-5) +def test_ws_strategy(): + n_features = 300 + X, y, _ = make_correlated_data(n_features=n_features, random_state=0) + + grp_indices, grp_ptr = grp_converter(3, n_features) + n_groups = len(grp_ptr) - 1 + weights_g = np.ones(n_groups) + alpha_max = _alpha_max_group_lasso(X, y, grp_indices, grp_ptr, weights_g) + pen = WeightedGroupL2( + alpha=alpha_max/10, weights=weights_g, grp_indices=grp_indices, grp_ptr=grp_ptr) + + solver = GroupBCD(ws_strategy="subdiff", verbose=3, fit_intercept=False, tol=1e-10) + + model = GeneralizedLinearEstimator( + QuadraticGroup(grp_ptr, grp_indices), pen, solver=solver) + + model.fit(X, y) + w_subdiff = model.coef_ + print("####") + model.solver.ws_strategy = "fixpoint" + model.fit(X, y) + w_fixpoint = model.coef_ + # should not be the eaxct same solution: + np.testing.assert_array_less(0, norm(w_fixpoint - w_subdiff)) + # but still should be close: + np.testing.assert_allclose(w_fixpoint, w_subdiff, atol=1e-8) + + +def test_sparse_group(): + n_features = 30 + X, y, _ = make_correlated_data(n_features=n_features, random_state=0) + + grp_indices, grp_ptr = grp_converter(3, n_features) + n_groups = len(grp_ptr) - 1 + + weights_g = np.ones(n_groups, dtype=np.float64) + weights_f = 0.5 * np.ones(n_features) + pen = WeightedL1GroupL2( + alpha=0.1, weights_groups=weights_g, + weights_features=weights_f, grp_indices=grp_indices, grp_ptr=grp_ptr) + + solver = GroupBCD(ws_strategy="fixpoint", verbose=3, + fit_intercept=False, tol=1e-10) + + model = GeneralizedLinearEstimator( + QuadraticGroup(grp_ptr, grp_indices), pen, solver=solver) + + model.fit(X, y) + w = model.coef_.reshape(-1, 3) + # some lines are 0: + np.testing.assert_equal(w[0], 0) + # some non zero lines have 0 entry + np.testing.assert_array_less(0, norm(w[1])) + # sign error -0 is not 0 without np.abs + np.testing.assert_equal(np.abs(w[1, 0]), 0) + + def test_intercept_grouplasso(): n_groups, n_features, shuffle = 15, 50, False n_samples = 100 From 07c49bb9c6cec9bcbe3e6034c38c3a7e94846ae3 Mon Sep 17 00:00:00 2001 From: PAB Date: Mon, 15 Jul 2024 13:41:12 +0200 Subject: [PATCH 34/52] ENH - check ``datafit + penalty`` compatibility with solver (#137) Co-authored-by: Badr-MOUFAD Co-authored-by: Badr MOUFAD <65614794+Badr-MOUFAD@users.noreply.github.com> Co-authored-by: Quentin Bertrand Co-authored-by: mathurinm Co-authored-by: mathurinm --- doc/changes/0.4.rst | 1 + skglm/__init__.py | 2 +- skglm/datafits/single_task.py | 3 - skglm/experimental/pdcd_ws.py | 37 +++----- .../tests/test_quantile_regression.py | 6 +- skglm/experimental/tests/test_sqrt_lasso.py | 6 +- skglm/penalties/block_separable.py | 11 --- skglm/penalties/non_separable.py | 10 --- skglm/solvers/anderson_cd.py | 21 ++++- skglm/solvers/base.py | 83 ++++++++++++++++- skglm/solvers/fista.py | 40 ++++++--- skglm/solvers/gram_cd.py | 13 ++- skglm/solvers/group_bcd.py | 27 +++++- skglm/solvers/group_prox_newton.py | 20 ++++- skglm/solvers/lbfgs.py | 14 ++- skglm/solvers/multitask_bcd.py | 21 ++++- skglm/solvers/prox_newton.py | 13 ++- skglm/tests/test_validation.py | 90 +++++++++++++++++++ skglm/utils/validation.py | 74 +++++++++++++++ 19 files changed, 408 insertions(+), 84 deletions(-) create mode 100644 skglm/tests/test_validation.py diff --git a/doc/changes/0.4.rst b/doc/changes/0.4.rst index c131143b6..904401fd9 100644 --- a/doc/changes/0.4.rst +++ b/doc/changes/0.4.rst @@ -4,6 +4,7 @@ Version 0.4 (in progress) ------------------------- - Add :ref:`GroupLasso Estimator ` (PR: :gh:`228`) - Add support and tutorial for positive coefficients to :ref:`Group Lasso Penalty ` (PR: :gh:`221`) +- Check compatibility with datafit and penalty in solver (PR :gh:`137`) - Add support to weight samples in the quadratic datafit :ref:`Weighted Quadratic Datafit ` (PR: :gh:`258`) diff --git a/skglm/__init__.py b/skglm/__init__.py index c134c98f2..d80de3c17 100644 --- a/skglm/__init__.py +++ b/skglm/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.3.2dev' +__version__ = '0.4dev' from skglm.estimators import ( # noqa F401 Lasso, WeightedLasso, ElasticNet, MCPRegression, MultiTaskLasso, LinearSVC, diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 5750ea295..1ccb218aa 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -661,9 +661,6 @@ def value(self, y, w, Xw): def gradient_scalar(self, X, y, w, Xw, j): return X[:, j] @ (1 - y * np.exp(-Xw)) / len(y) - def gradient_scalar_sparse(self, X_data, X_indptr, X_indices, y, Xw, j): - pass - def intercept_update_step(self, y, Xw): return np.sum(self.raw_grad(y, Xw)) diff --git a/skglm/experimental/pdcd_ws.py b/skglm/experimental/pdcd_ws.py index b81a68f5f..81e72da8c 100644 --- a/skglm/experimental/pdcd_ws.py +++ b/skglm/experimental/pdcd_ws.py @@ -5,11 +5,12 @@ from scipy.sparse import issparse from numba import njit -from skglm.utils.jit_compilation import compiled_clone +from skglm.solvers import BaseSolver + from sklearn.exceptions import ConvergenceWarning -class PDCD_WS: +class PDCD_WS(BaseSolver): r"""Primal-Dual Coordinate Descent solver with working sets. It solves @@ -78,6 +79,9 @@ class PDCD_WS: https://arxiv.org/abs/2204.07826 """ + _datafit_required_attr = ('prox_conjugate',) + _penalty_required_attr = ("prox_1d",) + def __init__(self, max_iter=1000, max_epochs=1000, dual_init=None, p0=100, tol=1e-6, verbose=False): self.max_iter = max_iter @@ -87,11 +91,7 @@ def __init__(self, max_iter=1000, max_epochs=1000, dual_init=None, self.tol = tol self.verbose = verbose - def solve(self, X, y, datafit_, penalty_, w_init=None, Xw_init=None): - if issparse(X): - raise ValueError("Sparse matrices are not yet support in PDCD_WS solver.") - - datafit, penalty = PDCD_WS._validate_init(datafit_, penalty_) + def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): n_samples, n_features = X.shape # init steps @@ -196,27 +196,12 @@ def _solve_subproblem(y, X, w, Xw, z, z_bar, datafit, penalty, if stop_crit_in <= tol_in: break - @staticmethod - def _validate_init(datafit_, penalty_): - # validate datafit - missing_attrs = [] - for attr in ('prox_conjugate', 'subdiff_distance'): - if not hasattr(datafit_, attr): - missing_attrs.append(f"`{attr}`") - - if len(missing_attrs): - raise AttributeError( - "Datafit is not compatible with PDCD_WS solver.\n" - "Datafit must implement `prox_conjugate` and `subdiff_distance`.\n" - f"Missing {' and '.join(missing_attrs)}." + def custom_checks(self, X, y, datafit, penalty): + if issparse(X): + raise ValueError( + "Sparse matrices are not yet supported in `PDCD_WS` solver." ) - # jit compile classes - compiled_datafit = compiled_clone(datafit_) - compiled_penalty = compiled_clone(penalty_) - - return compiled_datafit, compiled_penalty - @njit def _scores_primal(X, w, z, penalty, primal_steps, ws): diff --git a/skglm/experimental/tests/test_quantile_regression.py b/skglm/experimental/tests/test_quantile_regression.py index 509b7079c..65e0c1e65 100644 --- a/skglm/experimental/tests/test_quantile_regression.py +++ b/skglm/experimental/tests/test_quantile_regression.py @@ -5,6 +5,7 @@ from skglm.penalties import L1 from skglm.experimental.pdcd_ws import PDCD_WS from skglm.experimental.quantile_regression import Pinball +from skglm.utils.jit_compilation import compiled_clone from skglm.utils.data import make_correlated_data from sklearn.linear_model import QuantileRegressor @@ -21,9 +22,12 @@ def test_PDCD_WS(quantile_level): alpha_max = norm(X.T @ (np.sign(y)/2 + (quantile_level - 0.5)), ord=np.inf) alpha = alpha_max / 5 + datafit = compiled_clone(Pinball(quantile_level)) + penalty = compiled_clone(L1(alpha)) + w = PDCD_WS( dual_init=np.sign(y)/2 + (quantile_level - 0.5) - ).solve(X, y, Pinball(quantile_level), L1(alpha))[0] + ).solve(X, y, datafit, penalty)[0] clf = QuantileRegressor( quantile=quantile_level, diff --git a/skglm/experimental/tests/test_sqrt_lasso.py b/skglm/experimental/tests/test_sqrt_lasso.py index ae0d77935..b63619884 100644 --- a/skglm/experimental/tests/test_sqrt_lasso.py +++ b/skglm/experimental/tests/test_sqrt_lasso.py @@ -7,6 +7,7 @@ from skglm.experimental.sqrt_lasso import (SqrtLasso, SqrtQuadratic, _chambolle_pock_sqrt) from skglm.experimental.pdcd_ws import PDCD_WS +from skglm.utils.jit_compilation import compiled_clone def test_alpha_max(): @@ -72,7 +73,10 @@ def test_PDCD_WS(with_dual_init): dual_init = y / norm(y) if with_dual_init else None - w = PDCD_WS(dual_init=dual_init).solve(X, y, SqrtQuadratic(), L1(alpha))[0] + datafit = compiled_clone(SqrtQuadratic()) + penalty = compiled_clone(L1(alpha)) + + w = PDCD_WS(dual_init=dual_init).solve(X, y, datafit, penalty)[0] clf = SqrtLasso(alpha=alpha, fit_intercept=False, tol=1e-12).fit(X, y) np.testing.assert_allclose(clf.coef_, w, atol=1e-6) diff --git a/skglm/penalties/block_separable.py b/skglm/penalties/block_separable.py index 47161080e..091392601 100644 --- a/skglm/penalties/block_separable.py +++ b/skglm/penalties/block_separable.py @@ -458,17 +458,6 @@ def prox_1group(self, value, stepsize, g): res = ST_vec(value, self.alpha * stepsize * self.weights_features[g]) return BST(res, self.alpha * stepsize * self.weights_groups[g]) - def subdiff_distance(self, w, grad_ws, ws): - """Compute distance to the subdifferential at ``w`` of negative gradient. - - Refer to :ref:`subdiff_positive_group_lasso` for details of the derivation. - - Note: - ---- - ``grad_ws`` is a stacked array of gradients ``[grad_ws_1, grad_ws_2, ...]``. - """ - raise NotImplementedError("Too hard for now") - def is_penalized(self, n_groups): return np.ones(n_groups, dtype=np.bool_) diff --git a/skglm/penalties/non_separable.py b/skglm/penalties/non_separable.py index c27079323..58f0b8c2e 100644 --- a/skglm/penalties/non_separable.py +++ b/skglm/penalties/non_separable.py @@ -48,13 +48,3 @@ def prox_vec(self, x, stepsize): prox[sorted_indices] = prox_SLOPE(abs_x[sorted_indices], alphas * stepsize) return np.sign(x) * prox - - def prox_1d(self, value, stepsize, j): - raise ValueError( - "No coordinate-wise proximal operator for SLOPE. Use `prox_vec` instead." - ) - - def subdiff_distance(self, w, grad, ws): - return ValueError( - "No subdifferential distance for SLOPE. Use `opt_strategy='fixpoint'`" - ) diff --git a/skglm/solvers/anderson_cd.py b/skglm/solvers/anderson_cd.py index 87bd92008..d39a24086 100644 --- a/skglm/solvers/anderson_cd.py +++ b/skglm/solvers/anderson_cd.py @@ -7,6 +7,7 @@ ) from skglm.solvers.base import BaseSolver from skglm.utils.anderson import AndersonAcceleration +from skglm.utils.validation import check_attrs class AndersonCD(BaseSolver): @@ -47,6 +48,9 @@ class AndersonCD(BaseSolver): code: https://github.com/mathurinm/andersoncd """ + _datafit_required_attr = ("get_lipschitz", "gradient_scalar") + _penalty_required_attr = ("prox_1d",) + def __init__(self, max_iter=50, max_epochs=50_000, p0=10, tol=1e-4, ws_strategy="subdiff", fit_intercept=True, warm_start=False, verbose=0): @@ -59,7 +63,7 @@ def __init__(self, max_iter=50, max_epochs=50_000, p0=10, self.warm_start = warm_start self.verbose = verbose - def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): + def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): if self.ws_strategy not in ("subdiff", "fixpoint"): raise ValueError( 'Unsupported value for self.ws_strategy:', self.ws_strategy) @@ -269,6 +273,21 @@ def path(self, X, y, datafit, penalty, alphas=None, w_init=None, results += (n_iters,) return results + def custom_checks(self, X, y, datafit, penalty): + # check datafit support sparse data + check_attrs( + datafit, solver=self, + required_attr=self._datafit_required_attr, + support_sparse=sparse.issparse(X) + ) + + # ws strategy + if self.ws_strategy == "subdiff" and not hasattr(penalty, "subdiff_distance"): + raise AttributeError( + "Penalty must implement `subdiff_distance` " + "to use ws_strategy='subdiff' in solver AndersonCD." + ) + @njit def _cd_epoch(X, y, w, Xw, lc, datafit, penalty, ws): diff --git a/skglm/solvers/base.py b/skglm/solvers/base.py index 9b5c5b121..06a08a690 100644 --- a/skglm/solvers/base.py +++ b/skglm/solvers/base.py @@ -1,11 +1,38 @@ -from abc import abstractmethod +from abc import abstractmethod, ABC +from skglm.utils.validation import check_attrs -class BaseSolver(): - """Base class for solvers.""" +class BaseSolver(ABC): + """Base class for solvers. + + Attributes + ---------- + _datafit_required_attr : list + List of attributes that must be implemented in Datafit. + + _penalty_required_attr : list + List of attributes that must be implemented in Penalty. + + Notes + ----- + For required attributes, if an attribute is given as a list of attributes + it means at least one of them should be implemented. + For instance, if + + _datafit_required_attr = ( + "get_global_lipschitz", + ("gradient", "gradient_scalar") + ) + + it mean datafit must implement the methods ``get_global_lipschitz`` + and (``gradient`` or ``gradient_scaler``). + """ + + _datafit_required_attr: list + _penalty_required_attr: list @abstractmethod - def solve(self, X, y, datafit, penalty, w_init, Xw_init): + def _solve(self, X, y, datafit, penalty, w_init, Xw_init): """Solve an optimization problem. Parameters @@ -39,3 +66,51 @@ def solve(self, X, y, datafit, penalty, w_init, Xw_init): stop_crit : float Value of stopping criterion at convergence. """ + + def custom_checks(self, X, y, datafit, penalty): + """Ensure the solver is suited for the `datafit` + `penalty` problem. + + This method includes extra checks to perform + aside from checking attributes compatibility. + + Parameters + ---------- + X : array, shape (n_samples, n_features) + Training data. + + y : array, shape (n_samples,) + Target values. + + datafit : instance of BaseDatafit + Datafit. + + penalty : instance of BasePenalty + Penalty. + """ + pass + + def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None, + *, run_checks=True): + """Solve the optimization problem after validating its compatibility. + + A proxy of ``_solve`` method that implicitly ensures the compatibility + of ``datafit`` and ``penalty`` with the solver. + + Examples + -------- + >>> ... + >>> coefs, obj_out, stop_crit = solver.solve(X, y, datafit, penalty) + """ + if run_checks: + self._validate(X, y, datafit, penalty) + + return self._solve(X, y, datafit, penalty, w_init, Xw_init) + + def _validate(self, X, y, datafit, penalty): + # execute: `custom_checks` then check attributes + self.custom_checks(X, y, datafit, penalty) + + # do not check for sparse support here, make the check at the solver level + # some solvers like ProxNewton don't require methods for sparse support + check_attrs(datafit, self, self._datafit_required_attr) + check_attrs(penalty, self, self._penalty_required_attr) diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index b4653079a..e0933a111 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -3,6 +3,7 @@ from skglm.solvers.base import BaseSolver from skglm.solvers.common import construct_grad, construct_grad_sparse from skglm.utils.prox_funcs import _prox_vec +from skglm.utils.validation import check_attrs class FISTA(BaseSolver): @@ -27,6 +28,9 @@ class FISTA(BaseSolver): https://epubs.siam.org/doi/10.1137/080716542 """ + _datafit_required_attr = ("get_global_lipschitz", ("gradient", "gradient_scalar")) + _penalty_required_attr = (("prox_1d", "prox_vec"),) + def __init__(self, max_iter=100, tol=1e-4, opt_strategy="subdiff", verbose=0): self.max_iter = max_iter self.tol = tol @@ -35,7 +39,7 @@ def __init__(self, max_iter=100, tol=1e-4, opt_strategy="subdiff", verbose=0): self.fit_intercept = False # needed to be passed to GeneralizedLinearEstimator self.warm_start = False - def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): + def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): p_objs_out = [] n_samples, n_features = X.shape all_features = np.arange(n_features) @@ -46,19 +50,12 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): z = w_init.copy() if w_init is not None else np.zeros(n_features) Xw = Xw_init.copy() if Xw_init is not None else np.zeros(n_samples) - try: - if X_is_sparse: - lipschitz = datafit.get_global_lipschitz_sparse( - X.data, X.indptr, X.indices, y - ) - else: - lipschitz = datafit.get_global_lipschitz(X, y) - except AttributeError as e: - sparse_suffix = '_sparse' if X_is_sparse else '' - - raise Exception( - "Datafit is not compatible with FISTA solver.\n Datafit must " - f"implement `get_global_lipschitz{sparse_suffix}` method") from e + if X_is_sparse: + lipschitz = datafit.get_global_lipschitz_sparse( + X.data, X.indptr, X.indices, y + ) + else: + lipschitz = datafit.get_global_lipschitz(X, y) for n_iter in range(self.max_iter): t_old = t_new @@ -111,3 +108,18 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): print(f"Stopping criterion max violation: {stop_crit:.2e}") break return w, np.array(p_objs_out), stop_crit + + def custom_checks(self, X, y, datafit, penalty): + # check datafit support sparse data + check_attrs( + datafit, solver=self, + required_attr=self._datafit_required_attr, + support_sparse=issparse(X) + ) + + # optimality check + if self.opt_strategy == "subdiff" and not hasattr(penalty, "subdiff_distance"): + raise AttributeError( + "Penalty must implement `subdiff_distance` " + "to use `opt_strategy='subdiff'` in Fista solver." + ) diff --git a/skglm/solvers/gram_cd.py b/skglm/solvers/gram_cd.py index d18b165b3..9ecf42bfb 100644 --- a/skglm/solvers/gram_cd.py +++ b/skglm/solvers/gram_cd.py @@ -2,6 +2,7 @@ import numpy as np from numba import njit from scipy.sparse import issparse + from skglm.solvers.base import BaseSolver from skglm.utils.anderson import AndersonAcceleration @@ -49,6 +50,9 @@ class GramCD(BaseSolver): Amount of verbosity. 0/False is silent. """ + _datafit_required_attr = () + _penalty_required_attr = ("prox_1d", "subdiff_distance") + def __init__(self, max_iter=100, use_acc=False, greedy_cd=True, tol=1e-4, fit_intercept=True, warm_start=False, verbose=0): self.max_iter = max_iter @@ -59,7 +63,7 @@ def __init__(self, max_iter=100, use_acc=False, greedy_cd=True, tol=1e-4, self.warm_start = warm_start self.verbose = verbose - def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): + def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): # we don't pass Xw_init as the solver uses Gram updates # to keep the gradient up-to-date instead of Xw n_samples, n_features = X.shape @@ -132,6 +136,13 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): p_objs_out.append(p_obj) return w, np.array(p_objs_out), stop_crit + def custom_checks(self, X, y, datafit, penalty): + if datafit is not None: + raise AttributeError( + "`GramCD` supports only `Quadratic` datafit and fits it implicitly, " + f"argument `datafit` must be `None`, got {datafit.__class__.__name__}." + ) + @njit def _gram_cd_epoch(scaled_gram, w, grad, penalty, greedy_cd): diff --git a/skglm/solvers/group_bcd.py b/skglm/solvers/group_bcd.py index b2163a160..c7b515dad 100644 --- a/skglm/solvers/group_bcd.py +++ b/skglm/solvers/group_bcd.py @@ -4,7 +4,7 @@ from skglm.solvers.base import BaseSolver from skglm.utils.anderson import AndersonAcceleration -from skglm.utils.validation import check_group_compatible +from skglm.utils.validation import check_group_compatible, check_attrs from skglm.solvers.common import dist_fix_point_bcd @@ -37,6 +37,9 @@ class GroupBCD(BaseSolver): Amount of verbosity. 0/False is silent. """ + _datafit_required_attr = ("get_lipschitz", "gradient_g") + _penalty_required_attr = ("prox_1group",) + def __init__( self, max_iter=1000, max_epochs=100, p0=10, tol=1e-4, fit_intercept=False, warm_start=False, ws_strategy="subdiff", verbose=0): @@ -49,12 +52,10 @@ def __init__( self.ws_strategy = ws_strategy self.verbose = verbose - def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): + def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): if self.ws_strategy not in ("subdiff", "fixpoint"): raise ValueError( 'Unsupported value for self.ws_strategy:', self.ws_strategy) - check_group_compatible(datafit) - check_group_compatible(penalty) n_samples, n_features = X.shape n_groups = len(penalty.grp_ptr) - 1 @@ -181,6 +182,24 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): return w, p_objs_out, stop_crit + def custom_checks(self, X, y, datafit, penalty): + check_group_compatible(datafit) + check_group_compatible(penalty) + + # check datafit support sparse data + check_attrs( + datafit, solver=self, + required_attr=self._datafit_required_attr, + support_sparse=issparse(X) + ) + + # ws strategy + if self.ws_strategy == "subdiff" and not hasattr(penalty, "subdiff_distance"): + raise AttributeError( + "Penalty must implement `subdiff_distance` " + "to use ws_strategy='subdiff'." + ) + @njit def _bcd_epoch(X, y, w, Xw, lipschitz, datafit, penalty, ws): diff --git a/skglm/solvers/group_prox_newton.py b/skglm/solvers/group_prox_newton.py index 929ab853b..1492651c3 100644 --- a/skglm/solvers/group_prox_newton.py +++ b/skglm/solvers/group_prox_newton.py @@ -1,9 +1,12 @@ import numpy as np from numba import njit from numpy.linalg import norm +from scipy.sparse import issparse + from skglm.solvers.base import BaseSolver from skglm.utils.validation import check_group_compatible + EPS_TOL = 0.3 MAX_CD_ITER = 20 MAX_BACKTRACK_ITER = 20 @@ -41,6 +44,9 @@ class GroupProxNewton(BaseSolver): code: https://github.com/tbjohns/BlitzL1 """ + _datafit_required_attr = ("raw_grad", "raw_hessian") + _penalty_required_attr = ("prox_1group", "subdiff_distance") + def __init__(self, p0=10, max_iter=20, max_pn_iter=1000, tol=1e-4, fit_intercept=False, warm_start=False, verbose=0): self.p0 = p0 @@ -51,10 +57,7 @@ def __init__(self, p0=10, max_iter=20, max_pn_iter=1000, tol=1e-4, self.warm_start = warm_start self.verbose = verbose - def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): - check_group_compatible(datafit) - check_group_compatible(penalty) - + def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): fit_intercept = self.fit_intercept n_samples, n_features = X.shape grp_ptr, grp_indices = penalty.grp_ptr, penalty.grp_indices @@ -142,6 +145,15 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): p_objs_out.append(p_obj) return w, np.asarray(p_objs_out), stop_crit + def custom_checks(self, X, y, datafit, penalty): + check_group_compatible(datafit) + check_group_compatible(penalty) + + if issparse(X): + raise ValueError( + "Sparse matrices are not yet supported in `GroupProxNewton` solver." + ) + @njit def _descent_direction(X, y, w_epoch, Xw_epoch, fit_intercept, grad_ws, datafit, diff --git a/skglm/solvers/lbfgs.py b/skglm/solvers/lbfgs.py index 5e7e03051..438c8b97b 100644 --- a/skglm/solvers/lbfgs.py +++ b/skglm/solvers/lbfgs.py @@ -7,6 +7,7 @@ from scipy.sparse import issparse from skglm.solvers import BaseSolver +from skglm.utils.validation import check_attrs class LBFGS(BaseSolver): @@ -27,12 +28,15 @@ class LBFGS(BaseSolver): Amount of verbosity. 0/False is silent. """ + _datafit_required_attr = ("gradient",) + _penalty_required_attr = ("gradient",) + def __init__(self, max_iter=50, tol=1e-4, verbose=False): self.max_iter = max_iter self.tol = tol self.verbose = verbose - def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): + def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): def objective(w): Xw = X @ w @@ -102,3 +106,11 @@ def callback_post_iter(w_k): stop_crit = norm(result.jac, ord=np.inf) return w, np.asarray(p_objs_out), stop_crit + + def custom_checks(self, X, y, datafit, penalty): + # check datafit support sparse data + check_attrs( + datafit, solver=self, + required_attr=self._datafit_required_attr, + support_sparse=issparse(X) + ) diff --git a/skglm/solvers/multitask_bcd.py b/skglm/solvers/multitask_bcd.py index 16301ac4a..5a8dfa5e6 100644 --- a/skglm/solvers/multitask_bcd.py +++ b/skglm/solvers/multitask_bcd.py @@ -4,11 +4,15 @@ from numpy.linalg import norm from sklearn.utils import check_array from skglm.solvers.base import BaseSolver +from skglm.utils.validation import check_attrs class MultiTaskBCD(BaseSolver): """Block coordinate descent solver for multi-task problems.""" + _datafit_required_attr = ("get_lipschitz", "gradient_j") + _penalty_required_attr = ("prox_1feat",) + def __init__(self, max_iter=100, max_epochs=50_000, p0=10, tol=1e-6, use_acc=True, ws_strategy="subdiff", fit_intercept=True, warm_start=False, verbose=0): @@ -22,7 +26,7 @@ def __init__(self, max_iter=100, max_epochs=50_000, p0=10, tol=1e-6, self.warm_start = warm_start self.verbose = verbose - def solve(self, X, Y, datafit, penalty, W_init=None, XW_init=None): + def _solve(self, X, Y, datafit, penalty, W_init=None, XW_init=None): n_samples, n_features = X.shape n_tasks = Y.shape[1] pen = penalty.is_penalized(n_features) @@ -231,6 +235,21 @@ def path(self, X, Y, datafit, penalty, alphas, W_init=None, return_n_iter=False) return results + def custom_checks(self, X, y, datafit, penalty): + # check datafit support sparse data + check_attrs( + datafit, solver=self, + required_attr=self._datafit_required_attr, + support_sparse=sparse.issparse(X) + ) + + # ws strategy + if self.ws_strategy == "subdiff" and not hasattr(penalty, "subdiff_distance"): + raise AttributeError( + "Penalty must implement `subdiff_distance` " + "to use ws_strategy='subdiff'." + ) + @njit def dist_fix_point_bcd(W, grad_ws, lipschitz_ws, datafit, penalty, ws): diff --git a/skglm/solvers/prox_newton.py b/skglm/solvers/prox_newton.py index 4b8e0aaf7..76867c7d8 100644 --- a/skglm/solvers/prox_newton.py +++ b/skglm/solvers/prox_newton.py @@ -52,6 +52,9 @@ class ProxNewton(BaseSolver): code: https://github.com/tbjohns/BlitzL1 """ + _datafit_required_attr = ("raw_grad", "raw_hessian") + _penalty_required_attr = ("prox_1d",) + def __init__(self, p0=10, max_iter=20, max_pn_iter=1000, tol=1e-4, ws_strategy="subdiff", fit_intercept=True, warm_start=False, verbose=0): @@ -64,7 +67,7 @@ def __init__(self, p0=10, max_iter=20, max_pn_iter=1000, tol=1e-4, self.warm_start = warm_start self.verbose = verbose - def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): + def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): if self.ws_strategy not in ("subdiff", "fixpoint"): raise ValueError("ws_strategy must be `subdiff` or `fixpoint`, " f"got {self.ws_strategy}.") @@ -196,6 +199,14 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): ) return w, np.asarray(p_objs_out), stop_crit + def custom_checks(self, X, y, datafit, penalty): + # ws strategy + if self.ws_strategy == "subdiff" and not hasattr(penalty, "subdiff_distance"): + raise AttributeError( + "Penalty must implement `subdiff_distance` " + "to use ws_strategy='subdiff' in ProxNewton solver" + ) + @njit def _descent_direction(X, y, w_epoch, Xw_epoch, fit_intercept, grad_ws, datafit, diff --git a/skglm/tests/test_validation.py b/skglm/tests/test_validation.py new file mode 100644 index 000000000..7e998bfb8 --- /dev/null +++ b/skglm/tests/test_validation.py @@ -0,0 +1,90 @@ +import pytest +import numpy as np +from scipy import sparse + +from skglm.penalties import L1, WeightedL1GroupL2, WeightedGroupL2 +from skglm.datafits import Poisson, Huber, QuadraticGroup, LogisticGroup +from skglm.solvers import FISTA, ProxNewton, GroupBCD, GramCD, GroupProxNewton + +from skglm.utils.data import grp_converter +from skglm.utils.data import make_correlated_data +from skglm.utils.jit_compilation import compiled_clone + + +def test_datafit_penalty_solver_compatibility(): + grp_size, n_features = 3, 9 + n_samples = 10 + X, y, _ = make_correlated_data(n_samples, n_features) + X_sparse = sparse.csc_array(X) + + n_groups = n_features // grp_size + weights_groups = np.ones(n_groups) + weights_features = np.ones(n_features) + grp_indices, grp_ptr = grp_converter(grp_size, n_features) + + # basic compatibility checks + with pytest.raises( + AttributeError, match="Missing `raw_grad` and `raw_hessian`" + ): + ProxNewton()._validate( + X, y, compiled_clone(Huber(1.)), compiled_clone(L1(1.)) + ) + with pytest.raises( + AttributeError, match="Missing `get_global_lipschitz`" + ): + FISTA()._validate( + X, y, compiled_clone(Poisson()), compiled_clone(L1(1.)) + ) + with pytest.raises( + AttributeError, match="Missing `get_global_lipschitz`" + ): + FISTA()._validate( + X, y, compiled_clone(Poisson()), compiled_clone(L1(1.)) + ) + # check Gram Solver + with pytest.raises( + AttributeError, match="`GramCD` supports only `Quadratic` datafit" + ): + GramCD()._validate( + X, y, compiled_clone(Poisson()), compiled_clone(L1(1.)) + ) + # check working set strategy subdiff + with pytest.raises( + AttributeError, match="Penalty must implement `subdiff_distance`" + ): + GroupBCD()._validate( + X, y, + datafit=compiled_clone(QuadraticGroup(grp_ptr, grp_indices)), + penalty=compiled_clone( + WeightedL1GroupL2( + 1., weights_groups, weights_features, grp_ptr, grp_indices) + ) + ) + # checks for sparsity + with pytest.raises( + ValueError, + match="Sparse matrices are not yet supported in `GroupProxNewton` solver." + ): + GroupProxNewton()._validate( + X_sparse, y, + datafit=compiled_clone(QuadraticGroup(grp_ptr, grp_indices)), + penalty=compiled_clone( + WeightedL1GroupL2( + 1., weights_groups, weights_features, grp_ptr, grp_indices) + ) + ) + with pytest.raises( + AttributeError, + match="LogisticGroup is not compatible with solver GroupBCD with sparse data." + ): + GroupBCD()._validate( + X_sparse, y, + datafit=compiled_clone(LogisticGroup(grp_ptr, grp_indices)), + penalty=compiled_clone( + WeightedGroupL2(1., weights_groups, grp_ptr, grp_indices) + ) + ) + + +if __name__ == "__main__": + pass diff --git a/skglm/utils/validation.py b/skglm/utils/validation.py index 0da22df40..264ad4bb7 100644 --- a/skglm/utils/validation.py +++ b/skglm/utils/validation.py @@ -1,3 +1,8 @@ +import re + + +SPARSE_SUFFIX = "_sparse" + def check_group_compatible(obj): """Check whether ``obj`` is compatible with ``bcd_solver``. @@ -23,3 +28,72 @@ def check_group_compatible(obj): f"'{obj_name}' is not block-separable. " f"Missing '{attr}' attribute." ) + + +def check_attrs(obj, solver, required_attr, support_sparse=False): + """Check whether datafit or penalty is compatible with solver. + + Parameters + ---------- + obj : Instance of Datafit or Penalty + The instance Datafit (or Penalty) to check. + + solver : Instance of Solver + The instance of Solver to check. + + required_attr : List or tuple of strings + The attributes that ``obj`` must have. + + support_sparse : bool, default False + If ``True`` adds a ``SPARSE_SUFFIX`` to check compatibility with sparse data. + + Raises + ------ + AttributeError + if any of the attribute in ``required_attr`` is missing + from ``obj`` attributes. + """ + missing_attrs = [] + suffix = SPARSE_SUFFIX if support_sparse else "" + + # if `attr` is a list, check that at least one of them + # is within `obj` attributes + for attr in required_attr: + attributes = attr if not isinstance(attr, str) else (attr,) + + for a in attributes: + if hasattr(obj, f"{a}{suffix}"): + break + else: + missing_attrs.append(_join_attrs_with_or(attributes, suffix)) + + if len(missing_attrs): + required_attr = [_join_attrs_with_or(attrs, suffix) for attrs in required_attr] + + # get name obj and solver + name_matcher = re.compile(r"\.(\w+)'>") + + obj_name = name_matcher.search(str(obj.__class__)).group(1) + solver_name = name_matcher.search(str(solver.__class__)).group(1) + + if not support_sparse: + err_message = f"{obj_name} is not compatible with solver {solver_name}." + else: + err_message = (f"{obj_name} is not compatible with solver {solver_name} " + "with sparse data.") + + err_message += (f" It must implement {' and '.join(required_attr)}.\n" + f"Missing {' and '.join(missing_attrs)}.") + + raise AttributeError(err_message) + + +def _join_attrs_with_or(attrs, suffix=""): + if isinstance(attrs, str): + return f"`{attrs}{suffix}`" + + if len(attrs) == 1: + return f"`{attrs[0]}{suffix}`" + + out = " or ".join([f"`{a}{suffix}`" for a in attrs]) + return f"({out})" From ab8b9d715368ce27081418ca0e29c6d142e6c514 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Wed, 17 Jul 2024 09:03:02 +0200 Subject: [PATCH 35/52] MNT Update pull_request_template.md (#271) --- .github/pull_request_template.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 68ecdb2ae..eacb1985e 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -15,5 +15,5 @@ Any detail to be able to relate the PR changes ### Checks before merging PR - [ ] added documentation for any new feature -- [ ] added unittests -- [ ] edited the [what's new](../doc/changes/whats_new.rst)(if applicable) +- [ ] added unit tests +- [ ] edited the [what's new](../doc/changes/whats_new.rst) (if applicable) From b35ff5b1a11c058cba58f0f9b70da979d412fd71 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Thu, 1 Aug 2024 16:44:23 +0200 Subject: [PATCH 36/52] FIX make PDCD_WS solver usable in GeneralizedLinearEstimator (#274) Co-authored-by: Badr-MOUFAD --- skglm/experimental/pdcd_ws.py | 8 ++++++-- skglm/experimental/tests/test_quantile_regression.py | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/skglm/experimental/pdcd_ws.py b/skglm/experimental/pdcd_ws.py index 81e72da8c..5ef49e5d4 100644 --- a/skglm/experimental/pdcd_ws.py +++ b/skglm/experimental/pdcd_ws.py @@ -82,13 +82,17 @@ class PDCD_WS(BaseSolver): _datafit_required_attr = ('prox_conjugate',) _penalty_required_attr = ("prox_1d",) - def __init__(self, max_iter=1000, max_epochs=1000, dual_init=None, - p0=100, tol=1e-6, verbose=False): + def __init__( + self, max_iter=1000, max_epochs=1000, dual_init=None, p0=100, tol=1e-6, + fit_intercept=False, warm_start=True, verbose=False + ): self.max_iter = max_iter self.max_epochs = max_epochs self.dual_init = dual_init self.p0 = p0 self.tol = tol + self.fit_intercept = fit_intercept # TODO not handled + self.warm_start = warm_start # TODO not handled self.verbose = verbose def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): diff --git a/skglm/experimental/tests/test_quantile_regression.py b/skglm/experimental/tests/test_quantile_regression.py index 65e0c1e65..f4d1aa914 100644 --- a/skglm/experimental/tests/test_quantile_regression.py +++ b/skglm/experimental/tests/test_quantile_regression.py @@ -3,6 +3,7 @@ from numpy.linalg import norm from skglm.penalties import L1 +from skglm import GeneralizedLinearEstimator from skglm.experimental.pdcd_ws import PDCD_WS from skglm.experimental.quantile_regression import Pinball from skglm.utils.jit_compilation import compiled_clone @@ -37,6 +38,13 @@ def test_PDCD_WS(quantile_level): ).fit(X, y) np.testing.assert_allclose(w, clf.coef_, atol=1e-5) + # test compatibility when inside GLM: + estimator = GeneralizedLinearEstimator( + datafit=Pinball(.2), + penalty=L1(alpha=1.), + solver=PDCD_WS(), + ) + estimator.fit(X, y) if __name__ == '__main__': From c6325e9de87ee24a906da7dea9716e1fbc9b2399 Mon Sep 17 00:00:00 2001 From: AnavAgrawal <70776064+AnavAgrawal@users.noreply.github.com> Date: Wed, 6 Nov 2024 01:51:53 +0530 Subject: [PATCH 37/52] ENH - Adds support for L1 + L2 regularization in SparseLogisticRegression (#278) Co-authored-by: mathurinm --- skglm/estimators.py | 14 +++++++++++--- skglm/tests/test_estimators.py | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/skglm/estimators.py b/skglm/estimators.py index cc488a422..8fb32e75b 100644 --- a/skglm/estimators.py +++ b/skglm/estimators.py @@ -967,6 +967,12 @@ class SparseLogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstim alpha : float, default=1.0 Regularization strength; must be a positive float. + l1_ratio : float, default=1.0 + The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + tol : float, optional Stopping criterion for the optimization. @@ -1003,10 +1009,11 @@ class SparseLogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstim Number of subproblems solved to reach the specified tolerance. """ - def __init__(self, alpha=1.0, tol=1e-4, max_iter=20, max_epochs=1_000, verbose=0, - fit_intercept=True, warm_start=False): + def __init__(self, alpha=1.0, l1_ratio=1.0, tol=1e-4, max_iter=20, max_epochs=1_000, + verbose=0, fit_intercept=True, warm_start=False): super().__init__() self.alpha = alpha + self.l1_ratio = l1_ratio self.tol = tol self.max_iter = max_iter self.max_epochs = max_epochs @@ -1035,7 +1042,8 @@ def fit(self, X, y): max_iter=self.max_iter, max_pn_iter=self.max_epochs, tol=self.tol, fit_intercept=self.fit_intercept, warm_start=self.warm_start, verbose=self.verbose) - return _glm_fit(X, y, self, Logistic(), L1(self.alpha), solver) + return _glm_fit(X, y, self, Logistic(), L1_plus_L2(self.alpha, self.l1_ratio), + solver) def predict_proba(self, X): """Probability estimates. diff --git a/skglm/tests/test_estimators.py b/skglm/tests/test_estimators.py index 0998cfbe5..ec7536f19 100644 --- a/skglm/tests/test_estimators.py +++ b/skglm/tests/test_estimators.py @@ -600,5 +600,25 @@ def test_GroupLasso_estimator_sparse_vs_dense(positive): np.testing.assert_allclose(coef_sparse, coef_dense, atol=1e-7, rtol=1e-5) +@pytest.mark.parametrize("X, l1_ratio", product([X, X_sparse], [1., 0.7, 0.])) +def test_SparseLogReg_elasticnet(X, l1_ratio): + + estimator_sk = clone(dict_estimators_sk['LogisticRegression']) + estimator_ours = clone(dict_estimators_ours['LogisticRegression']) + estimator_sk.set_params(fit_intercept=True, solver='saga', + penalty='elasticnet', l1_ratio=l1_ratio, max_iter=10_000) + estimator_ours.set_params(fit_intercept=True, l1_ratio=l1_ratio, max_iter=10_000) + + estimator_sk.fit(X, y) + estimator_ours.fit(X, y) + coef_sk = estimator_sk.coef_ + coef_ours = estimator_ours.coef_ + + np.testing.assert_array_less(1e-5, norm(coef_ours)) + np.testing.assert_allclose(coef_ours, coef_sk, atol=1e-6) + np.testing.assert_allclose( + estimator_sk.intercept_, estimator_ours.intercept_, rtol=1e-4) + + if __name__ == "__main__": pass From 7d274d898a8e34062a4ec496aee54250d3813869 Mon Sep 17 00:00:00 2001 From: Titouan Vayer Date: Thu, 7 Nov 2024 17:42:34 +0100 Subject: [PATCH 38/52] FEAT Add QuadraticHessian datafit (#279) Co-authored-by: mathurinm --- doc/api.rst | 3 ++- skglm/datafits/__init__.py | 5 +++-- skglm/datafits/single_task.py | 35 +++++++++++++++++++++++++++++++++++ skglm/estimators.py | 2 +- skglm/tests/test_datafits.py | 21 ++++++++++++++++++++- 5 files changed, 61 insertions(+), 5 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 6781ac9df..5ad56081b 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -68,6 +68,7 @@ Datafits Poisson Quadratic QuadraticGroup + QuadraticHessian QuadraticSVC WeightedQuadratic @@ -102,4 +103,4 @@ Experimental PDCD_WS Pinball SqrtQuadratic - SqrtLasso \ No newline at end of file + SqrtLasso diff --git a/skglm/datafits/__init__.py b/skglm/datafits/__init__.py index b8515e543..74e0e5d75 100644 --- a/skglm/datafits/__init__.py +++ b/skglm/datafits/__init__.py @@ -1,6 +1,6 @@ from .base import BaseDatafit, BaseMultitaskDatafit from .single_task import (Quadratic, QuadraticSVC, Logistic, Huber, Poisson, Gamma, - Cox, WeightedQuadratic,) + Cox, WeightedQuadratic, QuadraticHessian,) from .multi_task import QuadraticMultiTask from .group import QuadraticGroup, LogisticGroup @@ -9,5 +9,6 @@ BaseDatafit, BaseMultitaskDatafit, Quadratic, QuadraticSVC, Logistic, Huber, Poisson, Gamma, Cox, QuadraticMultiTask, - QuadraticGroup, LogisticGroup, WeightedQuadratic + QuadraticGroup, LogisticGroup, WeightedQuadratic, + QuadraticHessian ] diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py index 1ccb218aa..a0bd28736 100644 --- a/skglm/datafits/single_task.py +++ b/skglm/datafits/single_task.py @@ -239,6 +239,41 @@ def intercept_update_step(self, y, Xw): return np.sum(self.sample_weights * (Xw - y)) / self.sample_weights.sum() +class QuadraticHessian(BaseDatafit): + r"""Quadratic datafit where we pass the Hessian A directly. + + The datafit reads: + + .. math:: 1 / 2 x^(\top) A x + \langle b, x \rangle + + For a symmetric A. Up to a constant, it is the same as a Quadratic, with + :math:`A = 1 / (n_"samples") X^(\top)X` and :math:`b = - 1 / n_"samples" X^(\top)y`. + When the Hessian is available, this datafit is more efficient than using Quadratic. + """ + + def __init__(self): + pass + + def get_spec(self): + pass + + def params_to_dict(self): + return dict() + + def get_lipschitz(self, A, b): + n_features = A.shape[0] + lipschitz = np.zeros(n_features, dtype=A.dtype) + for j in range(n_features): + lipschitz[j] = A[j, j] + return lipschitz + + def gradient_scalar(self, A, b, w, Ax, j): + return Ax[j] + b[j] + + def value(self, b, x, Ax): + return 0.5 * (x*Ax).sum() + (b*x).sum() + + @njit def sigmoid(x): """Vectorwise sigmoid.""" diff --git a/skglm/estimators.py b/skglm/estimators.py index 8fb32e75b..a785fd5c5 100644 --- a/skglm/estimators.py +++ b/skglm/estimators.py @@ -21,7 +21,7 @@ from skglm.utils.jit_compilation import compiled_clone from skglm.solvers import AndersonCD, MultiTaskBCD, GroupBCD from skglm.datafits import (Cox, Quadratic, Logistic, QuadraticSVC, - QuadraticMultiTask, QuadraticGroup) + QuadraticMultiTask, QuadraticGroup,) from skglm.penalties import (L1, WeightedL1, L1_plus_L2, L2, WeightedGroupL2, MCPenalty, WeightedMCPenalty, IndicatorBox, L2_1) from skglm.utils.data import grp_converter diff --git a/skglm/tests/test_datafits.py b/skglm/tests/test_datafits.py index a6068b2ec..cdd77df47 100644 --- a/skglm/tests/test_datafits.py +++ b/skglm/tests/test_datafits.py @@ -6,7 +6,7 @@ from numpy.testing import assert_allclose, assert_array_less from skglm.datafits import (Huber, Logistic, Poisson, Gamma, Cox, WeightedQuadratic, - Quadratic,) + Quadratic, QuadraticHessian) from skglm.penalties import L1, WeightedL1 from skglm.solvers import AndersonCD, ProxNewton from skglm import GeneralizedLinearEstimator @@ -219,5 +219,24 @@ def test_sample_weights(fit_intercept): # np.testing.assert_equal(n_iter, n_iter_overs) +def test_HessianQuadratic(): + n_samples = 20 + n_features = 10 + X, y, _ = make_correlated_data( + n_samples=n_samples, n_features=n_features, random_state=0) + A = X.T @ X / n_samples + b = -X.T @ y / n_samples + alpha = np.max(np.abs(b)) / 10 + + pen = L1(alpha) + solv = AndersonCD(warm_start=False, verbose=2, fit_intercept=False) + lasso = GeneralizedLinearEstimator(Quadratic(), pen, solv).fit(X, y) + qpl1 = GeneralizedLinearEstimator(QuadraticHessian(), pen, solv).fit(A, b) + + np.testing.assert_allclose(lasso.coef_, qpl1.coef_) + # check that it's not just because we got alpha too high and thus 0 coef + np.testing.assert_array_less(0.1, np.max(np.abs(qpl1.coef_))) + + if __name__ == '__main__': pass From 75b92cc606bc83b10de7f9ed4ed8df3b55f22995 Mon Sep 17 00:00:00 2001 From: floko Date: Wed, 2 Apr 2025 12:44:16 +0200 Subject: [PATCH 39/52] Docstring update for L2 penalty in SparseLogisticRegression (#281) --- doc/changes/0.4.rst | 2 +- skglm/estimators.py | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/doc/changes/0.4.rst b/doc/changes/0.4.rst index 904401fd9..3a998c4d0 100644 --- a/doc/changes/0.4.rst +++ b/doc/changes/0.4.rst @@ -6,7 +6,7 @@ Version 0.4 (in progress) - Add support and tutorial for positive coefficients to :ref:`Group Lasso Penalty ` (PR: :gh:`221`) - Check compatibility with datafit and penalty in solver (PR :gh:`137`) - Add support to weight samples in the quadratic datafit :ref:`Weighted Quadratic Datafit ` (PR: :gh:`258`) - +- Add support for ElasticNet regularization (`penalty="l1_plus_l2"`) to :ref:`SparseLogisticRegression ` (PR: :gh:`244`) Version 0.3.1 (2023/12/21) -------------------------- diff --git a/skglm/estimators.py b/skglm/estimators.py index a785fd5c5..4d8bda0ed 100644 --- a/skglm/estimators.py +++ b/skglm/estimators.py @@ -959,8 +959,15 @@ class SparseLogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstim The optimization objective for sparse Logistic regression is: - .. math:: 1 / n_"samples" sum_(i=1)^(n_"samples") log(1 + exp(-y_i x_i^T w)) - + alpha ||w||_1 + .. math:: + 1 / n_"samples" \sum_{i=1}^{n_"samples"} log(1 + exp(-y_i x_i^T w)) + + tt"l1_ratio" xx alpha ||w||_1 + + (1 - tt"l1_ratio") xx alpha/2 ||w||_2 ^ 2 + + By default, ``l1_ratio=1.0`` corresponds to Lasso (pure L1 penalty). + When ``0 < l1_ratio < 1``, the penalty is a convex combination of L1 and L2 + (i.e., ElasticNet). ``l1_ratio=0.0`` corresponds to Ridge (pure L2), but note + that pure Ridge is not typically used with this class. Parameters ---------- @@ -968,10 +975,11 @@ class SparseLogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstim Regularization strength; must be a positive float. l1_ratio : float, default=1.0 - The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For - ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it - is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a - combination of L1 and L2. + The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. + Only used when ``penalty="l1_plus_l2"``. + For ``l1_ratio = 0`` the penalty is an L2 penalty. + ``For l1_ratio = 1`` it is an L1 penalty. + For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. tol : float, optional Stopping criterion for the optimization. From 083736564cd1529d1955fe68fdb5e2ca1dbf3314 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Fri, 4 Apr 2025 17:35:26 +0200 Subject: [PATCH 40/52] FIX/MNT install R with conda and use python 3.10 on test workflow (#282) --- .github/workflows/main.yml | 7 ++++--- skglm/estimators.py | 14 +++++++------- skglm/tests/test_penalties.py | 21 ++++++++++----------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 656344f52..ef8d7efa7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,10 +13,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - name: Set up Python 3.8 + - name: Set up Python 3.10 uses: actions/setup-python@v3 with: - python-version: 3.8 + python-version: "3.10" - name: Install package and testing tools run: | python -m pip install --upgrade pip @@ -24,9 +24,10 @@ jobs: pip install .[test] - name: Install other dependencies run: | + conda install rpy2 -c conda-forge pip install celer pip install statsmodels cvxopt - pip install git+https://github.com/jolars/pyslope.git + pip install git+https://github.com/jolars/sortedl1.git # for testing Cox estimator pip install lifelines pip install pandas diff --git a/skglm/estimators.py b/skglm/estimators.py index 4d8bda0ed..870b1cbe7 100644 --- a/skglm/estimators.py +++ b/skglm/estimators.py @@ -10,7 +10,7 @@ from sklearn.utils.validation import (check_is_fitted, check_array, check_consistent_length) from sklearn.linear_model._base import ( - LinearModel, RegressorMixin, + RegressorMixin, LinearModel, LinearClassifierMixin, SparseCoefMixin, BaseEstimator ) from sklearn.utils.extmath import softmax @@ -302,7 +302,7 @@ def get_params(self, deep=False): return params -class Lasso(LinearModel, RegressorMixin): +class Lasso(RegressorMixin, LinearModel): r"""Lasso estimator based on Celer solver and primal extrapolation. The optimization objective for Lasso is: @@ -449,7 +449,7 @@ def path(self, X, y, alphas, coef_init=None, return_n_iter=True, **params): return solver.path(X, y, datafit, penalty, alphas, coef_init, return_n_iter) -class WeightedLasso(LinearModel, RegressorMixin): +class WeightedLasso(RegressorMixin, LinearModel): r"""WeightedLasso estimator based on Celer solver and primal extrapolation. The optimization objective for WeightedLasso is: @@ -612,7 +612,7 @@ def fit(self, X, y): return _glm_fit(X, y, self, Quadratic(), penalty, solver) -class ElasticNet(LinearModel, RegressorMixin): +class ElasticNet(RegressorMixin, LinearModel): r"""Elastic net estimator. The optimization objective for Elastic net is: @@ -766,7 +766,7 @@ def fit(self, X, y): L1_plus_L2(self.alpha, self.l1_ratio, self.positive), solver) -class MCPRegression(LinearModel, RegressorMixin): +class MCPRegression(RegressorMixin, LinearModel): r"""Linear regression with MCP penalty estimator. The optimization objective for MCPRegression is, with :math:`x >= 0`: @@ -1381,7 +1381,7 @@ def fit(self, X, y): return self -class MultiTaskLasso(LinearModel, RegressorMixin): +class MultiTaskLasso(RegressorMixin, LinearModel): r"""MultiTaskLasso estimator. The optimization objective for MultiTaskLasso is: @@ -1557,7 +1557,7 @@ def path(self, X, Y, alphas, coef_init=None, return_n_iter=False, **params): return solver.path(X, Y, datafit, penalty, alphas, coef_init, return_n_iter) -class GroupLasso(LinearModel, RegressorMixin): +class GroupLasso(RegressorMixin, LinearModel): r"""GroupLasso estimator based on Celer solver and primal extrapolation. The optimization objective for GroupLasso is: diff --git a/skglm/tests/test_penalties.py b/skglm/tests/test_penalties.py index 9af234c15..0f1bddd92 100644 --- a/skglm/tests/test_penalties.py +++ b/skglm/tests/test_penalties.py @@ -86,25 +86,24 @@ def test_slope_lasso(): def test_slope(): - # compare solutions with `pyslope`: https://github.com/jolars/pyslope + # compare solutions with `sortedl1`: https://github.com/jolars/sortedl1 try: - from slope.solvers import pgd_slope # noqa - from slope.utils import lambda_sequence # noqa + from sortedl1 import Slope as SlopeEst # noqa except ImportError: pytest.xfail( "This test requires slope to run.\n" - "https://github.com/jolars/pyslope") + "https://github.com/jolars/sortedl1") - q = 0.1 - alphas = lambda_sequence( - X, y, fit_intercept=False, reg=alpha / alpha_max, q=q) + # q = 0.1 + # alphas = lambda_sequence( + # X, y, fit_intercept=False, reg=alpha / alpha_max, q=q) + clf = SlopeEst(alpha=0.01, fit_intercept=False).fit(X, y) + alphas = clf.lambda_ ours = GeneralizedLinearEstimator( - penalty=SLOPE(alphas), + penalty=SLOPE(clf.alpha * alphas), solver=FISTA(max_iter=1000, tol=tol, opt_strategy="fixpoint"), ).fit(X, y) - pyslope_out = pgd_slope( - X, y, alphas, fit_intercept=False, max_it=1000, gap_tol=tol) - np.testing.assert_allclose(ours.coef_, pyslope_out["beta"], rtol=1e-5) + np.testing.assert_allclose(ours.coef_, clf.coef_, rtol=1e-5) @pytest.mark.parametrize("fit_intercept", [True, False]) From a334d2a7c2ba12bd2b540dbf6e98812ad8125430 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Mon, 7 Apr 2025 16:10:38 +0200 Subject: [PATCH 41/52] MNT move citation up in readme (#284) --- README.md | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index e845dfeec..bf8c350ca 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ skglm logo + ## A fast ⚡ and modular ⚒️ scikit-learn replacement for sparse GLMs ![build](https://github.com/scikit-learn-contrib/skglm/workflows/pytest/badge.svg) @@ -19,6 +20,24 @@ You get to choose from ``skglm``'s already-made estimators or **customize your o Excited to have a tour on ``skglm`` [documentation](https://contrib.scikit-learn.org/skglm/)? +# Cite + +``skglm`` is the result of perseverant research. It is licensed under [BSD 3-Clause](https://github.com/scikit-learn-contrib/skglm/blob/main/LICENSE). You are free to use it and if you do so, please cite + +```bibtex +@inproceedings{skglm, + title = {Beyond L1: Faster and better sparse models with skglm}, + author = {Q. Bertrand and Q. Klopfenstein and P.-A. Bannier and G. Gidel and M. Massias}, + booktitle = {NeurIPS}, + year = {2022}, +} + +@article{moufad2023skglm, + title={skglm: improving scikit-learn for regularized Generalized Linear Models}, + author={Moufad, Badr and Bannier, Pierre-Antoine and Bertrand, Quentin and Klopfenstein, Quentin and Massias, Mathurin}, + year={2023} +} +``` # Why ``skglm``? @@ -108,18 +127,7 @@ You can also take our tutorial to learn how to create your own datafit and penal - **pull request**: you may have fixed a bug, added a features, or even fixed a small typo in the documentation, ... you can submit a [pull request](https://github.com/scikit-learn-contrib/skglm/pulls) and we will reach out to you asap. -# Cite -``skglm`` is the result of perseverant research. It is licensed under [BSD 3-Clause](https://github.com/scikit-learn-contrib/skglm/blob/main/LICENSE). You are free to use it and if you do so, please cite - -```bibtex -@inproceedings{skglm, - title = {Beyond L1: Faster and better sparse models with skglm}, - author = {Q. Bertrand and Q. Klopfenstein and P.-A. Bannier and G. Gidel and M. Massias}, - booktitle = {NeurIPS}, - year = {2022}, -} -``` # Useful links From 554a93c59dec55616e92d2b68d632668dafa1565 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Tue, 8 Apr 2025 17:49:21 +0200 Subject: [PATCH 42/52] REL release 0.4 (#285) --- doc/changes/0.4.rst | 2 +- pyproject.toml | 4 ++++ skglm/__init__.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/changes/0.4.rst b/doc/changes/0.4.rst index 3a998c4d0..165d23cdb 100644 --- a/doc/changes/0.4.rst +++ b/doc/changes/0.4.rst @@ -1,6 +1,6 @@ .. _changes_0_4: -Version 0.4 (in progress) +Version 0.4 (2023/04/08) ------------------------- - Add :ref:`GroupLasso Estimator ` (PR: :gh:`228`) - Add support and tutorial for positive coefficients to :ref:`Group Lasso Penalty ` (PR: :gh:`221`) diff --git a/pyproject.toml b/pyproject.toml index 34ec1a49f..a7529dbf9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,3 +55,7 @@ doc = [ "furo", "lifelines", ] + + +[tool.setuptools] +license-files = [] diff --git a/skglm/__init__.py b/skglm/__init__.py index d80de3c17..2348fcf9a 100644 --- a/skglm/__init__.py +++ b/skglm/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.4dev' +__version__ = '0.4' from skglm.estimators import ( # noqa F401 Lasso, WeightedLasso, ElasticNet, MCPRegression, MultiTaskLasso, LinearSVC, From 397b842268674f4892df424ac358293c2a82afc0 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Tue, 8 Apr 2025 17:52:43 +0200 Subject: [PATCH 43/52] MNT start dev of 0.5 version (#286) --- doc/changes/0.5.rst | 4 ++++ doc/changes/whats_new.rst | 2 ++ skglm/__init__.py | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 doc/changes/0.5.rst diff --git a/doc/changes/0.5.rst b/doc/changes/0.5.rst new file mode 100644 index 000000000..cf65cee72 --- /dev/null +++ b/doc/changes/0.5.rst @@ -0,0 +1,4 @@ +.. _changes_0_5: + +Version 0.5 (in progress) +------------------------- diff --git a/doc/changes/whats_new.rst b/doc/changes/whats_new.rst index c12324428..a3d29813b 100644 --- a/doc/changes/whats_new.rst +++ b/doc/changes/whats_new.rst @@ -5,6 +5,8 @@ What's new .. currentmodule:: skglm +.. include:: 0.5.rst + .. include:: 0.4.rst .. include:: 0.3.rst diff --git a/skglm/__init__.py b/skglm/__init__.py index 2348fcf9a..2ee5351b9 100644 --- a/skglm/__init__.py +++ b/skglm/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.4' +__version__ = '0.5dev' from skglm.estimators import ( # noqa F401 Lasso, WeightedLasso, ElasticNet, MCPRegression, MultiTaskLasso, LinearSVC, From 369294419cb3c745045533c22ed1d9fbb39d7705 Mon Sep 17 00:00:00 2001 From: Johan Larsson <13087841+jolars@users.noreply.github.com> Date: Thu, 10 Apr 2025 15:41:52 +0200 Subject: [PATCH 44/52] MNT add celer test dep (#288) Co-authored-by: mathurinm --- .github/workflows/main.yml | 1 - pyproject.toml | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ef8d7efa7..f8c41f0f0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -25,7 +25,6 @@ jobs: - name: Install other dependencies run: | conda install rpy2 -c conda-forge - pip install celer pip install statsmodels cvxopt pip install git+https://github.com/jolars/sortedl1.git # for testing Cox estimator diff --git a/pyproject.toml b/pyproject.toml index a7529dbf9..ff51d11d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ test = [ "flake8", "coverage", "numpydoc", + "celer", ] doc = [ From 8985aaa32918882540e6efcb4cf331721ed5f1ad Mon Sep 17 00:00:00 2001 From: Johan Larsson <13087841+jolars@users.noreply.github.com> Date: Thu, 10 Apr 2025 17:58:38 +0200 Subject: [PATCH 45/52] MNT add python version requirement (#289) Co-authored-by: mathurinm --- .circleci/config.yml | 9 ++++++++- pyproject.toml | 10 ++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index fe5ac1f18..9aec8baa9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2 jobs: build_docs: docker: - - image: circleci/python:3.8.1-buster + - image: cimg/python:3.10 steps: - checkout - run: @@ -32,6 +32,13 @@ jobs: keys: - pip-cache + # Install Xvfb and related dependencies + - run: + name: Install Xvfb and dependencies + command: | + sudo apt-get update + sudo apt-get install -y xvfb + - run: name: Spin up Xvfb command: | diff --git a/pyproject.toml b/pyproject.toml index ff51d11d8..7484b29d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,16 @@ dependencies = [ ] dynamic = ["version"] +requires-python = ">=3.9" + +classifiers = [ + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] [tool.setuptools.dynamic] version = {attr = "skglm.__version__"} From cb284b0b9a8fc9224e08756053bb251fd4146c6a Mon Sep 17 00:00:00 2001 From: Johan Larsson <13087841+jolars@users.noreply.github.com> Date: Mon, 14 Apr 2025 09:01:07 +0200 Subject: [PATCH 46/52] MNT fix failing slope test (#287) --- .github/workflows/main.yml | 2 +- skglm/tests/test_penalties.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f8c41f0f0..25b47b88d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -26,7 +26,7 @@ jobs: run: | conda install rpy2 -c conda-forge pip install statsmodels cvxopt - pip install git+https://github.com/jolars/sortedl1.git + pip install sortedl1 # for testing Cox estimator pip install lifelines pip install pandas diff --git a/skglm/tests/test_penalties.py b/skglm/tests/test_penalties.py index 0f1bddd92..ad562923f 100644 --- a/skglm/tests/test_penalties.py +++ b/skglm/tests/test_penalties.py @@ -97,13 +97,15 @@ def test_slope(): # q = 0.1 # alphas = lambda_sequence( # X, y, fit_intercept=False, reg=alpha / alpha_max, q=q) - clf = SlopeEst(alpha=0.01, fit_intercept=False).fit(X, y) + clf = SlopeEst( + alpha=0.01, fit_intercept=False, tol=1e-6 + ).fit(X, y) alphas = clf.lambda_ ours = GeneralizedLinearEstimator( penalty=SLOPE(clf.alpha * alphas), solver=FISTA(max_iter=1000, tol=tol, opt_strategy="fixpoint"), ).fit(X, y) - np.testing.assert_allclose(ours.coef_, clf.coef_, rtol=1e-5) + np.testing.assert_allclose(ours.coef_, np.squeeze(clf.coef_), rtol=1e-3) @pytest.mark.parametrize("fit_intercept", [True, False]) From 185c17f8eaabe6d16544481749b8679b28659687 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Tue, 15 Apr 2025 14:34:15 +0200 Subject: [PATCH 47/52] MNT add tags for sklearn (#293) --- .github/workflows/flake8.yml | 2 +- skglm/estimators.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/.github/workflows/flake8.yml b/.github/workflows/flake8.yml index 53f81fd5a..1cc8b843e 100644 --- a/.github/workflows/flake8.yml +++ b/.github/workflows/flake8.yml @@ -30,4 +30,4 @@ jobs: - name: Check doc style with pydocstyle run: | pip install pydocstyle - pydocstyle skglm --ignore='D100',D102,'D104','D107','D203','D213','D413' + pydocstyle skglm --ignore='D100',D102,'D104','D105','D107','D203','D213','D413', diff --git a/skglm/estimators.py b/skglm/estimators.py index 870b1cbe7..6197101cd 100644 --- a/skglm/estimators.py +++ b/skglm/estimators.py @@ -448,6 +448,11 @@ def path(self, X, y, alphas, coef_init=None, return_n_iter=True, **params): warm_start=self.warm_start, verbose=self.verbose) return solver.path(X, y, datafit, penalty, alphas, coef_init, return_n_iter) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class WeightedLasso(RegressorMixin, LinearModel): r"""WeightedLasso estimator based on Celer solver and primal extrapolation. @@ -611,6 +616,11 @@ def fit(self, X, y): warm_start=self.warm_start, verbose=self.verbose) return _glm_fit(X, y, self, Quadratic(), penalty, solver) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class ElasticNet(RegressorMixin, LinearModel): r"""Elastic net estimator. @@ -765,6 +775,11 @@ def fit(self, X, y): return _glm_fit(X, y, self, Quadratic(), L1_plus_L2(self.alpha, self.l1_ratio, self.positive), solver) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class MCPRegression(RegressorMixin, LinearModel): r"""Linear regression with MCP penalty estimator. @@ -953,6 +968,11 @@ def fit(self, X, y): warm_start=self.warm_start, verbose=self.verbose) return _glm_fit(X, y, self, Quadratic(), penalty, solver) + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class SparseLogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator): r"""Sparse Logistic regression estimator. @@ -1380,6 +1400,11 @@ def fit(self, X, y): return self + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.sparse = True + return tags + class MultiTaskLasso(RegressorMixin, LinearModel): r"""MultiTaskLasso estimator. From 7222e009f507c3dc219a890f50ab6ef304f57cb9 Mon Sep 17 00:00:00 2001 From: Badr MOUFAD <65614794+Badr-MOUFAD@users.noreply.github.com> Date: Tue, 15 Apr 2025 17:38:50 +0200 Subject: [PATCH 48/52] ENH - jit-compile datafits and penalties inside solver (#270) Co-authored-by: mathurinm --- examples/plot_sparse_recovery.py | 3 +- examples/plot_survival_analysis.py | 11 ++--- skglm/estimators.py | 47 ++++++++----------- skglm/experimental/reweighted.py | 4 +- skglm/experimental/sqrt_lasso.py | 5 +- .../tests/test_quantile_regression.py | 5 +- skglm/experimental/tests/test_sqrt_lasso.py | 5 +- skglm/solvers/base.py | 33 ++++++++++++- skglm/solvers/common.py | 3 +- skglm/solvers/fista.py | 2 + skglm/solvers/group_prox_newton.py | 7 +++ skglm/solvers/lbfgs.py | 19 +++++--- skglm/solvers/prox_newton.py | 6 +++ skglm/tests/test_datafits.py | 3 +- skglm/tests/test_estimators.py | 28 ++++++----- skglm/tests/test_fista.py | 18 +++---- skglm/tests/test_gram_solver.py | 3 +- skglm/tests/test_group.py | 16 ------- skglm/tests/test_lbfgs_solver.py | 30 +++++++----- skglm/tests/test_prox_newton.py | 5 +- skglm/tests/test_validation.py | 31 +++++------- skglm/utils/jit_compilation.py | 4 +- 22 files changed, 150 insertions(+), 138 deletions(-) diff --git a/examples/plot_sparse_recovery.py b/examples/plot_sparse_recovery.py index a8439a4dc..a2818049e 100644 --- a/examples/plot_sparse_recovery.py +++ b/examples/plot_sparse_recovery.py @@ -18,7 +18,6 @@ from skglm.utils.data import make_correlated_data from skglm.solvers import AndersonCD from skglm.datafits import Quadratic -from skglm.utils.jit_compilation import compiled_clone from skglm.penalties import L1, MCPenalty, L0_5, L2_3, SCAD cmap = plt.get_cmap('tab10') @@ -74,7 +73,7 @@ for idx, estimator in enumerate(penalties.keys()): print(f'Running {estimator}...') estimator_path = solver.path( - X, y, compiled_clone(datafit), compiled_clone(penalties[estimator]), + X, y, datafit, penalties[estimator], alphas=alphas) f1_temp = np.zeros(n_alphas) diff --git a/examples/plot_survival_analysis.py b/examples/plot_survival_analysis.py index dca110680..93e8c4347 100644 --- a/examples/plot_survival_analysis.py +++ b/examples/plot_survival_analysis.py @@ -15,6 +15,7 @@ # Let's first generate synthetic data on which to run the Cox estimator, # using ``skglm`` data utils. # + from skglm.utils.data import make_dummy_survival_data n_samples, n_features = 500, 100 @@ -59,18 +60,16 @@ # Todo so, we need to combine a Cox datafit and a :math:`\ell_1` penalty # and solve the resulting problem using skglm Proximal Newton solver ``ProxNewton``. # We set the intensity of the :math:`\ell_1` regularization to ``alpha=1e-2``. -from skglm.datafits import Cox from skglm.penalties import L1 +from skglm.datafits import Cox from skglm.solvers import ProxNewton -from skglm.utils.jit_compilation import compiled_clone - # regularization intensity alpha = 1e-2 # skglm internals: init datafit and penalty -datafit = compiled_clone(Cox()) -penalty = compiled_clone(L1(alpha)) +datafit = Cox() +penalty = L1(alpha) datafit.initialize(X, y) @@ -230,7 +229,7 @@ # We only need to pass in ``use_efron=True`` to the ``Cox`` datafit. # ensure using Efron estimate -datafit = compiled_clone(Cox(use_efron=True)) +datafit = Cox(use_efron=True) datafit.initialize(X, y) # solve the problem diff --git a/skglm/estimators.py b/skglm/estimators.py index 6197101cd..c161f5324 100644 --- a/skglm/estimators.py +++ b/skglm/estimators.py @@ -18,7 +18,6 @@ from sklearn.utils._param_validation import Interval, StrOptions from sklearn.multiclass import OneVsRestClassifier, check_classification_targets -from skglm.utils.jit_compilation import compiled_clone from skglm.solvers import AndersonCD, MultiTaskBCD, GroupBCD from skglm.datafits import (Cox, Quadratic, Logistic, QuadraticSVC, QuadraticMultiTask, QuadraticGroup,) @@ -102,12 +101,10 @@ def _glm_fit(X, y, model, datafit, penalty, solver): n_samples, n_features = X_.shape - penalty_jit = compiled_clone(penalty) - datafit_jit = compiled_clone(datafit, to_float32=X.dtype == np.float32) if issparse(X): - datafit_jit.initialize_sparse(X_.data, X_.indptr, X_.indices, y) + datafit.initialize_sparse(X_.data, X_.indptr, X_.indices, y) else: - datafit_jit.initialize(X_, y) + datafit.initialize(X_, y) # if model.warm_start and hasattr(model, 'coef_') and model.coef_ is not None: if solver.warm_start and hasattr(model, 'coef_') and model.coef_ is not None: @@ -136,7 +133,7 @@ def _glm_fit(X, y, model, datafit, penalty, solver): "The size of the WeightedL1 penalty weights should be n_features, " "expected %i, got %i." % (X_.shape[1], len(penalty.weights))) - coefs, p_obj, kkt = solver.solve(X_, y, datafit_jit, penalty_jit, w, Xw) + coefs, p_obj, kkt = solver.solve(X_, y, datafit, penalty, w, Xw) model.coef_, model.stop_crit_ = coefs[:n_features], kkt if y.ndim == 1: model.intercept_ = coefs[-1] if fit_intercept else 0. @@ -440,8 +437,8 @@ def path(self, X, y, alphas, coef_init=None, return_n_iter=True, **params): The number of iterations along the path. If return_n_iter is set to ``True``. """ - penalty = compiled_clone(L1(self.alpha, self.positive)) - datafit = compiled_clone(Quadratic(), to_float32=X.dtype == np.float32) + penalty = L1(self.alpha, self.positive) + datafit = Quadratic() solver = AndersonCD( self.max_iter, self.max_epochs, self.p0, tol=self.tol, ws_strategy=self.ws_strategy, fit_intercept=self.fit_intercept, @@ -581,8 +578,8 @@ def path(self, X, y, alphas, coef_init=None, return_n_iter=True, **params): raise ValueError("The number of weights must match the number of \ features. Got %s, expected %s." % ( len(weights), X.shape[1])) - penalty = compiled_clone(WeightedL1(self.alpha, weights, self.positive)) - datafit = compiled_clone(Quadratic(), to_float32=X.dtype == np.float32) + penalty = WeightedL1(self.alpha, weights, self.positive) + datafit = Quadratic() solver = AndersonCD( self.max_iter, self.max_epochs, self.p0, tol=self.tol, ws_strategy=self.ws_strategy, fit_intercept=self.fit_intercept, @@ -744,8 +741,8 @@ def path(self, X, y, alphas, coef_init=None, return_n_iter=True, **params): The number of iterations along the path. If return_n_iter is set to ``True``. """ - penalty = compiled_clone(L1_plus_L2(self.alpha, self.l1_ratio, self.positive)) - datafit = compiled_clone(Quadratic(), to_float32=X.dtype == np.float32) + penalty = L1_plus_L2(self.alpha, self.l1_ratio, self.positive) + datafit = Quadratic() solver = AndersonCD( self.max_iter, self.max_epochs, self.p0, tol=self.tol, ws_strategy=self.ws_strategy, fit_intercept=self.fit_intercept, @@ -917,19 +914,17 @@ def path(self, X, y, alphas, coef_init=None, return_n_iter=True, **params): ``True``. """ if self.weights is None: - penalty = compiled_clone( - MCPenalty(self.alpha, self.gamma, self.positive) - ) + penalty = MCPenalty(self.alpha, self.gamma, self.positive) else: if X.shape[1] != len(self.weights): raise ValueError( "The number of weights must match the number of features. " f"Got {len(self.weights)}, expected {X.shape[1]}." ) - penalty = compiled_clone( - WeightedMCPenalty(self.alpha, self.gamma, self.weights, self.positive) - ) - datafit = compiled_clone(Quadratic(), to_float32=X.dtype == np.float32) + penalty = WeightedMCPenalty( + self.alpha, self.gamma, self.weights, self.positive) + + datafit = Quadratic() solver = AndersonCD( self.max_iter, self.max_epochs, self.p0, tol=self.tol, ws_strategy=self.ws_strategy, fit_intercept=self.fit_intercept, @@ -1369,10 +1364,6 @@ def fit(self, X, y): else: penalty = L2(self.alpha) - # skglm internal: JIT compile classes - datafit = compiled_clone(datafit) - penalty = compiled_clone(penalty) - # init solver if self.l1_ratio == 0.: solver = LBFGS(max_iter=self.max_iter, tol=self.tol, verbose=self.verbose) @@ -1518,14 +1509,14 @@ def fit(self, X, Y): if not self.warm_start or not hasattr(self, "coef_"): self.coef_ = None - datafit_jit = compiled_clone(QuadraticMultiTask(), X.dtype == np.float32) - penalty_jit = compiled_clone(L2_1(self.alpha), X.dtype == np.float32) + datafit = QuadraticMultiTask() + penalty = L2_1(self.alpha) solver = MultiTaskBCD( self.max_iter, self.max_epochs, self.p0, tol=self.tol, ws_strategy=self.ws_strategy, fit_intercept=self.fit_intercept, warm_start=self.warm_start, verbose=self.verbose) - W, obj_out, kkt = solver.solve(X, Y, datafit_jit, penalty_jit) + W, obj_out, kkt = solver.solve(X, Y, datafit, penalty) self.coef_ = W[:X.shape[1], :].T self.intercept_ = self.fit_intercept * W[-1, :] @@ -1573,8 +1564,8 @@ def path(self, X, Y, alphas, coef_init=None, return_n_iter=False, **params): The number of iterations along the path. If return_n_iter is set to ``True``. """ - datafit = compiled_clone(QuadraticMultiTask(), to_float32=X.dtype == np.float32) - penalty = compiled_clone(L2_1(self.alpha)) + datafit = QuadraticMultiTask() + penalty = L2_1(self.alpha) solver = MultiTaskBCD( self.max_iter, self.max_epochs, self.p0, tol=self.tol, ws_strategy=self.ws_strategy, fit_intercept=self.fit_intercept, diff --git a/skglm/experimental/reweighted.py b/skglm/experimental/reweighted.py index cf3d7dc75..64d33f906 100644 --- a/skglm/experimental/reweighted.py +++ b/skglm/experimental/reweighted.py @@ -69,9 +69,9 @@ def fit(self, X, y): f"penalty {self.penalty.__class__.__name__}") n_features = X.shape[1] - _penalty = compiled_clone(WeightedL1(self.penalty.alpha, np.ones(n_features))) - self.datafit = compiled_clone(self.datafit) + # we need to compile this as it is not passed to solver.solve: self.penalty = compiled_clone(self.penalty) + _penalty = WeightedL1(self.penalty.alpha, np.ones(n_features)) self.loss_history_ = [] diff --git a/skglm/experimental/sqrt_lasso.py b/skglm/experimental/sqrt_lasso.py index 55ba908ae..573fa6d81 100644 --- a/skglm/experimental/sqrt_lasso.py +++ b/skglm/experimental/sqrt_lasso.py @@ -6,7 +6,6 @@ from skglm.penalties import L1 from skglm.utils.prox_funcs import ST_vec, proj_L2ball, BST -from skglm.utils.jit_compilation import compiled_clone from skglm.datafits.base import BaseDatafit from skglm.solvers.prox_newton import ProxNewton @@ -186,8 +185,8 @@ def path(self, X, y, alphas=None, eps=1e-3, n_alphas=10): alphas = np.sort(alphas)[::-1] n_features = X.shape[1] - sqrt_quadratic = compiled_clone(SqrtQuadratic()) - l1_penalty = compiled_clone(L1(1.)) # alpha is set along the path + sqrt_quadratic = SqrtQuadratic() + l1_penalty = L1(1.) # alpha is set along the path coefs = np.zeros((n_alphas, n_features + self.fit_intercept)) diff --git a/skglm/experimental/tests/test_quantile_regression.py b/skglm/experimental/tests/test_quantile_regression.py index f4d1aa914..b2d685625 100644 --- a/skglm/experimental/tests/test_quantile_regression.py +++ b/skglm/experimental/tests/test_quantile_regression.py @@ -6,7 +6,6 @@ from skglm import GeneralizedLinearEstimator from skglm.experimental.pdcd_ws import PDCD_WS from skglm.experimental.quantile_regression import Pinball -from skglm.utils.jit_compilation import compiled_clone from skglm.utils.data import make_correlated_data from sklearn.linear_model import QuantileRegressor @@ -23,8 +22,8 @@ def test_PDCD_WS(quantile_level): alpha_max = norm(X.T @ (np.sign(y)/2 + (quantile_level - 0.5)), ord=np.inf) alpha = alpha_max / 5 - datafit = compiled_clone(Pinball(quantile_level)) - penalty = compiled_clone(L1(alpha)) + datafit = Pinball(quantile_level) + penalty = L1(alpha) w = PDCD_WS( dual_init=np.sign(y)/2 + (quantile_level - 0.5) diff --git a/skglm/experimental/tests/test_sqrt_lasso.py b/skglm/experimental/tests/test_sqrt_lasso.py index b63619884..ee970ca41 100644 --- a/skglm/experimental/tests/test_sqrt_lasso.py +++ b/skglm/experimental/tests/test_sqrt_lasso.py @@ -7,7 +7,6 @@ from skglm.experimental.sqrt_lasso import (SqrtLasso, SqrtQuadratic, _chambolle_pock_sqrt) from skglm.experimental.pdcd_ws import PDCD_WS -from skglm.utils.jit_compilation import compiled_clone def test_alpha_max(): @@ -73,8 +72,8 @@ def test_PDCD_WS(with_dual_init): dual_init = y / norm(y) if with_dual_init else None - datafit = compiled_clone(SqrtQuadratic()) - penalty = compiled_clone(L1(alpha)) + datafit = SqrtQuadratic() + penalty = L1(alpha) w = PDCD_WS(dual_init=dual_init).solve(X, y, datafit, penalty)[0] clf = SqrtLasso(alpha=alpha, fit_intercept=False, tol=1e-12).fit(X, y) diff --git a/skglm/solvers/base.py b/skglm/solvers/base.py index 06a08a690..a550eaa73 100644 --- a/skglm/solvers/base.py +++ b/skglm/solvers/base.py @@ -1,5 +1,10 @@ +import warnings from abc import abstractmethod, ABC + +import numpy as np + from skglm.utils.validation import check_attrs +from skglm.utils.jit_compilation import compiled_clone class BaseSolver(ABC): @@ -89,8 +94,9 @@ def custom_checks(self, X, y, datafit, penalty): """ pass - def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None, - *, run_checks=True): + def solve( + self, X, y, datafit, penalty, w_init=None, Xw_init=None, *, run_checks=True + ): """Solve the optimization problem after validating its compatibility. A proxy of ``_solve`` method that implicitly ensures the compatibility @@ -101,6 +107,29 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None, >>> ... >>> coefs, obj_out, stop_crit = solver.solve(X, y, datafit, penalty) """ + # TODO check for datafit/penalty being jit-compiled properly + # instead of searching for a string + if "jitclass" in str(type(datafit)): + warnings.warn( + "Passing in a compiled datafit is deprecated since skglm v0.5 " + "Compilation is now done inside solver." + "This will raise an error starting skglm v0.6 onwards." + ) + elif datafit is not None: + datafit = compiled_clone(datafit, to_float32=X.dtype == np.float32) + + if "jitclass" in str(type(penalty)): + warnings.warn( + "Passing in a compiled penalty is deprecated since skglm v0.5 " + "Compilation is now done inside solver. " + "This will raise an error starting skglm v0.6 onwards." + ) + elif penalty is not None: + penalty = compiled_clone(penalty) + # TODO add support for bool spec in compiled_clone + # currently, doing so break the code + # penalty = compiled_clone(penalty, to_float32=X.dtype == np.float32) + if run_checks: self._validate(X, y, datafit, penalty) diff --git a/skglm/solvers/common.py b/skglm/solvers/common.py index cbdb58537..17b1e8a52 100644 --- a/skglm/solvers/common.py +++ b/skglm/solvers/common.py @@ -46,8 +46,7 @@ def dist_fix_point_cd(w, grad_ws, lipschitz_ws, datafit, penalty, ws): @njit -def dist_fix_point_bcd( - w, grad_ws, lipschitz_ws, datafit, penalty, ws): +def dist_fix_point_bcd(w, grad_ws, lipschitz_ws, datafit, penalty, ws): """Compute the violation of the fixed point iterate scheme for BCD. Parameters diff --git a/skglm/solvers/fista.py b/skglm/solvers/fista.py index e0933a111..ccd35db8c 100644 --- a/skglm/solvers/fista.py +++ b/skglm/solvers/fista.py @@ -51,10 +51,12 @@ def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): Xw = Xw_init.copy() if Xw_init is not None else np.zeros(n_samples) if X_is_sparse: + datafit.initialize_sparse(X.data, X.indptr, X.indices, y) lipschitz = datafit.get_global_lipschitz_sparse( X.data, X.indptr, X.indices, y ) else: + datafit.initialize(X, y) lipschitz = datafit.get_global_lipschitz(X, y) for n_iter in range(self.max_iter): diff --git a/skglm/solvers/group_prox_newton.py b/skglm/solvers/group_prox_newton.py index 1492651c3..d717e8fba 100644 --- a/skglm/solvers/group_prox_newton.py +++ b/skglm/solvers/group_prox_newton.py @@ -69,6 +69,13 @@ def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): stop_crit = 0. p_objs_out = [] + # TODO: to be isolated in a seperated method + is_sparse = issparse(X) + if is_sparse: + datafit.initialize_sparse(X.data, X.indptr, X.indices, y) + else: + datafit.initialize(X, y) + for iter in range(self.max_iter): grad = _construct_grad(X, y, w, Xw, datafit, all_groups) diff --git a/skglm/solvers/lbfgs.py b/skglm/solvers/lbfgs.py index 438c8b97b..854be64e1 100644 --- a/skglm/solvers/lbfgs.py +++ b/skglm/solvers/lbfgs.py @@ -38,6 +38,13 @@ def __init__(self, max_iter=50, tol=1e-4, verbose=False): def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): + # TODO: to be isolated in a seperated method + is_sparse = issparse(X) + if is_sparse: + datafit.initialize_sparse(X.data, X.indptr, X.indices, y) + else: + datafit.initialize(X, y) + def objective(w): Xw = X @ w datafit_value = datafit.value(y, w, Xw) @@ -70,8 +77,7 @@ def callback_post_iter(w_k): it = len(p_objs_out) print( - f"Iteration {it}: {p_obj:.10f}, " - f"stopping crit: {stop_crit:.2e}" + f"Iteration {it}: {p_obj:.10f}, " f"stopping crit: {stop_crit:.2e}" ) n_features = X.shape[1] @@ -87,7 +93,7 @@ def callback_post_iter(w_k): options=dict( maxiter=self.max_iter, gtol=self.tol, - ftol=0. # set ftol=0. to control convergence using only gtol + ftol=0.0, # set ftol=0. to control convergence using only gtol ), callback=callback_post_iter, ) @@ -97,7 +103,7 @@ def callback_post_iter(w_k): f"`LBFGS` did not converge for tol={self.tol:.3e} " f"and max_iter={self.max_iter}.\n" "Consider increasing `max_iter` and/or `tol`.", - category=ConvergenceWarning + category=ConvergenceWarning, ) w = result.x @@ -110,7 +116,8 @@ def callback_post_iter(w_k): def custom_checks(self, X, y, datafit, penalty): # check datafit support sparse data check_attrs( - datafit, solver=self, + datafit, + solver=self, required_attr=self._datafit_required_attr, - support_sparse=issparse(X) + support_sparse=issparse(X), ) diff --git a/skglm/solvers/prox_newton.py b/skglm/solvers/prox_newton.py index 76867c7d8..baf055238 100644 --- a/skglm/solvers/prox_newton.py +++ b/skglm/solvers/prox_newton.py @@ -85,6 +85,12 @@ def _solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None): if is_sparse: X_bundles = (X.data, X.indptr, X.indices) + # TODO: to be isolated in a seperated method + if is_sparse: + datafit.initialize_sparse(X.data, X.indptr, X.indices, y) + else: + datafit.initialize(X, y) + if self.ws_strategy == "fixpoint": X_square = X.multiply(X) if is_sparse else X ** 2 diff --git a/skglm/tests/test_datafits.py b/skglm/tests/test_datafits.py index cdd77df47..18d652216 100644 --- a/skglm/tests/test_datafits.py +++ b/skglm/tests/test_datafits.py @@ -11,7 +11,6 @@ from skglm.solvers import AndersonCD, ProxNewton from skglm import GeneralizedLinearEstimator from skglm.utils.data import make_correlated_data -from skglm.utils.jit_compilation import compiled_clone from skglm.utils.data import make_dummy_survival_data @@ -132,7 +131,7 @@ def test_cox(use_efron): Xw = X @ w # check datafit - cox_df = compiled_clone(Cox(use_efron)) + cox_df = Cox(use_efron) cox_df.initialize(X, y) cox_df.value(y, w, Xw) diff --git a/skglm/tests/test_estimators.py b/skglm/tests/test_estimators.py index ec7536f19..954ca2256 100644 --- a/skglm/tests/test_estimators.py +++ b/skglm/tests/test_estimators.py @@ -26,7 +26,6 @@ from skglm.datafits import Logistic, Quadratic, QuadraticSVC, QuadraticMultiTask, Cox from skglm.penalties import L1, IndicatorBox, L1_plus_L2, MCPenalty, WeightedL1, SLOPE from skglm.solvers import AndersonCD, FISTA, ProxNewton -from skglm.utils.jit_compilation import compiled_clone n_samples = 50 n_tasks = 9 @@ -175,8 +174,10 @@ def test_mtl_path(): @pytest.mark.parametrize("use_efron, use_float_32", - product([True, False], [True, False])) + # product([True, False], [True, False])) + product([True, False], [False])) def test_CoxEstimator(use_efron, use_float_32): + # TODO: fix test for float_32, same for CoxEstimator_sparse try: from lifelines import CoxPHFitter except ModuleNotFoundError: @@ -187,7 +188,7 @@ def test_CoxEstimator(use_efron, use_float_32): reg = 1e-2 # norms of solutions differ when n_features > n_samples - n_samples, n_features = 100, 30 + n_samples, n_features = 50, 15 random_state = 1265 X, y = make_dummy_survival_data(n_samples, n_features, normalize=True, @@ -203,8 +204,8 @@ def test_CoxEstimator(use_efron, use_float_32): alpha = reg * alpha_max # fit Cox using ProxNewton solver - datafit = compiled_clone(Cox(use_efron)) - penalty = compiled_clone(L1(alpha)) + datafit = Cox(use_efron) + penalty = L1(alpha) datafit.initialize(X, y) @@ -232,10 +233,11 @@ def test_CoxEstimator(use_efron, use_float_32): @pytest.mark.parametrize("use_efron, use_float_32", - product([True, False], [True, False])) + # product([True, False], [True, False])) + product([True, False], [True])) def test_CoxEstimator_sparse(use_efron, use_float_32): reg = 1e-2 - n_samples, n_features = 100, 30 + n_samples, n_features = 50, 15 X_density, random_state = 0.5, 1265 X, y = make_dummy_survival_data(n_samples, n_features, X_density=X_density, @@ -251,8 +253,8 @@ def test_CoxEstimator_sparse(use_efron, use_float_32): alpha = reg * alpha_max # fit Cox using ProxNewton solver - datafit = compiled_clone(Cox(use_efron)) - penalty = compiled_clone(L1(alpha)) + datafit = Cox(use_efron) + penalty = L1(alpha) datafit.initialize_sparse(X.data, X.indptr, X.indices, y) @@ -343,7 +345,7 @@ def test_equivalence_cox_SLOPE_cox_L1(use_efron, issparse): random_state=0) # init datafit - datafit = compiled_clone(Cox(use_efron)) + datafit = Cox(use_efron) if not issparse: datafit.initialize(X, y) @@ -357,7 +359,7 @@ def test_equivalence_cox_SLOPE_cox_L1(use_efron, issparse): # init penalty alpha = reg * alpha_max alphas = alpha * np.ones(n_features) - penalty = compiled_clone(SLOPE(alphas)) + penalty = SLOPE(alphas) solver = FISTA(opt_strategy="fixpoint", max_iter=10_000, tol=1e-9) @@ -378,7 +380,7 @@ def test_cox_SLOPE(use_efron): n_samples, n_features, with_ties=use_efron, random_state=0) # init datafit - datafit = compiled_clone(Cox(use_efron)) + datafit = Cox(use_efron) datafit.initialize(X, y) # compute alpha_max @@ -388,7 +390,7 @@ def test_cox_SLOPE(use_efron): # init penalty alpha = reg * alpha_ref alphas = alpha / np.arange(n_features + 1)[1:] - penalty = compiled_clone(SLOPE(alphas)) + penalty = SLOPE(alphas) solver = FISTA(opt_strategy="fixpoint", max_iter=10_000, tol=1e-9) diff --git a/skglm/tests/test_fista.py b/skglm/tests/test_fista.py index 04f9c1ea8..dc6ecb0ce 100644 --- a/skglm/tests/test_fista.py +++ b/skglm/tests/test_fista.py @@ -3,14 +3,13 @@ import numpy as np from numpy.linalg import norm -from scipy.sparse import csc_matrix, issparse +from scipy.sparse import csc_matrix -from skglm.penalties import L1, IndicatorBox +from skglm.penalties import L1 from skglm.solvers import FISTA, AndersonCD -from skglm.datafits import Quadratic, Logistic, QuadraticSVC +from skglm.datafits import Quadratic, Logistic from skglm.utils.data import make_correlated_data -from skglm.utils.jit_compilation import compiled_clone random_state = 113 @@ -32,17 +31,12 @@ @pytest.mark.parametrize("Datafit, Penalty", [ (Quadratic, L1), (Logistic, L1), - (QuadraticSVC, IndicatorBox), + # (QuadraticSVC, IndicatorBox), ]) def test_fista_solver(X, Datafit, Penalty): _y = y if isinstance(Datafit, Quadratic) else y_classif - datafit = compiled_clone(Datafit()) - _init = y @ X.T if isinstance(Datafit, QuadraticSVC) else X - if issparse(X): - datafit.initialize_sparse(_init.data, _init.indptr, _init.indices, _y) - else: - datafit.initialize(_init, _y) - penalty = compiled_clone(Penalty(alpha)) + datafit = Datafit() + penalty = Penalty(alpha) solver = FISTA(max_iter=1000, tol=tol) w_fista = solver.solve(X, _y, datafit, penalty)[0] diff --git a/skglm/tests/test_gram_solver.py b/skglm/tests/test_gram_solver.py index 669cc38a3..2a2d4dcd8 100644 --- a/skglm/tests/test_gram_solver.py +++ b/skglm/tests/test_gram_solver.py @@ -9,7 +9,6 @@ from skglm.solvers import GramCD from skglm.utils.data import make_correlated_data -from skglm.utils.jit_compilation import compiled_clone @pytest.mark.parametrize("rho, X_density, greedy_cd", @@ -23,7 +22,7 @@ def test_vs_lasso_sklearn(rho, X_density, greedy_cd): sk_lasso = Lasso(alpha, fit_intercept=False, tol=1e-9) sk_lasso.fit(X, y) - l1_penalty = compiled_clone(L1(alpha)) + l1_penalty = L1(alpha) w = GramCD(tol=1e-9, max_iter=1000, greedy_cd=greedy_cd).solve( X, y, None, l1_penalty)[0] np.testing.assert_allclose(w, sk_lasso.coef_.flatten(), rtol=1e-7, atol=1e-7) diff --git a/skglm/tests/test_group.py b/skglm/tests/test_group.py index 6ec839466..4b052ab81 100644 --- a/skglm/tests/test_group.py +++ b/skglm/tests/test_group.py @@ -14,7 +14,6 @@ from skglm.solvers import GroupBCD, GroupProxNewton from skglm.utils.anderson import AndersonAcceleration -from skglm.utils.jit_compilation import compiled_clone from skglm.utils.data import (make_correlated_data, grp_converter, _alpha_max_group_lasso) @@ -71,9 +70,6 @@ def test_alpha_max(n_groups, n_features, shuffle): alpha=alpha_max, grp_ptr=grp_ptr, grp_indices=grp_indices, weights=weights) - # compile classes - quad_group = compiled_clone(quad_group, to_float32=X.dtype == np.float32) - group_penalty = compiled_clone(group_penalty) w = GroupBCD(tol=1e-12).solve(X, y, quad_group, group_penalty)[0] np.testing.assert_allclose(norm(w), 0, atol=1e-14) @@ -96,9 +92,6 @@ def test_equivalence_lasso(positive): alpha=alpha, grp_ptr=grp_ptr, grp_indices=grp_indices, weights=weights, positive=positive) - # compile classes - quad_group = compiled_clone(quad_group, to_float32=X.dtype == np.float32) - group_penalty = compiled_clone(group_penalty) w = GroupBCD(tol=1e-12).solve(X, y, quad_group, group_penalty)[0] celer_lasso = Lasso( @@ -126,9 +119,6 @@ def test_vs_celer_grouplasso(n_groups, n_features, shuffle): alpha=alpha, grp_ptr=grp_ptr, grp_indices=grp_indices, weights=weights) - # compile classes - quad_group = compiled_clone(quad_group, to_float32=X.dtype == np.float32) - group_penalty = compiled_clone(group_penalty) w = GroupBCD(tol=1e-12).solve(X, y, quad_group, group_penalty)[0] model = GroupLasso(groups=groups, alpha=alpha, weights=weights, @@ -218,8 +208,6 @@ def test_intercept_grouplasso(): alpha=alpha, grp_ptr=grp_ptr, grp_indices=grp_indices, weights=weights) - quad_group = compiled_clone(quad_group, to_float32=X.dtype == np.float32) - group_penalty = compiled_clone(group_penalty) w = GroupBCD(fit_intercept=True, tol=1e-12).solve( X, y, quad_group, group_penalty)[0] model = GroupLasso(groups=groups, alpha=alpha, weights=weights, @@ -247,8 +235,6 @@ def test_equivalence_logreg(solver, rho): alpha=alpha, grp_ptr=grp_ptr, grp_indices=grp_indices, weights=weights) - group_logistic = compiled_clone(group_logistic, to_float32=X.dtype == np.float32) - group_penalty = compiled_clone(group_penalty) w = solver(tol=1e-12).solve(X, y, group_logistic, group_penalty)[0] sk_logreg = LogisticRegression(penalty='l1', C=1/(n_samples * alpha), @@ -280,8 +266,6 @@ def test_group_logreg(solver, n_groups, rho, fit_intercept): group_logistic = LogisticGroup(grp_ptr=grp_ptr, grp_indices=grp_indices) group_penalty = WeightedGroupL2(alpha, weights, grp_ptr, grp_indices) - group_logistic = compiled_clone(group_logistic, to_float32=X.dtype == np.float32) - group_penalty = compiled_clone(group_penalty) stop_crit = solver(tol=1e-12, fit_intercept=fit_intercept).solve( X, y, group_logistic, group_penalty)[2] diff --git a/skglm/tests/test_lbfgs_solver.py b/skglm/tests/test_lbfgs_solver.py index f62c9d082..878e8c7d5 100644 --- a/skglm/tests/test_lbfgs_solver.py +++ b/skglm/tests/test_lbfgs_solver.py @@ -8,29 +8,31 @@ from sklearn.linear_model import LogisticRegression -from skglm.utils.jit_compilation import compiled_clone from skglm.utils.data import make_correlated_data, make_dummy_survival_data @pytest.mark.parametrize("X_sparse", [True, False]) def test_lbfgs_L2_logreg(X_sparse): - reg = 1. - X_density = 1. if not X_sparse else 0.5 + reg = 1.0 + X_density = 1.0 if not X_sparse else 0.5 n_samples, n_features = 100, 50 X, y, _ = make_correlated_data( - n_samples, n_features, random_state=0, X_density=X_density, + n_samples, + n_features, + random_state=0, + X_density=X_density, ) y = np.sign(y) # fit L-BFGS - datafit = compiled_clone(Logistic()) - penalty = compiled_clone(L2(reg)) + datafit = Logistic() + penalty = L2(reg) w, *_ = LBFGS(tol=1e-12).solve(X, y, datafit, penalty) # fit scikit learn estimator = LogisticRegression( - penalty='l2', + penalty="l2", C=1 / (n_samples * reg), fit_intercept=False, tol=1e-12, @@ -49,16 +51,18 @@ def test_L2_Cox(use_efron): "Run `pip install lifelines`" ) - alpha = 10. + alpha = 10.0 n_samples, n_features = 100, 50 X, y = make_dummy_survival_data( - n_samples, n_features, normalize=True, - with_ties=use_efron, random_state=0) + n_samples, n_features, normalize=True, with_ties=use_efron, random_state=0 + ) - datafit = compiled_clone(Cox(use_efron)) - penalty = compiled_clone(L2(alpha)) + datafit = Cox(use_efron) + penalty = L2(alpha) + # XXX: intialize is needed here although it is done in LBFGS + # is used to evaluate the objective datafit.initialize(X, y) w, *_ = LBFGS().solve(X, y, datafit, penalty) @@ -66,7 +70,7 @@ def test_L2_Cox(use_efron): stacked_y_X = np.hstack((y, X)) df = pd.DataFrame(stacked_y_X) - estimator = CoxPHFitter(penalizer=alpha, l1_ratio=0.).fit( + estimator = CoxPHFitter(penalizer=alpha, l1_ratio=0.0).fit( df, duration_col=0, event_col=1 ) w_ll = estimator.params_.values diff --git a/skglm/tests/test_prox_newton.py b/skglm/tests/test_prox_newton.py index d5b10e0cd..66d2f9a11 100644 --- a/skglm/tests/test_prox_newton.py +++ b/skglm/tests/test_prox_newton.py @@ -6,7 +6,6 @@ from skglm.datafits import Logistic from skglm.solvers.prox_newton import ProxNewton -from skglm.utils.jit_compilation import compiled_clone from skglm.utils.data import make_correlated_data @@ -29,8 +28,8 @@ def test_pn_vs_sklearn(X_density, fit_intercept, ws_strategy): tol=1e-12, solver='saga', max_iter=1_000_000) sk_log_reg.fit(X, y) - log_datafit = compiled_clone(Logistic()) - l1_penalty = compiled_clone(L1(alpha)) + log_datafit = Logistic() + l1_penalty = L1(alpha) prox_solver = ProxNewton( fit_intercept=fit_intercept, tol=1e-12, ws_strategy=ws_strategy) w = prox_solver.solve(X, y, log_datafit, l1_penalty)[0] diff --git a/skglm/tests/test_validation.py b/skglm/tests/test_validation.py index 7e998bfb8..d9d1780c5 100644 --- a/skglm/tests/test_validation.py +++ b/skglm/tests/test_validation.py @@ -8,7 +8,6 @@ from skglm.utils.data import grp_converter from skglm.utils.data import make_correlated_data -from skglm.utils.jit_compilation import compiled_clone def test_datafit_penalty_solver_compatibility(): @@ -27,26 +26,26 @@ def test_datafit_penalty_solver_compatibility(): AttributeError, match="Missing `raw_grad` and `raw_hessian`" ): ProxNewton()._validate( - X, y, compiled_clone(Huber(1.)), compiled_clone(L1(1.)) + X, y, Huber(1.), L1(1.) ) with pytest.raises( AttributeError, match="Missing `get_global_lipschitz`" ): FISTA()._validate( - X, y, compiled_clone(Poisson()), compiled_clone(L1(1.)) + X, y, Poisson(), L1(1.) ) with pytest.raises( AttributeError, match="Missing `get_global_lipschitz`" ): FISTA()._validate( - X, y, compiled_clone(Poisson()), compiled_clone(L1(1.)) + X, y, Poisson(), L1(1.) ) # check Gram Solver with pytest.raises( AttributeError, match="`GramCD` supports only `Quadratic` datafit" ): GramCD()._validate( - X, y, compiled_clone(Poisson()), compiled_clone(L1(1.)) + X, y, Poisson(), L1(1.) ) # check working set strategy subdiff with pytest.raises( @@ -54,11 +53,9 @@ def test_datafit_penalty_solver_compatibility(): ): GroupBCD()._validate( X, y, - datafit=compiled_clone(QuadraticGroup(grp_ptr, grp_indices)), - penalty=compiled_clone( - WeightedL1GroupL2( - 1., weights_groups, weights_features, grp_ptr, grp_indices) - ) + datafit=QuadraticGroup(grp_ptr, grp_indices), + penalty=WeightedL1GroupL2( + 1., weights_groups, weights_features, grp_ptr, grp_indices) ) # checks for sparsity with pytest.raises( @@ -67,11 +64,9 @@ def test_datafit_penalty_solver_compatibility(): ): GroupProxNewton()._validate( X_sparse, y, - datafit=compiled_clone(QuadraticGroup(grp_ptr, grp_indices)), - penalty=compiled_clone( - WeightedL1GroupL2( - 1., weights_groups, weights_features, grp_ptr, grp_indices) - ) + datafit=QuadraticGroup(grp_ptr, grp_indices), + penalty=WeightedL1GroupL2( + 1., weights_groups, weights_features, grp_ptr, grp_indices) ) with pytest.raises( AttributeError, @@ -79,10 +74,8 @@ def test_datafit_penalty_solver_compatibility(): ): GroupBCD()._validate( X_sparse, y, - datafit=compiled_clone(LogisticGroup(grp_ptr, grp_indices)), - penalty=compiled_clone( - WeightedGroupL2(1., weights_groups, grp_ptr, grp_indices) - ) + datafit=LogisticGroup(grp_ptr, grp_indices), + penalty=WeightedGroupL2(1., weights_groups, grp_ptr, grp_indices) ) diff --git a/skglm/utils/jit_compilation.py b/skglm/utils/jit_compilation.py index 57ef01865..cf63e357e 100644 --- a/skglm/utils/jit_compilation.py +++ b/skglm/utils/jit_compilation.py @@ -29,7 +29,9 @@ def spec_to_float32(spec): else: dtype32 = dtype else: - raise ValueError(f"Unknown spec type {dtype}") + # raise ValueError(f"Unknown spec type {dtype}") + # bool types and others are not affected: + dtype32 = dtype spec32.append((name, dtype32)) return spec32 From 4629e5935436e5f886b55a308ed32a2be08bfa04 Mon Sep 17 00:00:00 2001 From: floriankozikowski Date: Tue, 22 Apr 2025 18:06:27 +0200 Subject: [PATCH 49/52] first try at unit test --- skglm/.DS_Store | Bin 0 -> 6148 bytes skglm/experimental/.DS_Store | Bin 0 -> 6148 bytes skglm/experimental/tests/test_sqrt_lasso.py | 28 ++++++++++++++++++++ 3 files changed, 28 insertions(+) create mode 100644 skglm/.DS_Store create mode 100644 skglm/experimental/.DS_Store diff --git a/skglm/.DS_Store b/skglm/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d9796921bd7680534bf25430ee4c7d5337f1987b GIT binary patch literal 6148 zcmeHK%Sr=55Ukb+0X^jCael!+SVH^)f51l~2+>GZ!Qbk-qG3l_lndj8aRuhWF z?aa3*hjodHQa}nEDsY<1nfL!|`VaH}AxSGKAO-%F0ybIPEarTr>aC-f^IqHNxAdK{ oHp&^I6%(TsbK$M{a<8uWGxxi~Au;I82c4)N0oO$)1^z;TFW6WcO8@`> literal 0 HcmV?d00001 diff --git a/skglm/experimental/.DS_Store b/skglm/experimental/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f062d0511a7867e9e02c3af4175aa77b902741da GIT binary patch literal 6148 zcmeH~Jr2S!425kd5)um|V-^m;4I)%dzy%N#CM1T!o}=^Zcxhoq6?&HJ7duIRzM-i_ zME9?5C$biiCEQfD7DlGX7xI*=^oaef>o4cW<8Eb{#ac_?xW;fkrYR&q0wh2JBtQZa zBH)KO&i~bfo{5h_0wgdE0``4KaMK)Gs`{q`!CL@mgR&dO-b+A}6`(n^R7C}*(LHEv zRc}MA?%Sa$*5%MrwQd)U=0oGoYF!LW>vqwE1g6=AfdoikL||U?wVnS5__zLlw1p`N zkief2(55}Md%RSftsk#v^;1-BT;Nb&jxhKMKw?Mn2JVLWWCLgpEmcv0@khWhFp$7c G3A_OZR}-KB literal 0 HcmV?d00001 diff --git a/skglm/experimental/tests/test_sqrt_lasso.py b/skglm/experimental/tests/test_sqrt_lasso.py index ee970ca41..c99332236 100644 --- a/skglm/experimental/tests/test_sqrt_lasso.py +++ b/skglm/experimental/tests/test_sqrt_lasso.py @@ -7,6 +7,7 @@ from skglm.experimental.sqrt_lasso import (SqrtLasso, SqrtQuadratic, _chambolle_pock_sqrt) from skglm.experimental.pdcd_ws import PDCD_WS +from skglm import Lasso def test_alpha_max(): @@ -80,5 +81,32 @@ def test_PDCD_WS(with_dual_init): np.testing.assert_allclose(clf.coef_, w, atol=1e-6) +def test_sqrt_lasso_with_intercept(): + np.random.seed(0) + X = np.random.randn(10, 20) + y = np.random.randn(10) + y += 1 + + n = len(y) + alpha_max = norm(X.T @ y, ord=np.inf) / n + alpha = alpha_max / 10 + + # Fit standard Lasso with intercept + lass = Lasso(alpha=alpha, fit_intercept=True, tol=1e-8).fit(X, y) + w_lass = lass.coef_ + assert norm(w_lass) > 0 + + scal = n / norm(y - lass.predict(X)) + + # Fit SqrtLasso with intercept + sqrt = SqrtLasso(alpha=alpha * scal, fit_intercept=True, tol=1e-8).fit(X, y) + + # Make sure intercept was learned + assert abs(sqrt.intercept_) > 1e-6 + + y_pred = sqrt.predict(X) + assert y_pred.shape == y.shape + + if __name__ == '__main__': pass From e7568bd96278f04c49b4ba30c86def1c5eade332 Mon Sep 17 00:00:00 2001 From: floriankozikowski Date: Tue, 22 Apr 2025 19:18:59 +0200 Subject: [PATCH 50/52] first try, add support for fit_intercept in sqrtLasso, TODOS: review if correct, clean up, pot. add support for sparse X (not sure if that works), enhance docstring --- .DS_Store | Bin 0 -> 6148 bytes skglm/experimental/sqrt_lasso.py | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..98053944a2099f7bfebe1b2395ebd88435c72cd4 GIT binary patch literal 6148 zcmeH~J&pn~427RrRzli_a?3OvfEz@JJpmU$y9f$E6lnV#ooB}l12r0r2wyw<3^dk@$ KG>E`U3ETjMT@!@> literal 0 HcmV?d00001 diff --git a/skglm/experimental/sqrt_lasso.py b/skglm/experimental/sqrt_lasso.py index 573fa6d81..61e0ccc36 100644 --- a/skglm/experimental/sqrt_lasso.py +++ b/skglm/experimental/sqrt_lasso.py @@ -106,7 +106,11 @@ class SqrtLasso(LinearModel, RegressorMixin): """ def __init__(self, alpha=1., max_iter=100, max_pn_iter=100, p0=10, +<<<<<<< HEAD tol=1e-4, verbose=0, fit_intercept=True): +======= + tol=1e-4, verbose=0, fit_intercept=False): +>>>>>>> 69bf74f (first try, add support for fit_intercept in sqrtLasso, TODOS: review if correct, clean up, pot. add support for sparse X (not sure if that works), enhance docstring) super().__init__() self.alpha = alpha self.max_iter = max_iter @@ -134,9 +138,26 @@ def fit(self, X, y): self : Fitted estimator. """ +<<<<<<< HEAD self.coef_ = self.path(X, y, alphas=[self.alpha])[1][0] if self.fit_intercept: self.intercept_ = self.coef_[-1] +======= + # self.coef_ = self.path(X, y, alphas=[self.alpha])[1][0] + if self.fit_intercept: + X_mean = X.mean(axis=0) + y_mean = y.mean() + X_centered = X - X_mean + y_centered = y - y_mean + else: + X_centered = X + y_centered = y + + self.coef_ = self.path(X_centered, y_centered, alphas=[self.alpha])[1][0] + + if self.fit_intercept: + self.intercept_ = y_mean - X_mean @ self.coef_ +>>>>>>> 69bf74f (first try, add support for fit_intercept in sqrtLasso, TODOS: review if correct, clean up, pot. add support for sparse X (not sure if that works), enhance docstring) else: self.intercept_ = 0. return self From 91a56089515e4a7056394726463ef71dc3e0d159 Mon Sep 17 00:00:00 2001 From: floriankozikowski Date: Tue, 22 Apr 2025 19:34:57 +0200 Subject: [PATCH 51/52] fix merge errors --- skglm/experimental/sqrt_lasso.py | 6 ------ skglm/experimental/tests/test_sqrt_lasso.py | 6 ++---- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/skglm/experimental/sqrt_lasso.py b/skglm/experimental/sqrt_lasso.py index b569be85d..dc3ae1c5a 100644 --- a/skglm/experimental/sqrt_lasso.py +++ b/skglm/experimental/sqrt_lasso.py @@ -106,13 +106,7 @@ class SqrtLasso(LinearModel, RegressorMixin): """ def __init__(self, alpha=1., max_iter=100, max_pn_iter=100, p0=10, -<< << << < HEAD tol=1e-4, verbose=0, fit_intercept=True): - - -== == == = - tol = 1e-4, verbose = 0, fit_intercept = False): ->>>>>> > 69bf74f (first try , add support for fit_intercept in sqrtLasso, TODOS: review if correct, clean up, pot. add support for sparse X (not sure if that works), enhance docstring) super().__init__() self.alpha = alpha self.max_iter = max_iter diff --git a/skglm/experimental/tests/test_sqrt_lasso.py b/skglm/experimental/tests/test_sqrt_lasso.py index 4168951ae..bdca12fc2 100644 --- a/skglm/experimental/tests/test_sqrt_lasso.py +++ b/skglm/experimental/tests/test_sqrt_lasso.py @@ -77,11 +77,9 @@ def test_PDCD_WS(with_dual_init): penalty = L1(alpha) w = PDCD_WS(dual_init=dual_init).solve(X, y, datafit, penalty)[0] -<<<<<<< HEAD + clf = SqrtLasso(alpha=alpha, fit_intercept=False, tol=1e-12).fit(X, y) -======= - clf = SqrtLasso(alpha=alpha, tol=1e-12).fit(X, y) ->>>>>>> origin/main + np.testing.assert_allclose(clf.coef_, w, atol=1e-6) From 791c8cdd9def426ab508f05e70e8509bf3d1495c Mon Sep 17 00:00:00 2001 From: floriankozikowski Date: Wed, 23 Apr 2025 10:33:24 +0200 Subject: [PATCH 52/52] fix pytest, should work now --- skglm/experimental/sqrt_lasso.py | 4 +++- skglm/experimental/tests/test_sqrt_lasso.py | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/skglm/experimental/sqrt_lasso.py b/skglm/experimental/sqrt_lasso.py index dc3ae1c5a..29f9fd9cc 100644 --- a/skglm/experimental/sqrt_lasso.py +++ b/skglm/experimental/sqrt_lasso.py @@ -107,6 +107,7 @@ class SqrtLasso(LinearModel, RegressorMixin): def __init__(self, alpha=1., max_iter=100, max_pn_iter=100, p0=10, tol=1e-4, verbose=0, fit_intercept=True): + super().__init__() self.alpha = alpha self.max_iter = max_iter @@ -147,7 +148,8 @@ def fit(self, X, y): self.coef_ = self.path(X_centered, y_centered, alphas=[self.alpha])[1][0] if self.fit_intercept: - self.intercept_ = y_mean - X_mean @ self.coef_ + self.intercept_ = y_mean - X_mean @ self.coef_[:-1] + self.coef_ = self.coef_[:-1] else: self.intercept_ = 0. return self diff --git a/skglm/experimental/tests/test_sqrt_lasso.py b/skglm/experimental/tests/test_sqrt_lasso.py index bdca12fc2..e67dd96e0 100644 --- a/skglm/experimental/tests/test_sqrt_lasso.py +++ b/skglm/experimental/tests/test_sqrt_lasso.py @@ -109,6 +109,24 @@ def test_sqrt_lasso_with_intercept(): y_pred = sqrt.predict(X) assert y_pred.shape == y.shape + # Check that coef_ and intercept_ are handled separately + assert sqrt.coef_.shape == (20,) + assert np.isscalar(sqrt.intercept_) + + # Confirm prediction matches manual computation + manual_pred = X @ sqrt.coef_ + sqrt.intercept_ + np.testing.assert_allclose(manual_pred, y_pred, rtol=1e-6) + + np.testing.assert_allclose( + sqrt.intercept_, y.mean() - X.mean(axis=0) @ sqrt.coef_, rtol=1e-6 + ) + + sqrt_no_intercept = SqrtLasso( + alpha=alpha * scal, fit_intercept=False, tol=1e-8).fit(X, y) + assert np.isscalar(sqrt_no_intercept.intercept_) + np.testing.assert_allclose(sqrt_no_intercept.predict( + X), X @ sqrt_no_intercept.coef_ + sqrt_no_intercept.intercept_) + if __name__ == '__main__': pass