
Commit 4b2344c

FEAT - GeneralizedLinearEstimatorCV for automatic CV of models with Elastic penalty (#311)
1 parent c40d4dc commit 4b2344c

File tree

6 files changed: +366 −0 lines changed


doc/api.rst

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ Estimators
    :toctree: generated/

    GeneralizedLinearEstimator
+   GeneralizedLinearEstimatorCV
    CoxEstimator
    ElasticNet
    GroupLasso

doc/changes/0.5.rst

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@ Version 0.5 (in progress)
 -------------------------
 - Add support for fitting an intercept in :ref:`SqrtLasso <skglm.experimental.sqrt_lasso.SqrtLasso>` (PR: :gh:`298`)
 - Add experimental :ref:`QuantileHuber <skglm.experimental.quantile_huber.QuantileHuber>` and :ref:`SmoothQuantileRegressor <skglm.experimental.quantile_huber.SmoothQuantileRegressor>` for quantile regression, and an example script (PR: :gh:`312`).
+- Add :ref:`GeneralizedLinearEstimatorCV <skglm.cv.GeneralizedLinearEstimatorCV>` for cross-validation with automatic parameter selection for L1 and elastic-net penalties (PR: :gh:`299`)
Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
"""
==============================================
Cross-Validation for Generalized Linear Models
==============================================

This example shows how to use cross-validation to automatically select
the optimal regularization parameter for generalized linear models.
"""

# Author: Florian Kozikowski

import numpy as np
import matplotlib.pyplot as plt

from skglm.utils.data import make_correlated_data
from skglm.cv import GeneralizedLinearEstimatorCV
from skglm.estimators import GeneralizedLinearEstimator
from skglm.datafits import Quadratic
from skglm.penalties import L1_plus_L2
from skglm.solvers import AndersonCD

# %%
# Generate correlated data with sparse ground truth
# --------------------------------------------------
X, y, true_coef = make_correlated_data(
    n_samples=150, n_features=300, random_state=42
)

# %%
# Fit model using cross-validation
# --------------------------------
# The CV estimator automatically finds the best regularization strength
estimator = GeneralizedLinearEstimatorCV(
    datafit=Quadratic(),
    penalty=L1_plus_L2(alpha=1.0, l1_ratio=0.5),
    solver=AndersonCD(max_iter=100),
    cv=5,
    n_alphas=50,
)
estimator.fit(X, y)

print(f"Best alpha: {estimator.alpha_:.3f}")
n_nonzero = np.sum(estimator.coef_ != 0)
n_true_nonzero = np.sum(true_coef != 0)
print(f"Non-zero coefficients: {n_nonzero} (true: {n_true_nonzero})")

# %%
# Visualize the cross-validation path
# -----------------------------------
# The plot shows how CV balances model complexity against prediction performance

# Get mean CV scores
mean_scores = np.mean(estimator.scores_path_, axis=1)
std_scores = np.std(estimator.scores_path_, axis=1)
best_idx = np.argmax(mean_scores)
best_alpha = estimator.alphas_[best_idx]

# Compute coefficient paths
coef_paths = []
for alpha in estimator.alphas_:
    est_temp = GeneralizedLinearEstimator(
        datafit=Quadratic(),
        penalty=L1_plus_L2(alpha=alpha, l1_ratio=0.5),
        solver=AndersonCD(max_iter=100)
    )
    est_temp.fit(X, y)
    coef_paths.append(est_temp.coef_)
coef_paths = np.array(coef_paths)

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 10), sharex=True)

ax1.semilogx(estimator.alphas_, -mean_scores, 'b-', linewidth=2, label='MSE')
ax1.fill_between(estimator.alphas_,
                 -mean_scores - std_scores,
                 -mean_scores + std_scores,
                 alpha=0.2, label='±1 std. dev.')
ax1.axvline(best_alpha, color='red', linestyle='--',
            label=f'Best alpha = {best_alpha:.2e}')
ax1.set_ylabel('MSE')
ax1.set_title('Cross-Validation Score vs. Regularization')
ax1.legend(loc='best')
ax1.grid(True, alpha=0.3)
ax1.set_xlabel('alpha')

for j in range(coef_paths.shape[1]):
    ax2.semilogx(estimator.alphas_, coef_paths[:, j], lw=1, alpha=0.3)
ax2.axvline(best_alpha, color='red', linestyle='--')
ax2.set_xlabel('alpha')
ax2.set_ylabel('Coefficient value')
ax2.set_title('Regularization Path of Coefficients')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# %% [markdown]
# Top panel: the mean CV MSE traces a U-shape, minimized at the chosen alpha
# for an optimal bias-variance tradeoff.
#
# Bottom panel: at this alpha, most coefficients are shrunk (many near zero),
# highlighting a sparse subset of key predictors.

# %%
# Visualize distance to true coefficients
# ----------------------------------------
# Compute how well different regularization strengths recover the true coefficients

distances = []
for alpha in estimator.alphas_:
    est_temp = GeneralizedLinearEstimator(
        datafit=Quadratic(),
        penalty=L1_plus_L2(alpha=alpha, l1_ratio=0.5),
        solver=AndersonCD(max_iter=100)
    )
    est_temp.fit(X, y)
    distances.append(np.linalg.norm(est_temp.coef_ - true_coef, ord=1))

plt.figure(figsize=(8, 5))
plt.loglog(estimator.alphas_, distances, 'b-', linewidth=2)
plt.axvline(estimator.alpha_, color='red', linestyle='--',
            label=f'CV-selected alpha = {estimator.alpha_:.3f}')
plt.xlabel('Alpha (regularization strength)')
plt.ylabel('L1 distance to true coefficients')
plt.title('Recovery of True Coefficients')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(
    f"Distance at CV-selected alpha: "
    f"{np.linalg.norm(estimator.coef_ - true_coef, ord=1):.3f}")

# %% [markdown]
# The U-shaped curve shows two failure modes: small alpha doesn't induce
# enough sparsity (keeping noisy/irrelevant features), while large alpha
# overshrinks all coefficients including the true signals. Cross-validation
# finds a good balance without needing access to the ground truth.

skglm/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@
     Lasso, WeightedLasso, ElasticNet, MCPRegression, MultiTaskLasso, LinearSVC,
     SparseLogisticRegression, GeneralizedLinearEstimator, CoxEstimator, GroupLasso,
 )
+from .cv import GeneralizedLinearEstimatorCV  # noqa F401
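With this re-export, the new estimator is importable from the package root as well as from its module. A quick illustrative check (not part of the commit):

# Both import paths should resolve to the same class object.
from skglm import GeneralizedLinearEstimatorCV as top_level
from skglm.cv import GeneralizedLinearEstimatorCV as from_module

assert top_level is from_module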

skglm/cv.py

Lines changed: 181 additions & 0 deletions
@@ -0,0 +1,181 @@
import numpy as np
from joblib import Parallel, delayed
from skglm.datafits import Logistic, QuadraticSVC
from skglm.estimators import GeneralizedLinearEstimator
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, mean_squared_error


class GeneralizedLinearEstimatorCV(GeneralizedLinearEstimator):
    """Cross-validated wrapper for GeneralizedLinearEstimator.

    This class performs cross-validated selection of the regularization parameter(s)
    for a generalized linear estimator, supporting both L1 and elastic-net penalties.

    Parameters
    ----------
    datafit : object
        Datafit (loss) function instance (e.g., Logistic, Quadratic).
    penalty : object
        Penalty instance with an 'alpha' parameter (and optionally 'l1_ratio').
    solver : object
        Solver instance to use for optimization.
    alphas : array-like of shape (n_alphas,), optional
        List of alpha values to try. If None, they are set automatically.
    l1_ratio : float or array-like, optional
        The ElasticNet mixing parameter(s), with 0 <= l1_ratio <= 1.
        Only used if the penalty supports 'l1_ratio'. If None, defaults to 1.0 (Lasso).
    cv : int, default=4
        Number of cross-validation folds.
    n_jobs : int, default=1
        Number of jobs to run in parallel for cross-validation.
    random_state : int or None, default=None
        Random seed for cross-validation splitting.
    eps : float, default=1e-3
        Ratio of minimum to maximum alpha if alphas are set automatically.
    n_alphas : int, default=100
        Number of alphas along the regularization path if alphas are set automatically.

    Attributes
    ----------
    alpha_ : float
        Best alpha found by cross-validation.
    l1_ratio_ : float or None
        Best l1_ratio found by cross-validation (if applicable).
    best_estimator_ : GeneralizedLinearEstimator
        Estimator fitted on the full data with the best parameters.
    coef_ : ndarray
        Coefficients of the fitted model.
    intercept_ : float or ndarray
        Intercept of the fitted model.
    alphas_ : ndarray
        Array of alphas used in the search.
    scores_path_ : ndarray
        Cross-validation scores for each parameter combination.
    n_iter_ : int or None
        Number of iterations run by the solver (if available).
    n_features_in_ : int or None
        Number of features seen during fit.
    feature_names_in_ : ndarray or None
        Names of features seen during fit.
    """

    def __init__(self, datafit, penalty, solver, alphas=None, l1_ratio=None,
                 cv=4, n_jobs=1, random_state=None,
                 eps=1e-3, n_alphas=100):
        super().__init__(datafit=datafit, penalty=penalty, solver=solver)
        self.alphas = alphas
        self.l1_ratio = l1_ratio
        self.cv = cv
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.eps = eps
        self.n_alphas = n_alphas

    def _score(self, y_true, y_pred):
        """Compute the performance score (higher is better)."""
        if isinstance(self.datafit, (Logistic, QuadraticSVC)):
            return accuracy_score(y_true, y_pred)
        return -mean_squared_error(y_true, y_pred)

    def fit(self, X, y):
        """Fit the model using cross-validation."""
        if not hasattr(self.penalty, "alpha"):
            raise ValueError(
                "GeneralizedLinearEstimatorCV only supports penalties which "
                "expose an 'alpha' parameter."
            )
        n_samples, n_features = X.shape

        if self.alphas is not None:
            alphas = np.sort(self.alphas)[::-1]
        else:
            alpha_max = np.max(np.abs(X.T @ y)) / n_samples
            alphas = np.geomspace(
                alpha_max,
                alpha_max * self.eps,
                self.n_alphas
            )
        has_l1_ratio = hasattr(self.penalty, "l1_ratio")
        l1_ratios = [1.] if not has_l1_ratio else np.atleast_1d(
            self.l1_ratio if self.l1_ratio is not None else [1.])

        scores_path = np.empty((len(l1_ratios), len(alphas), self.cv))
        best_loss = -np.inf

        def _solve_fold(k, train, test, alpha, l1, w_init):
            pen_kwargs = {k: v for k, v in self.penalty.__dict__.items()
                          if k not in ("alpha", "l1_ratio")}
            if has_l1_ratio:
                pen_kwargs['l1_ratio'] = l1
            pen = type(self.penalty)(alpha=alpha, **pen_kwargs)

            est = GeneralizedLinearEstimator(
                datafit=self.datafit, penalty=pen, solver=self.solver
            )
            if w_init is not None:
                est.coef_ = w_init[0]
                est.intercept_ = w_init[1]
            est.fit(X[train], y[train])
            y_pred = est.predict(X[test])
            return est.coef_, est.intercept_, self._score(y[test], y_pred)

        for idx_ratio, l1_ratio in enumerate(l1_ratios):
            warm_start = [None] * self.cv

            for idx_alpha, alpha in enumerate(alphas):
                if isinstance(self.datafit, (Logistic, QuadraticSVC)):
                    kf = StratifiedKFold(n_splits=self.cv, shuffle=True,
                                         random_state=self.random_state)
                    split_iter = kf.split(np.arange(n_samples), y)
                else:
                    kf = KFold(n_splits=self.cv, shuffle=True,
                               random_state=self.random_state)
                    split_iter = kf.split(np.arange(n_samples))
                fold_result = Parallel(self.n_jobs)(
                    delayed(_solve_fold)(k, tr, te, alpha, l1_ratio, warm_start[k])
                    for k, (tr, te) in enumerate(split_iter)
                )

                for k, (coef_fold, intercept_fold, loss_fold) in enumerate(fold_result):
                    warm_start[k] = (coef_fold, intercept_fold)
                    scores_path[idx_ratio, idx_alpha, k] = loss_fold

                mean_loss = np.mean(scores_path[idx_ratio, idx_alpha])
                if mean_loss > best_loss:
                    best_loss = mean_loss
                    self.alpha_ = float(alpha)
                    self.l1_ratio_ = float(l1_ratio) if has_l1_ratio else None

        # Refit on full dataset
        pen_kwargs = {k: v for k, v in self.penalty.__dict__.items()
                      if k not in ("alpha", "l1_ratio")}
        if has_l1_ratio:
            pen_kwargs["l1_ratio"] = self.l1_ratio_
        best_penalty = type(self.penalty)(
            alpha=self.alpha_, **pen_kwargs
        )
        best_estimator = GeneralizedLinearEstimator(
            datafit=self.datafit,
            penalty=best_penalty,
            solver=self.solver
        )
        best_estimator.fit(X, y)
        self.best_estimator_ = best_estimator
        self.coef_ = best_estimator.coef_
        self.intercept_ = best_estimator.intercept_
        self.n_iter_ = getattr(best_estimator, "n_iter_", None)
        self.n_features_in_ = getattr(best_estimator, "n_features_in_", None)
        self.feature_names_in_ = getattr(best_estimator, "feature_names_in_", None)
        self.alphas_ = alphas
        self.scores_path_ = np.squeeze(scores_path)
        return self

    def predict(self, X):
        return self.best_estimator_.predict(X)

    def predict_proba(self, X):
        return self.best_estimator_.predict_proba(X)

    def score(self, X, y):
        return self.best_estimator_.score(X, y)
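The _score method above scores classification datafits (Logistic, QuadraticSVC) by accuracy and splits them with StratifiedKFold, a branch the shipped regression example does not exercise. A minimal sketch of that use, assuming skglm's Logistic datafit, L1 penalty, and AndersonCD solver work together as in the regression case, and that Logistic expects labels in {-1, 1}; the dataset and hyperparameter values are illustrative, not from the PR:

from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

from skglm.cv import GeneralizedLinearEstimatorCV
from skglm.datafits import Logistic
from skglm.penalties import L1
from skglm.solvers import AndersonCD

X, y = make_classification(n_samples=200, n_features=50, random_state=0)
y = 2 * y - 1  # assumption: map {0, 1} labels to the {-1, 1} convention

clf = GeneralizedLinearEstimatorCV(
    datafit=Logistic(),
    penalty=L1(alpha=1.0),           # alpha is replaced along the CV grid
    solver=AndersonCD(max_iter=100),
    cv=5,                            # StratifiedKFold is used for Logistic
    n_alphas=30,                     # grid from alpha_max down to eps * alpha_max
)
clf.fit(X, y)

print(f"Best alpha: {clf.alpha_:.4f}")
print(f"Train accuracy: {accuracy_score(y, clf.predict(X)):.3f}")

Since L1 has no l1_ratio attribute, the search here runs over alphas only and clf.l1_ratio_ is set to None.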

skglm/tests/test_cv.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV, KFold
from skglm.datafits import Quadratic
from skglm.penalties import L1_plus_L2
from skglm.solvers import AndersonCD
from skglm.cv import GeneralizedLinearEstimatorCV
import pytest


@pytest.mark.parametrize("n_samples,n_features,noise",
                         [(100, 10, 0.1), (100, 500, 0.2), (100, 500, 0.3)])
def test_elasticnet_cv_matches_sklearn(n_samples, n_features, noise):
    """Test GeneralizedLinearEstimatorCV matches sklearn GridSearchCV for ElasticNet."""
    seed = 42
    X, y = make_regression(n_samples=n_samples,
                           n_features=n_features, noise=noise, random_state=seed)

    n = X.shape[0]
    alpha_max = np.max(np.abs(X.T @ y)) / n
    alphas = alpha_max * np.array([1, 0.1, 0.01, 0.001])
    l1_ratios = np.array([0.2, 0.5, 0.8])
    cv = KFold(n_splits=5, shuffle=True, random_state=seed)

    sklearn_model = GridSearchCV(
        ElasticNet(max_iter=10000, tol=1e-8),
        {'alpha': alphas, 'l1_ratio': l1_ratios},
        cv=cv, scoring='neg_mean_squared_error', n_jobs=1
    ).fit(X, y)

    skglm_model = GeneralizedLinearEstimatorCV(
        Quadratic(), L1_plus_L2(0.1, 0.5), AndersonCD(max_iter=10000, tol=1e-8),
        alphas=alphas, l1_ratio=l1_ratios, cv=5, random_state=seed, n_jobs=1
    ).fit(X, y)

    np.testing.assert_equal(sklearn_model.best_params_['alpha'],
                            skglm_model.alpha_)
    np.testing.assert_equal(sklearn_model.best_params_['l1_ratio'],
                            skglm_model.l1_ratio_)
    np.testing.assert_allclose(sklearn_model.best_estimator_.coef_,
                               skglm_model.coef_.ravel(), rtol=1e-4, atol=1e-6)
    np.testing.assert_allclose(sklearn_model.best_estimator_.intercept_,
                               skglm_model.intercept_, rtol=1e-4, atol=1e-6)
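To exercise the new test module on its own, one option is to invoke pytest programmatically (a sketch, assuming pytest is installed and this is run from the repository root):

# Runs only the new CV tests; "-q" keeps the report terse.
import pytest

pytest.main(["-q", "skglm/tests/test_cv.py"])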
