"""
Progressive smoothing solver for non-smooth datafits (e.g., Pinball loss).

This is a meta-solver that gradually approximates a non-smooth loss with a
smooth version (e.g., Huber-smoothed Pinball), solves the problem at each
smoothing level with a smooth solver, and eventually switches to a
non-smooth solver (PDCD_WS) once the smoothing parameter is sufficiently
small.

References
----------
.. [1] Nesterov, Y.
       "Smooth minimization of non-smooth functions," Mathematical Programming, 2005.
       https://link.springer.com/article/10.1007/s10107-004-0552-5

.. [2] Beck, A. and Teboulle, M.
       "Smoothing and first order methods: A unified framework,"
       SIAM Journal on Optimization, 2012.
       https://epubs.siam.org/doi/10.1137/100818327
"""
| 20 | + |
| 21 | +import copy |
| 22 | +import warnings |
| 23 | +import numpy as np |
| 24 | +from scipy import sparse |
| 25 | + |
| 26 | +from skglm import GeneralizedLinearEstimator |
| 27 | +from skglm.penalties import L1 |
| 28 | +from skglm.datafits import Huber |
| 29 | +from skglm.experimental.pdcd_ws import PDCD_WS |
| 30 | +from skglm.experimental.quantile_regression import Pinball |
| 31 | +from skglm.solvers import LBFGS, FISTA |
| 32 | + |
| 33 | +# ---- ensure Huber and L1 get a gradient method for LBFGS ---- |
| 34 | + |
| 35 | + |
| 36 | +def _huber_gradient(self, X, y, Xw): |
| 37 | + n_samples = len(y) |
| 38 | + r = y - Xw |
| 39 | + δ = self.delta |
| 40 | + # derivative of Huber: r/δ in |r|≤δ, sign(r) outside |
| 41 | + dr = np.where(np.abs(r) <= δ, r / δ, np.sign(r)) |
| 42 | + return - X.T.dot(dr) / n_samples |
| 43 | + |
| 44 | + |
| 45 | +def _l1_gradient(self, w): |
| 46 | + # simple subgradient of ‖w‖₁ |
| 47 | + return self.alpha * np.sign(w) |
| 48 | + |
| 49 | + |
# Patch a `gradient` method onto Huber and L1 so that gradient-based smooth
# solvers (e.g. LBFGS) can call it.  The `hasattr` guards ensure that a
# native `gradient` implementation, if one ever ships upstream, is not
# overwritten by these fallbacks.
if not hasattr(Huber, 'gradient'):
    Huber.gradient = _huber_gradient

if not hasattr(L1, 'gradient'):
    L1.gradient = _l1_gradient
| 55 | + |
| 56 | + |
class ProgressiveSmoothingSolver:
    """Progressive smoothing (homotopy) meta-solver.

    This solver addresses convergence issues in the PDCD_WS solver with
    non-smooth datafits like Pinball (quantile regression) on large datasets
    (as discussed in GitHub issue #276).

    It works by progressively solving a sequence of smoothed problems with
    decreasing smoothing parameter, and finally solving the original non-smooth
    problem using PDCD_WS initialized with the smoothed solution.

    Parameters
    ----------
    smoothing_sequence : list or None, default=None
        Sequence of decreasing smoothing parameters (Huber delta values).
        If None, defaults to [1.0, 0.5, 0.2, 0.1, 0.05].

    quantile : float, default=0.5
        Desired quantile level between 0 and 1. When 0.5, uses symmetric Huber.
        Otherwise, uses QuantileHuber for asymmetric smoothing.

    alpha : float, default=0.1
        L1 regularization strength.

    smooth_solver : instance of BaseSolver, default=None
        Solver to use for smooth approximation stages.
        If None, uses FISTA(max_iter=2000, tol=1e-8).

    nonsmooth_solver : instance of BaseSolver, default=None
        Solver to use for final non-smooth problem.
        If None, uses PDCD_WS with appropriate settings.

    verbose : bool, default=False
        If True, prints progress information during fitting.

    Attributes
    ----------
    coef_ : ndarray of shape (n_features,)
        Parameter vector (w in the cost function formula).

    intercept_ : float
        Intercept term in decision function.

    stage_results_ : list
        Information about each smoothing stage including smoothing parameter,
        objective value, and coefficient norm.

    Examples
    --------
    >>> from skglm.experimental.progressive_smoothing import ProgressiveSmoothingSolver
    >>> import numpy as np
    >>> X = np.random.randn(1000, 10)
    >>> y = np.random.randn(1000)
    >>> solver = ProgressiveSmoothingSolver(quantile=0.8)
    >>> solver.fit(X, y)
    >>> print(solver.coef_)
    """

    def __init__(
        self,
        smoothing_sequence=None,
        quantile=0.5,
        alpha=0.1,
        smooth_solver=None,
        nonsmooth_solver=None,
        verbose=False,
    ):
        # Build and *extend* the smoothing sequence.
        # NOTE(review): an empty list is falsy, so passing [] silently falls
        # back to the default sequence — confirm this is intended.
        base_seq = smoothing_sequence or [1.0, 0.5, 0.2, 0.1, 0.05]

        # For asymmetric quantiles, extend the sequence more aggressively.
        # NOTE(review): this asymmetry threshold (0.1) differs from the 0.05
        # used in fit() to pick the asymmetric code path — confirm intentional.
        if abs(quantile - 0.5) > 0.1:  # If not close to median
            min_delta = 1e-4  # Go much further down for asymmetric quantiles
        else:
            min_delta = 1e-3  # Original minimum for symmetric case

        # If the user-provided sequence stops above min_delta, append finer
        # deltas on a geometric grid; the first grid point duplicates
        # base_seq[-1] and is dropped.
        if base_seq[-1] > min_delta:
            extra = np.geomspace(base_seq[-1], min_delta, num=5, endpoint=False)[1:]
            base_seq = base_seq + list(extra)
        self.smoothing_sequence = base_seq
        self.quantile = float(quantile)

        if not 0 < self.quantile < 1:
            raise ValueError("quantile must be between 0 and 1")

        self.alpha = float(alpha)
        # self.smooth_solver = smooth_solver or LBFGS(max_iter=500, tol=1e-6)
        # # LBFGS in skglm does not properly handle an intercept column ⇒ disable it
        # if isinstance(self.smooth_solver, LBFGS):
        #     self.smooth_solver.fit_intercept = False
        # Default smooth solver: FISTA (proximal gradient for Huber + ℓ₁).
        if smooth_solver is None:
            self.smooth_solver = FISTA(max_iter=2000, tol=1e-8)
        else:
            # If the caller passed LBFGS, override it with FISTA for the
            # L1-regularized smoothing stages (see warning message below).
            if isinstance(smooth_solver, LBFGS):
                # NOTE(review): redundant — `warnings` is already imported at
                # module level.
                import warnings
                warnings.warn(
                    "Overriding provided LBFGS: using FISTA "
                    "for L1‐regularized smoothing stages."
                )
                # Keep the caller's budget/tolerance when they are stricter
                # than the defaults.
                self.smooth_solver = FISTA(
                    max_iter=max(2000, smooth_solver.max_iter),
                    tol=min(1e-8, smooth_solver.tol),
                )
            else:
                self.smooth_solver = smooth_solver
        self.smooth_solver.fit_intercept = False
        # Ensure a warm_start flag exists on every solver.
        if not hasattr(self.smooth_solver, 'warm_start'):
            self.smooth_solver.warm_start = False
        self.nonsmooth_solver = nonsmooth_solver or PDCD_WS(
            max_iter=10000,
            max_epochs=5000,
            # p0=500,
            tol=1e-8,
            warm_start=True,
            verbose=verbose,
        )
        self.nonsmooth_solver.fit_intercept = True
        if not hasattr(self.nonsmooth_solver, 'warm_start'):
            self.nonsmooth_solver.warm_start = False
        self.verbose = verbose

        # Check if we need QuantileHuber for asymmetric quantiles.  The
        # import is deferred so the symmetric (quantile == 0.5) path works
        # even when quantile_huber is unavailable.
        if self.quantile != 0.5:
            try:
                from skglm.experimental.quantile_huber import QuantileHuber
                self._quantile_huber_cls = QuantileHuber
            except ImportError:
                raise ImportError(
                    "QuantileHuber class is required for quantile != 0.5. "
                    "Please ensure skglm.experimental.quantile_huber is available."
                )
        else:
            self._quantile_huber_cls = None

    # Optimal delta approach: Find the smoothing stage that gives the best quantile

    def fit(self, X, y):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Training data.

        y : array-like, shape (n_samples,)
            Target values.

        Returns
        -------
        self : ProgressiveSmoothingSolver
            The fitted solver.
        """
        # Initialize coefficients.
        n_features = X.shape[1]
        coef = np.zeros(n_features)
        intercept = 0.0
        is_sparse = sparse.issparse(X)

        # Track progress of each stage.
        stage_results = []

        # For asymmetric quantiles, track the best solution seen so far
        # (measured by empirical-quantile error, not by objective value).
        best_quantile_error = float('inf')
        best_coef = None
        best_intercept = None
        best_delta = None

        # Extended smoothing sequence (copy; do not mutate the attribute).
        extended_sequence = self.smoothing_sequence[:]
        # Add finer steps specifically to capture the optimal delta.
        if abs(self.quantile - 0.5) > 0.05:
            # Add a range around 0.1 where we saw good quantile accuracy.
            fine_deltas = [0.15, 0.12, 0.1, 0.08, 0.06, 0.04, 0.02, 0.01]
            extended_sequence = sorted(
                set(extended_sequence + fine_deltas), reverse=True)

        # Progressive smoothing stages: solve with decreasing delta.
        for stage, delta in enumerate(extended_sequence):
            if self.verbose:
                print(f"[ProgressiveSmoothing] Stage {stage+1}/{len(extended_sequence)}: "
                      f"delta = {delta:.3g}")

            # Create appropriate datafit based on quantile.
            if self.quantile == 0.5:
                datafit = Huber(delta=delta)
            else:
                datafit = self._quantile_huber_cls(delta=delta, quantile=self.quantile)

            # Create a modified copy of the solver so per-stage state does
            # not leak into the template held by self.smooth_solver.
            solver = copy.deepcopy(self.smooth_solver)

            # Safely set fit_intercept attribute if the solver lacks one.
            if not hasattr(solver, 'fit_intercept'):
                solver.fit_intercept = True

            # Create estimator for this smoothing level.
            est = GeneralizedLinearEstimator(
                datafit=datafit,
                penalty=L1(alpha=self.alpha),
                solver=solver,
            )

            # Warm start from the previous stage's solution.
            # NOTE(review): assigning coef_/intercept_ before fit() only
            # warm-starts if the underlying solver actually reads them
            # (warm_start=True) — confirm for the smooth solver used.
            if stage > 0:
                est.coef_ = coef
                est.intercept_ = intercept

            # Fit model at this smoothing level.
            est.fit(X, y)

            # Store results for the next stage's warm start.
            coef, intercept = est.coef_, est.intercept_

            # Check empirical quantile error for asymmetric quantiles.
            if abs(self.quantile - 0.5) > 0.05:
                y_pred = X @ coef if not is_sparse else X.dot(coef)
                residuals = y - y_pred
                # Fraction of samples below the fitted hyperplane; should
                # match self.quantile at the pinball-loss optimum.
                actual_quantile = np.sum(residuals > 0) / len(residuals)
                quantile_error = abs(actual_quantile - self.quantile)

                if self.verbose:
                    print(
                        f"  Actual quantile: {actual_quantile:.3f}, Error: {quantile_error:.3f}")

                # Update best solution if this stage is better.
                if quantile_error < best_quantile_error:
                    best_quantile_error = quantile_error
                    best_coef = coef.copy()
                    best_intercept = intercept
                    best_delta = delta

                    if self.verbose:
                        print(
                            f"  New best quantile at delta={delta:.3g}, error={quantile_error:.3f}")

            # Record stage information: smoothed objective = datafit + penalty.
            obj_value = datafit.value(
                y, coef, X @ coef if not is_sparse else X.dot(coef))
            obj_value += est.penalty.value(coef)

            stage_info = {
                'delta': delta,
                'obj_value': obj_value,
                'coef_norm': np.linalg.norm(coef),
            }

            # Add solver-specific info if available.
            if hasattr(solver, "n_iter_"):
                stage_info['n_iter'] = solver.n_iter_

            stage_results.append(stage_info)

        # Handle final stage based on quantile type.
        if abs(self.quantile - 0.5) > 0.05:  # Asymmetric quantile
            # Use the best smoothed solution (skip PDCD_WS entirely).
            self.coef_ = best_coef
            self.intercept_ = best_intercept

            if self.verbose:
                print(f"[Final] Using smoothed solution with delta={best_delta:.3g}")
                print(f"  Best quantile error: {best_quantile_error:.3f}")
        else:
            # For symmetric quantiles, still polish with PDCD_WS on the true
            # (non-smooth) Pinball loss.
            if self.verbose:
                print(
                    "[ProgressiveSmoothing] Final stage: non-smooth solver on true Pinball loss")

            final_solver = copy.deepcopy(self.nonsmooth_solver)
            # Ensure it has a fit_intercept attribute.
            if not hasattr(final_solver, 'fit_intercept'):
                final_solver.fit_intercept = True

            final_est = GeneralizedLinearEstimator(
                datafit=Pinball(self.quantile),
                penalty=L1(alpha=self.alpha),
                solver=final_solver,
            )

            # Initialize with the last smoothed solution if possible.
            final_est.coef_ = coef
            final_est.intercept_ = intercept

            # Fit with the non-smooth solver.
            final_est.fit(X, y)

            # Store final results.
            self.coef_ = final_est.coef_
            self.intercept_ = final_est.intercept_

        self.stage_results_ = stage_results

        return self

    def predict(self, X):
        """Predict using the linear model.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array, shape (n_samples,)
            Returns predicted values.
        """
        if not hasattr(self, "coef_"):
            raise ValueError("Model not fitted. Call fit before predict.")

        is_sparse = sparse.issparse(X)
        if is_sparse:
            return X.dot(self.coef_) + self.intercept_
        else:
            return X @ self.coef_ + self.intercept_
0 commit comments