"""
=======================================
Quantile Huber Loss Visualization
=======================================

This example demonstrates the smoothed approximation of the pinball loss
(Quantile Huber) used in quantile regression. It illustrates how the
smoothing parameter affects the loss function shape and optimization
performance for different quantile levels.
"""

# Authors: Florian Kozikowski

# %%
# Understanding the Quantile Huber Loss
# --------------------------------------
#
# The standard pinball loss used in quantile regression is not differentiable
# at zero, which causes issues for gradient-based optimization methods.
# The Quantile Huber loss provides a smoothed approximation by replacing the
# non-differentiable point with a quadratic region.

import time

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler

from skglm import GeneralizedLinearEstimator
from skglm.penalties import L1
from skglm.solvers import FISTA
from skglm.experimental.quantile_huber import QuantileHuber

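# %%
# For reference, the cell below writes out one common piecewise formulation of
# the quantile Huber loss directly in NumPy: quadratic for |r| <= delta, linear
# beyond it, with asymmetric weights τ and 1 - τ. This is only an illustrative
# sketch of the standard construction; the exact constants used inside skglm's
# ``QuantileHuber`` datafit may differ slightly.


def quantile_huber_reference(r, tau, delta):
    """Vectorized reference implementation of the quantile Huber loss."""
    r = np.asarray(r, dtype=float)
    # asymmetric quantile weight: tau above zero, 1 - tau below
    weight = np.where(r >= 0, tau, 1 - tau)
    quad = weight * r ** 2 / (2 * delta)      # smooth quadratic region
    lin = weight * (np.abs(r) - delta / 2)    # linear (pinball-like) tails
    return np.where(np.abs(r) <= delta, quad, lin)


print(quantile_huber_reference(np.array([-0.5, 0.0, 0.5]), tau=0.5, delta=0.2))
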
# %%
# First, let's visualize the Quantile Huber loss for median regression (τ=0.5)
# with different smoothing parameters.

fig1, ax1 = plt.subplots(figsize=(10, 5))

# Plot several smoothing levels
deltas = [1.0, 0.5, 0.2, 0.1, 0.01]
tau = 0.5
r = np.linspace(-3, 3, 1000)

# Calculate the non-smooth pinball loss for comparison
pinball_vals = np.where(r >= 0, tau * r, (tau - 1) * r)
ax1.plot(r, pinball_vals, 'k--', label='Pinball (non-smooth)')

# Plot losses for different deltas
for delta in deltas:
    loss_fn = QuantileHuber(delta=delta, quantile=tau)
    loss = np.array([loss_fn._loss_and_grad_scalar(ri)[0] for ri in r])
    ax1.plot(r, loss, '-', label=f'Quantile Huber (delta={delta})')

ax1.axvline(x=0, color='gray', linestyle='-', alpha=0.4)
ax1.axhline(y=0, color='gray', linestyle='-', alpha=0.4)
ax1.set_xlabel('Residual (r)')
ax1.set_ylabel('Loss')
ax1.set_title('Quantile Huber Loss (τ=0.5) with Different Smoothing Parameters')
ax1.legend()
ax1.grid(True, alpha=0.3)

# %%
# As we can see, smaller values of delta make the approximation closer to the
# original non-smooth pinball loss. Now, let's compare different quantile
# levels with the same smoothing parameter.

fig2, ax2 = plt.subplots(figsize=(10, 5))

# Plot for different quantile levels
taus = [0.1, 0.25, 0.5, 0.75, 0.9]
delta = 0.2

for tau in taus:
    loss_fn = QuantileHuber(delta=delta, quantile=tau)
    loss = np.array([loss_fn._loss_and_grad_scalar(ri)[0] for ri in r])
    ax2.plot(r, loss, '-', label=f'τ={tau}')

ax2.axvline(x=0, color='gray', linestyle='-', alpha=0.4)
ax2.axhline(y=0, color='gray', linestyle='-', alpha=0.4)
ax2.set_xlabel('Residual (r)')
ax2.set_ylabel('Loss')
ax2.set_title(f'Quantile Huber Loss for Different Quantile Levels (delta={delta})')
ax2.legend()
ax2.grid(True, alpha=0.3)

# %%
# The gradient of the smoothed loss
# ---------------------------------
#
# The primary advantage of the Quantile Huber loss is its continuous gradient.
# Let's visualize both the loss and its gradient for specific quantile levels.


def plot_quantile_huber(tau=0.5, delta=0.5, residual_range=(-2, 2), num_points=1000):
    """
    Plot the quantile Huber loss function and its gradient.

    This utility function generates plots of the quantile Huber loss and its
    gradient for visualization and documentation purposes. It's not part of the
    core implementation but is useful for understanding the behavior of the loss.

    Parameters
    ----------
    tau : float, default=0.5
        Quantile level between 0 and 1.

    delta : float, default=0.5
        Smoothing parameter controlling the width of the quadratic region.

    residual_range : tuple (min, max), default=(-2, 2)
        Range of residual values to plot.

    num_points : int, default=1000
        Number of points to plot.

    Returns
    -------
    fig : matplotlib figure
        Figure containing the plots.

    Example
    -------
    >>> fig = plot_quantile_huber(tau=0.8, delta=0.3)
    >>> fig.savefig('quantile_huber_tau_0.8.png')
    """
    loss_fn = QuantileHuber(delta=delta, quantile=tau)
    r = np.linspace(residual_range[0], residual_range[1], num_points)

    # Calculate loss and gradient for each residual value
    loss = np.zeros_like(r)
    grad = np.zeros_like(r)

    for i, ri in enumerate(r):
        loss_val, grad_val = loss_fn._loss_and_grad_scalar(ri)
        loss[i] = loss_val
        grad[i] = grad_val

    # For comparison, calculate the non-smooth pinball loss
    pinball_loss = np.where(r >= 0, tau * r, (tau - 1) * r)

    # Create figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Plot loss functions
    ax1.plot(r, loss, 'b-', label=f'Quantile Huber (δ={delta})')
    ax1.plot(r, pinball_loss, 'r--', label='Pinball (non-smooth)')

    # Add vertical lines at the transition points
    ax1.axvline(x=delta, color='gray', linestyle=':', alpha=0.7)
    ax1.axvline(x=-delta, color='gray', linestyle=':', alpha=0.7)
    ax1.axvline(x=0, color='gray', linestyle='-', alpha=0.3)

    # Add horizontal line at y=0
    ax1.axhline(y=0, color='gray', linestyle='-', alpha=0.3)

    # Add shaded regions to highlight the quadratic zone
    ax1.axvspan(-delta, delta, alpha=0.1, color='blue')

    ax1.set_xlabel('Residual (r)')
    ax1.set_ylabel('Loss')
    ax1.set_title(f'Quantile Huber Loss (τ={tau})')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Plot gradients
    ax2.plot(r, grad, 'b-', label=f'Quantile Huber Gradient (δ={delta})')

    # Add the non-smooth gradient for comparison
    pinball_grad = np.where(r > 0, tau, tau - 1)
    ax2.plot(r, pinball_grad, 'r--', label='Pinball Gradient (non-smooth)')

    # Add vertical lines at the transition points
    ax2.axvline(x=delta, color='gray', linestyle=':', alpha=0.7)
    ax2.axvline(x=-delta, color='gray', linestyle=':', alpha=0.7)
    ax2.axvline(x=0, color='gray', linestyle='-', alpha=0.3)

    # Add horizontal line at y=0
    ax2.axhline(y=0, color='gray', linestyle='-', alpha=0.3)

    # Add shaded regions to highlight the quadratic zone
    ax2.axvspan(-delta, delta, alpha=0.1, color='blue')

    ax2.set_xlabel('Residual (r)')
    ax2.set_ylabel('Gradient')
    ax2.set_title(f'Gradient of Quantile Huber Loss (τ={tau})')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig


# For the 75th percentile with delta=0.3
fig3 = plot_quantile_huber(tau=0.75, delta=0.3)
plt.suptitle('Quantile Huber Loss and Gradient for τ=0.75, delta=0.3', fontsize=14)
plt.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust for suptitle

# %%
# For the 50th percentile (median) with delta=0.3
fig4 = plot_quantile_huber(tau=0.5, delta=0.3)
plt.suptitle('Quantile Huber Loss and Gradient for τ=0.5, delta=0.3', fontsize=14)
plt.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust for suptitle

# %%
# For the 25th percentile with delta=0.3
fig5 = plot_quantile_huber(tau=0.25, delta=0.3)
plt.suptitle('Quantile Huber Loss and Gradient for τ=0.25, delta=0.3', fontsize=14)
plt.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust for suptitle

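# %%
# As a quick numerical sanity check of the smoothness claim, we can compare the
# gradient returned by ``_loss_and_grad_scalar`` (the same private helper used
# for the plots above) against a central finite-difference approximation of the
# loss. The maximum discrepancy should be tiny, even on a grid that includes the
# transition points ±delta.

check_fn = QuantileHuber(delta=0.3, quantile=0.75)
eps = 1e-6
r_check = np.linspace(-1.0, 1.0, 21)

analytic_grad = np.array(
    [check_fn._loss_and_grad_scalar(ri)[1] for ri in r_check])
numeric_grad = np.array(
    [(check_fn._loss_and_grad_scalar(ri + eps)[0]
      - check_fn._loss_and_grad_scalar(ri - eps)[0]) / (2 * eps)
     for ri in r_check])

print("max gradient discrepancy:", np.max(np.abs(analytic_grad - numeric_grad)))
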
# %%
# Performance impact of smoothing parameter
# -----------------------------------------
#
# The smoothing parameter delta strongly affects optimization performance.
# Let's examine how it impacts convergence speed and solution quality.

# Generate synthetic data
X, y = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=42)
X = StandardScaler().fit_transform(X)

# Set up the experiment
delta_values = [1.0, 0.5, 0.2, 0.1, 0.05, 0.02]
tau = 0.75
alpha = 0.1

# Collect results
times = []
objectives = []

# Define pinball loss function for evaluation


def pinball_loss(y_true, y_pred, tau=0.5):
    """Compute the pinball loss for quantile regression."""
    residuals = y_true - y_pred
    return np.mean(np.where(residuals >= 0,
                            tau * residuals,
                            (1 - tau) * -residuals))


# Run for each delta
for delta in delta_values:
    datafit = QuantileHuber(delta=delta, quantile=tau)
    solver = FISTA(max_iter=1000, tol=1e-6)

    start_time = time.time()
    est = GeneralizedLinearEstimator(
        datafit=datafit, penalty=L1(alpha=alpha), solver=solver
    ).fit(X, y)

    elapsed = time.time() - start_time
    times.append(elapsed)

    # Compute the pinball loss of the solution (not the smoothed loss)
    pinball = pinball_loss(y, est.predict(X), tau=tau)
    objectives.append(pinball)

# %%
# The results show the trade-off between optimization speed and solution quality.
# Let's plot the results to visualize this relationship.

fig6, ax6 = plt.subplots(figsize=(10, 5))

ax6.plot(delta_values, times, 'o-')
ax6.set_xscale('log')
ax6.set_xlabel('Smoothing parameter (delta)')
ax6.set_ylabel('Time (seconds)')
ax6.set_title('Computation Time vs Smoothing Parameter')
ax6.grid(True, alpha=0.3)

fig7, ax7 = plt.subplots(figsize=(10, 5))
ax7.plot(delta_values, objectives, 'o-')
ax7.set_xscale('log')
ax7.set_xlabel('Smoothing parameter (delta)')
ax7.set_ylabel('Pinball Loss')
ax7.set_title(f'Final Pinball Loss (τ={tau}) vs Smoothing Parameter')
ax7.grid(True, alpha=0.3)

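# %%
# As a rough sanity check on the last fitted model (the smallest delta from the
# loop above), we can look at its empirical coverage: the fraction of training
# targets falling at or below the prediction. For an unregularized quantile fit
# with an intercept this would sit close to τ; here, L1 regularization (and the
# absence of a dedicated intercept term, depending on the solver) can pull it
# away from the target quantile.

coverage = np.mean(y <= est.predict(X))
print(f"Requested quantile: {tau:.2f}, empirical coverage: {coverage:.3f}")
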
# %%
# This example illustrates the key trade-off when choosing the smoothing parameter:
#
# - Larger values of delta make the problem easier to optimize (faster convergence,
#   fewer iterations), but may yield less accurate results for the original
#   quantile regression objective.
# - Smaller values of delta give more accurate results, but may require more
#   iterations to converge and take longer to compute.
#
# In practice, a progressive smoothing approach (as used in SmoothQuantileRegressor)
# can be beneficial, starting with a large delta and gradually reducing it to
# approach the original non-smooth problem; a minimal sketch of this idea is shown
# below.
#
# The optimal choice of delta depends on your specific application and the balance
# between computational efficiency and solution accuracy you require.
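
# %%
# The cell below is a minimal, illustrative sketch of such a progressive
# smoothing loop. It simply refits the model at a decreasing sequence of delta
# values and reports the pinball loss at each stage; a full continuation scheme
# would typically also warm-start each stage from the previous solution instead
# of refitting from scratch, which this sketch does for simplicity.

progressive_deltas = [1.0, 0.3, 0.1, 0.03]
for delta in progressive_deltas:
    est_stage = GeneralizedLinearEstimator(
        datafit=QuantileHuber(delta=delta, quantile=tau),
        penalty=L1(alpha=alpha),
        solver=FISTA(max_iter=1000, tol=1e-6),
    ).fit(X, y)
    stage_loss = pinball_loss(y, est_stage.predict(X), tau=tau)
    print(f"delta={delta:<4}  pinball loss={stage_loss:.4f}")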