r"""
Bayesian Linear Regression
==========================

In the :ref:`sphx_glr_gallery_plot_linearregr.py` example, we
performed linear regression by applying a variety of solvers to the
:py:class:`pylops.LinearRegression` operator.

In this example, we will apply linear regression the Bayesian way.
In Bayesian inference, we are not looking for a "best" estimate
of the linear regression parameters; rather, we are looking for
all possible parameters and their associated (posterior) probability,
that is, how likely it is that those parameters generated our data.

To do this, we will leverage the probabilistic programming library
`PyMC <https://www.pymc.io>`_.

In the Bayesian formulation, we write the problem in the following manner:

    .. math::
        y_i \sim N(x_0 + x_1 t_i, \sigma) \qquad \forall i=0,1,\ldots,N-1

where :math:`x_0` is the intercept and :math:`x_1` is the gradient.
This notation means that the observed measurements :math:`y_i` are normally
distributed around the mean :math:`x_0 + x_1 t_i` with a standard deviation of
:math:`\sigma`.
We can also express this problem in matrix form, which makes it clear that we
can use a PyLops operator to describe this relationship:

    .. math::
        \mathbf{y} \sim N(\mathbf{A} \mathbf{x}, \sigma)

In this example, we will combine the Bayesian power of PyMC with the linear
language of PyLops.
"""

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pymc as pm

import pylops

plt.close("all")
np.random.seed(10)

###############################################################################
# Define the input parameters: number of samples along the t-axis (``N``),
# linear regression coefficients (``x``), and standard deviation of noise
# to be added to data (``sigma``).
N = 30
x = np.array([1.0, 0.5])
sigma = 0.25

###############################################################################
# Let's create the time axis and initialize the
# :py:class:`pylops.LinearRegression` operator
t = np.linspace(0, 1, N)
LRop = pylops.LinearRegression(t, dtype=t.dtype)

###############################################################################
# We can then apply the operator in forward mode to compute our data points
# (``y``). We will also generate some random Gaussian noise and create a noisy
# version of the data (``yn``).
y = LRop @ x
yn = y + np.random.normal(0, sigma, N)

###############################################################################
# The deterministic solution is to solve
# :math:`\mathbf{y} = \mathbf{A} \mathbf{x}` in a least-squares sense.
# Using PyLops, the ``/`` operator solves this iteratively (i.e., with
# :py:func:`scipy.sparse.linalg.lsqr`).
# In Bayesian terminology, this estimator is known as the maximum likelihood
# estimate (MLE).
x_mle = LRop / yn
noise_mle = np.sqrt(np.sum((yn - LRop @ x_mle) ** 2) / (N - 1))

###############################################################################
# Alternatively, we may regularize the problem. In this case, we condition the
# solution towards smaller-magnitude parameters by using a regularized
# least-squares approach. Since the regularization weight is quite small, we
# expect the result to be very similar to the one above.
sigma_prior = 20
eps = 1 / np.sqrt(2) / sigma_prior
x_map, *_ = pylops.optimization.basic.lsqr(LRop, yn, damp=eps)
noise_map = np.sqrt(np.sum((yn - LRop @ x_map) ** 2) / (N - 1))

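###############################################################################
# As a quick cross-check (a sketch added here, not part of the original
# workflow), the damped least-squares solution can also be obtained in closed
# form from the normal equations
# :math:`(\mathbf{A}^T\mathbf{A} + \epsilon^2\mathbf{I})\,\mathbf{x} =
# \mathbf{A}^T\mathbf{y}`. The helper names ``A_dense`` and ``x_map_closed``
# are introduced purely for illustration; the result should closely match the
# iterative ``x_map`` above.
A_dense = LRop.todense()  # explicit dense matrix of the linear operator
x_map_closed = np.linalg.solve(
    A_dense.T @ A_dense + eps**2 * np.eye(A_dense.shape[1]), A_dense.T @ yn
)
print("MAP (iterative lsqr):  ", x_map)
print("MAP (normal equations):", x_map_closed)
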
###############################################################################
# Let's plot the best-fitting lines on top of the noise-free and noisy data
fig, ax = plt.subplots(figsize=(8, 4))
for est, est_label, c in zip(
    [x, x_mle, x_map], ["True", "MLE", "MAP"], ["k", "C0", "C1"]
):
    ax.plot(
        np.array([t.min(), t.max()]),
        np.array([t.min(), t.max()]) * est[1] + est[0],
        color=c,
        ls="--" if est_label == "MAP" else "-",
        lw=4,
        label=rf"{est_label}: $x_0$ = {est[0]:.2f}, $x_1$ = {est[1]:.2f}",
    )
ax.scatter(t, y, c="r", s=70)
ax.scatter(t, yn, c="g", s=70)
ax.legend()
fig.tight_layout()

###############################################################################
# Let's solve this problem the Bayesian way, which consists of obtaining the
# posterior probability :math:`p(\mathbf{x}\,|\,\mathbf{y})` via Bayes'
# theorem:
#
# .. math::
#     \underbrace{p(\mathbf{x} \,|\, \mathbf{y})}_{\text{posterior}}
#     \propto \overbrace{p(\mathbf{y} \,|\, \mathbf{x})}^{\text{likelihood}}\;
#     \overbrace{p(\mathbf{x})}^{\text{prior}}
#
# To do so, we need to define the priors and the likelihood.
#
# As hinted above, priors in Bayesian analysis can be interpreted as the
# probabilistic equivalent of regularization. Finding the maximum a posteriori
# (MAP) estimate of a least-squares problem with a Gaussian prior on the
# parameters is equivalent to applying Tikhonov (L2) regularization to these
# parameters, while a Laplace prior is equivalent to sparse (L1) regularization.
# In addition, the weight of the regularization is controlled by the "scale" of
# the prior distribution; the standard deviation (in the case of a Gaussian) is
# inversely proportional to the strength of the regularization. So, if we use
# the same ``sigma_prior`` as above for the standard deviation of our prior
# distribution, we should obtain the same MAP estimate. In practice, in
# Bayesian analysis we are not only interested in point estimates such as the
# MAP, but rather in the whole posterior distribution. If you want the MAP
# only, there are better methods to obtain it, such as the one shown above.
#
# In this problem we will use weak, not very informative priors, which accept
# a wide range of probable values. This is equivalent to setting the
# regularization "weights" to be small, as shown above:
#
# .. math::
#     x_0 \sim N(0, 20)
#
#     x_1 \sim N(0, 20)
#
#     \sigma \sim \text{HalfCauchy}(10)
#
# The (log) likelihood in Bayesian analysis is the equivalent of the cost
# function in deterministic inverse problems. In this case we have already
# seen this likelihood:
#
# .. math::
#     \mathbf{y}\,|\,\mathbf{x} \sim N(\mathbf{A}\mathbf{x}, \sigma)
#

# Construct a PyTensor `Op` which can be used in a PyMC model.
pytensor_lrop = pylops.PyTensorOperator(LRop)
dims = pytensor_lrop.dims  # Inherits dims, dimsd and shape from LRop

# Construct the PyMC model
with pm.Model() as model:
    y_data = pm.Data("y_data", yn)

    # Define priors
    sp = pm.HalfCauchy("σ", beta=10)
    xp = pm.Normal("x", 0, sigma=sigma_prior, shape=dims)
    mu = pm.Deterministic("mu", pytensor_lrop(xp))

    # Define likelihood
    likelihood = pm.Normal("y", mu=mu, sigma=sp, observed=y_data)

    # Inference!
    idata = pm.sample(500, tune=200, chains=2)

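###############################################################################
# As a hedged sanity check of the prior/regularization correspondence discussed
# above (this snippet is a sketch, not part of the original example), we can
# compare the posterior mean of ``x`` with the damped least-squares estimate
# ``x_map``. The two should be close, although not identical: the posterior
# mean is not exactly the MAP, and MCMC sampling is stochastic.
x_post_mean = idata.posterior["x"].mean(dim=("chain", "draw")).values
print("Posterior mean of x:", x_post_mean)
print("MAP estimate of x:  ", x_map)
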
###############################################################################
# The plot below is known as the "trace" plot. The left column displays the
# posterior distributions of all latent variables in the model. The top-left
# plot has multiple colored posteriors, one for each parameter of the latent
# vector :math:`\mathbf{x}`. The bottom-left plot displays the posterior of the
# estimated noise :math:`\sigma`.
#
# In these plots there are multiple distributions of the same color and
# multiple line styles. Each of these represents a "chain". A chain is a single
# run of a Monte Carlo algorithm. Generally, Monte Carlo methods run multiple
# chains to ensure that all regions of the posterior distribution are sampled.
# These chains are shown in the right-hand plots.

axes = az.plot_trace(idata, figsize=(10, 7), var_names=["~mu"])
axes[0, 0].axvline(x[0], label="True Intercept", lw=2, color="k")
axes[0, 0].axvline(x_map[0], label="Intercept MAP", lw=2, color="C0", ls="--")
axes[0, 0].axvline(x[1], label="True Slope", lw=2, color="darkgray")
axes[0, 0].axvline(x_map[1], label="Slope MAP", lw=2, color="C1", ls="--")
axes[0, 1].axhline(x[0], label="True Intercept", lw=2, color="k")
axes[0, 1].axhline(x_map[0], label="Intercept MAP", lw=2, color="C0", ls="--")
axes[0, 1].axhline(x[1], label="True Slope", lw=2, color="darkgray")
axes[0, 1].axhline(x_map[1], label="Slope MAP", lw=2, color="C1", ls="--")
axes[1, 0].axvline(sigma, label="True Sigma", lw=2, color="k")
axes[1, 0].axvline(noise_map, label="Sigma MAP", lw=2, color="C0", ls="--")
axes[1, 1].axhline(sigma, label="True Sigma", lw=2, color="k")
axes[1, 1].axhline(noise_map, label="Sigma MAP", lw=2, color="C0", ls="--")
for ax in axes.ravel():
    ax.legend()
ax.get_figure().tight_layout()

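###############################################################################
# Beyond visually inspecting the traces, ArviZ can also report numerical
# convergence diagnostics such as ``r_hat`` and the effective sample size.
# This snippet is an optional addition, shown only as a sketch of how one
# might check the chains.
print(az.summary(idata, var_names=["x", "σ"]))
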
###############################################################################
# With this model, we can obtain an uncertainty measurement via the highest
# density interval (HDI). To do that, we need to sample the "posterior
# predictive", that is, the posterior distribution of the data, given the
# model. What this does is sample the latent vectors from their posteriors
# (above) and use the model to construct realizations of the data given those
# samples. They represent what the model thinks the data should look like,
# given everything it has already seen.

with model:
    pm.sample_posterior_predictive(idata, extend_inferencedata=True)

###############################################################################
# sphinx_gallery_thumbnail_number = 3
fig, ax = plt.subplots(figsize=(8, 4))
az.plot_hdi(
    t,
    idata.posterior_predictive["y"],
    fill_kwargs={"label": "95% HDI"},
    hdi_prob=0.95,
    ax=ax,
)
for est, est_label, c in zip(
    [x, x_mle, x_map], ["True", "MLE", "MAP"], ["k", "C0", "C1"]
):
    ax.plot(
        np.array([t.min(), t.max()]),
        np.array([t.min(), t.max()]) * est[1] + est[0],
        color=c,
        ls="--" if est_label == "MAP" else "-",
        lw=4,
        label=rf"{est_label}: $x_0$ = {est[0]:.2f}, $x_1$ = {est[1]:.2f}",
    )
ax.scatter(t, y, c="r", s=70)
ax.scatter(t, yn, c="g", s=70)
ax.legend()
fig.tight_layout()
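
###############################################################################
# The 95% band shown in the figure can also be inspected numerically with
# :py:func:`arviz.hdi`. This closing snippet is only a sketch of one possible
# way to extract the interval; it is not part of the original example.
y_hdi = az.hdi(idata.posterior_predictive["y"], hdi_prob=0.95)
print(y_hdi)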