@@ -156,9 +156,9 @@ def plot_total(ax, mean_samples, var_samples=None, bootstrap=True, n_boots=100):
    # Estimate the aggregate behavior using samples from each normal distribution in the posterior
    samples = (
        rng.normal(
-            mean_samples.T[:, :, None],
-            np.sqrt(var_samples).T[:, :, None],
-            (*mean_samples.T.shape, n_boots),
+            mean_samples.values.T[..., None],
+            np.sqrt(var_samples.values).T[..., None],
+            (*mean_samples.values.T.shape, n_boots),
        )
        .reshape(len(Xnew_), -1)
        .T
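
For reference, the aggregation in this hunk boils down to the following NumPy sketch (the shapes are hypothetical stand-ins for the notebook's posterior draws, not values from the notebook itself):

```python
import numpy as np

rng = np.random.default_rng(0)

# hypothetical sizes: posterior draws of the mean/variance at each prediction point
n_samples, n_points, n_boots = 200, 100, 100
mean_samples = rng.normal(size=(n_samples, n_points))
var_samples = rng.uniform(0.1, 0.5, size=(n_samples, n_points))

# for each posterior draw and each point, draw n_boots values from Normal(mean, sd);
# flattening the draws mixes epistemic (spread of the means) and aleatoric (sampled
# noise) uncertainty into a single set of samples per prediction point
agg = (
    rng.normal(
        mean_samples.T[..., None],          # (n_points, n_samples, 1)
        np.sqrt(var_samples).T[..., None],  # broadcasts against n_boots
        (n_points, n_samples, n_boots),
    )
    .reshape(n_points, -1)
    .T
)  # -> (n_samples * n_boots, n_points)
```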
@@ -269,7 +269,7 @@ This approach captured slightly more nuance in the overall uncertainty than the

Now let's model the mean and the log of the variance as separate GPs through PyMC's `Latent` implementation, feeding both into a `Normal` likelihood. Note that we add a small amount of diagonal noise to the individual covariances in order to stabilize them for inversion.

-The `Latent` parameterization takes significantly longer to sample than the `Marginal` approach, so we are going to accelerate the sampling with the Numpyro NUTS sampler.
+The `Latent` parameterization takes significantly longer to sample than the `Marginal` model, so we are going to accelerate the sampling with the Numpyro NUTS sampler.

```{code-cell} ipython3
with pm.Model() as model_ht:
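
On the diagonal-noise remark in the paragraph above: `pm.gp.util.stabilize` adds a small jitter to the diagonal of a covariance matrix so that its Cholesky factorization succeeds. Conceptually it amounts to something like this NumPy sketch (not PyMC's actual implementation, and the jitter value here is just illustrative):

```python
import numpy as np


def stabilize(K, jitter=1e-6):
    # add a tiny diagonal shift so a nearly singular covariance stays positive definite
    return K + jitter * np.eye(K.shape[0])


K = np.ones((3, 3))  # rank-1: np.linalg.cholesky(K) would fail
L = np.linalg.cholesky(stabilize(K))  # succeeds after jittering
```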
@@ -295,7 +295,7 @@ with pm.Model() as model_ht:
    trace_ht = pm.sample(
        target_accept=0.95,
        chains=2,
-        nuts_sampler="nutpie",
+        nuts_sampler="numpyro",
        return_inferencedata=True,
        random_seed=SEED,
    )
@@ -311,15 +311,6 @@ with model_ht:
    )
```

-```{code-cell} ipython3
-_, axs = plt.subplots(1, 3, figsize=(18, 4))
-μ_samples = samples_ht["μ_pred_ht"]
-σ_samples = np.exp(samples_ht["lg_σ_pred_ht"])
-plot_mean(axs[0], μ_samples)
-plot_var(axs[1], σ_samples**2)
-plot_total(axs[2], μ_samples, σ_samples**2)
-```
-
```{code-cell} ipython3
_, axs = plt.subplots(1, 3, figsize=(18, 4))
mu_samples = az.extract(trace_ht.predictions["mu_pred_ht"])["mu_pred_ht"]
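
A note on the `az.extract(...)` calls introduced in this and the following hunks: `az.extract` stacks `(chain, draw)` into a single `sample` dimension, and xarray appends stacked dimensions last, which appears to be why the extracted arrays are transposed before being handed to the plotting helpers (which treat the first axis as the sampling dimension). A self-contained sketch of the dimension handling, using made-up sizes:

```python
import numpy as np
import xarray as xr

# hypothetical predictive variable with dims (chain, draw, X)
da = xr.DataArray(np.zeros((2, 500, 100)), dims=("chain", "draw", "X"))

stacked = da.stack(sample=("chain", "draw"))  # roughly what az.extract does internally
print(stacked.dims)    # ('X', 'sample') -- the new sample dimension ends up last
print(stacked.T.dims)  # ('sample', 'X') -- samples first, as the plot helpers expect
```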
@@ -330,7 +321,7 @@ plot_var(axs[1], sigma_samples.T**2)
plot_total(axs[2], mu_samples.T, sigma_samples.T**2)
```

-That looks much better! We've accurately captured the mean behavior of our system along with an understanding of the underlying trend in the variance, with appropriate uncertainty. Crucially, the aggregate behavior of the model integrates both epistemic *and* aleatoric uncertainty, and the ~5% of our observations that fall outside the 2σ band are more or less evenly distributed across the domain. However, that took *over two hours* to sample only 4k NUTS iterations. Due to the expense of the requisite matrix inversions, GPs are notoriously inefficient for large data sets. Let's reformulate this model using a sparse approximation.
+That looks much better! We've accurately captured the mean behavior of our system, as well as the underlying trend in the variance (with appropriate uncertainty). Crucially, the aggregate behavior of the model integrates both epistemic *and* aleatoric uncertainty, and the ~5% of our observations that fall outside the 2σ band are more or less evenly distributed across the domain. However, even with the Numpyro sampler, this took nearly an hour on a Ryzen 7040 laptop to sample only 4k NUTS iterations. Due to the expense of the requisite matrix inversions, GPs are notoriously inefficient for large data sets. Let's reformulate this model using a sparse approximation.

+++

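Before the `SparseLatent` hunks below, it may help to state the inducing-point construction they implement. Sketched from the code itself (with $\mathbf{u}$ the latent function values at the inducing inputs $X_u$), the prior and conditional follow the usual whitened, projected-process form:

$$
\begin{aligned}
K_{uu} &= LL^{\top}, \qquad \mathbf{u} = L\mathbf{v}, \quad \mathbf{v} \sim \mathcal{N}(0, I),\\
\boldsymbol{\mu}_f &= K_{fu}\,K_{uu}^{-1}\mathbf{u},\\
\mathbf{f}_* &\sim \mathcal{N}\!\left(K_{*u}\,K_{uu}^{-1}\mathbf{u},\;\; K_{**} - K_{*u}\,K_{uu}^{-1}K_{u*}\right).
\end{aligned}
$$
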
@@ -347,25 +338,25 @@ class SparseLatent:

    def prior(self, name, X, Xu):
        Kuu = self.cov(Xu)
-        self.L = pm.gp.util.cholesky(pm.gp.util.stabilize(Kuu))
+        self.L = pt.linalg.cholesky(pm.gp.util.stabilize(Kuu))

-        self.v = pm.Normal(f"u_rotated_{name}", mu=0.0, sd=1.0, shape=len(Xu))
-        self.u = pm.Deterministic(f"u_{name}", tt.dot(self.L, self.v))
+        self.v = pm.Normal(f"u_rotated_{name}", mu=0.0, sigma=1.0, shape=len(Xu))
+        self.u = pm.Deterministic(f"u_{name}", pt.dot(self.L, self.v))

        Kfu = self.cov(X, Xu)
-        self.Kuiu = tt.slinalg.solve_upper_triangular(
-            self.L.T, tt.slinalg.solve_lower_triangular(self.L, self.u)
+        self.Kuiu = pt.slinalg.solve_triangular(
+            self.L.T, pt.slinalg.solve_triangular(self.L, self.u, lower=True), lower=False
        )
-        self.mu = pm.Deterministic(f"mu_{name}", tt.dot(Kfu, self.Kuiu))
+        self.mu = pm.Deterministic(f"mu_{name}", pt.dot(Kfu, self.Kuiu))
        return self.mu

    def conditional(self, name, Xnew, Xu):
        Ksu = self.cov(Xnew, Xu)
-        mus = tt.dot(Ksu, self.Kuiu)
-        tmp = tt.slinalg.solve_lower_triangular(self.L, Ksu.T)
-        Qss = tt.dot(tmp.T, tmp)  # Qss = tt.dot(tt.dot(Ksu, tt.nlinalg.pinv(Kuu)), Ksu.T)
+        mus = pt.dot(Ksu, self.Kuiu)
+        tmp = pt.slinalg.solve_triangular(self.L, Ksu.T, lower=True)
+        Qss = pt.dot(tmp.T, tmp)
        Kss = self.cov(Xnew)
-        Lss = pm.gp.util.cholesky(pm.gp.util.stabilize(Kss - Qss))
+        Lss = pt.linalg.cholesky(pm.gp.util.stabilize(Kss - Qss))
        mu_pred = pm.MvNormal(name, mu=mus, chol=Lss, shape=len(Xnew))
        return mu_pred
```
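
The triangular-solve changes in this hunk are a one-to-one API migration: the Theano/Aesara-era `solve_lower_triangular(L, b)` and `solve_upper_triangular(U, b)` map onto PyTensor's single `solve_triangular` with `lower=True`/`lower=False`. A quick self-contained check of that equivalence (the matrix here is arbitrary, just made well conditioned):

```python
import numpy as np
from pytensor.tensor.slinalg import solve_triangular

rng = np.random.default_rng(0)
L = np.tril(rng.normal(size=(4, 4))) + 4.0 * np.eye(4)  # well-conditioned lower-triangular matrix
b = np.arange(4.0)

# lower=True replaces solve_lower_triangular; lower=False replaces solve_upper_triangular
x = solve_triangular(L, b, lower=True).eval()
np.testing.assert_allclose(L @ x, b, atol=1e-8)
```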
@@ -375,39 +366,51 @@ class SparseLatent:
Xu = X[1::2]

with pm.Model() as model_hts:
-    ℓ = pm.InverseGamma("ℓ", mu=ℓ_μ, sigma=ℓ_σ)
-    η = pm.Gamma("η", alpha=2, beta=1)
-    cov = η**2 * pm.gp.cov.ExpQuad(input_dim=1, ls=ℓ)
+    ell = pm.InverseGamma("ell", mu=ell_mu, sigma=ell_sigma)
+    eta = pm.Gamma("eta", alpha=2, beta=1)
+    cov = eta**2 * pm.gp.cov.ExpQuad(input_dim=1, ls=ell)

-    μ_gp = SparseLatent(cov)
-    μ_f = μ_gp.prior("μ", X_obs, Xu)
+    mu_gp = SparseLatent(cov)
+    mu_f = mu_gp.prior("mu", X_obs, Xu)

-    σ_ℓ = pm.InverseGamma("σ_ℓ", mu=ℓ_μ, sigma=ℓ_σ)
-    σ_η = pm.Gamma("σ_η", alpha=2, beta=1)
-    σ_cov = σ_η**2 * pm.gp.cov.ExpQuad(input_dim=1, ls=σ_ℓ)
+    sigma_ell = pm.InverseGamma("sigma_ell", mu=ell_mu, sigma=ell_sigma)
+    sigma_η = pm.Gamma("sigma_η", alpha=2, beta=1)
+    sigma_cov = sigma_η**2 * pm.gp.cov.ExpQuad(input_dim=1, ls=sigma_ell)

-    lg_σ_gp = SparseLatent(σ_cov)
-    lg_σ_f = lg_σ_gp.prior("lg_σ_f", X_obs, Xu)
-    σ_f = pm.Deterministic("σ_f", pm.math.exp(lg_σ_f))
+    lg_sigma_gp = SparseLatent(sigma_cov)
+    lg_sigma_f = lg_sigma_gp.prior("lg_sigma_f", X_obs, Xu)
+    sigma_f = pm.Deterministic("sigma_f", pm.math.exp(lg_sigma_f))

-    lik_hts = pm.Normal("lik_hts", mu=μ_f, sd=σ_f, observed=y_obs_)
-    trace_hts = pm.sample(target_accept=0.95, return_inferencedata=True, random_seed=SEED)
+    lik_hts = pm.Normal("lik_hts", mu=mu_f, sigma=sigma_f, observed=y_obs_)
+    trace_hts = pm.sample(
+        target_accept=0.95,
+        nuts_sampler="numpyro",
+        chains=2,
+        return_inferencedata=True,
+        random_seed=SEED,
+    )

with model_hts:
-    μ_pred = μ_gp.conditional("μ_pred", Xnew, Xu)
-    lg_σ_pred = lg_σ_gp.conditional("lg_σ_pred", Xnew, Xu)
-    samples_hts = pm.sample_posterior_predictive(trace_hts, var_names=["μ_pred", "lg_σ_pred"])
+    mu_pred = mu_gp.conditional("mu_pred", Xnew, Xu)
+    lg_sigma_pred = lg_sigma_gp.conditional("lg_sigma_pred", Xnew, Xu)
+    pm.sample_posterior_predictive(
+        trace_hts,
+        var_names=["mu_pred", "lg_sigma_pred"],
+        extend_inferencedata=True,
+        predictions=True,
+    )
```

```{code-cell} ipython3
_, axs = plt.subplots(1, 3, figsize=(18, 4))
-μ_samples = samples_hts["μ_pred"]
-σ_samples = np.exp(samples_hts["lg_σ_pred"])
-plot_mean(axs[0], μ_samples)
+mu_samples = az.extract(trace_hts.predictions["mu_pred"])["mu_pred"]
+sigma_samples = np.exp(az.extract(trace_hts.predictions["lg_sigma_pred"])["lg_sigma_pred"])
+
+plot_mean(axs[0], mu_samples.T)
plot_inducing_points(axs[0])
-plot_var(axs[1], σ_samples**2)
+plot_var(axs[1], sigma_samples.T**2)
plot_inducing_points(axs[1])
-plot_total(axs[2], μ_samples, σ_samples**2)
+plot_total(axs[2], mu_samples.T, sigma_samples.T**2)
plot_inducing_points(axs[2])
```

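For context on the next hunk: the `Coregion` kernel models the covariance between the two outputs (the mean and the log-variance) with the standard intrinsic-coregionalization form

$$
B = WW^{\top} + \operatorname{diag}(\kappa),
$$

and the full covariance is built as the product of `B` with the `ExpQuad` kernel over the inputs via `pm.gp.cov.Kron`. The final cells of this diff reconstruct `B` from the posterior draws of `W` and `kappa`.
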
@@ -429,31 +432,60 @@ def add_coreg_idx(x):
Xu_c, X_obs_c, Xnew_c = [add_coreg_idx(x) for x in [Xu, X_obs, Xnew]]

with pm.Model() as model_htsc:
-    ℓ = pm.InverseGamma("ℓ", mu=ℓ_μ, sigma=ℓ_σ)
-    η = pm.Gamma("η", alpha=2, beta=1)
-    EQcov = η**2 * pm.gp.cov.ExpQuad(input_dim=1, active_dims=[0], ls=ℓ)
+    ell = pm.InverseGamma("ell", mu=ell_mu, sigma=ell_sigma)
+    eta = pm.Gamma("eta", alpha=2, beta=1)
+    cov = eta**2 * pm.gp.cov.ExpQuad(input_dim=1, ls=ell)

    D_out = 2  # two output dimensions, mean and variance
    rank = 2  # two basis GPs
-    W = pm.Normal("W", mu=0, sd=3, shape=(D_out, rank), testval=np.full([D_out, rank], 0.1))
+    W = pm.Normal("W", mu=0, sigma=3, shape=(D_out, rank), initval=np.full([D_out, rank], 0.1))
    kappa = pm.Gamma("kappa", alpha=1.5, beta=1, shape=D_out)
    coreg = pm.gp.cov.Coregion(input_dim=1, active_dims=[0], kappa=kappa, W=W)

-    cov = pm.gp.cov.Kron([EQcov, coreg])
+    cov = pm.gp.cov.Kron([cov, coreg])

    gp_LMC = SparseLatent(cov)
    LMC_f = gp_LMC.prior("LMC", X_obs_c, Xu_c)

-    μ_f = LMC_f[: len(y_obs_)]
-    lg_σ_f = LMC_f[len(y_obs_) :]
-    σ_f = pm.Deterministic("σ_f", pm.math.exp(lg_σ_f))
+    mu_f = LMC_f[: len(y_obs_)]
+    lg_sigma_f = LMC_f[len(y_obs_) :]
+    sigma_f = pm.Deterministic("sigma_f", pm.math.exp(lg_sigma_f))

-    lik_htsc = pm.Normal("lik_htsc", mu=μ_f, sd=σ_f, observed=y_obs_)
-    trace_htsc = pm.sample(target_accept=0.95, return_inferencedata=True, random_seed=SEED)
+    lik_htsc = pm.Normal("lik_htsc", mu=mu_f, sigma=sigma_f, observed=y_obs_)
+    trace_htsc = pm.sample(
+        target_accept=0.95,
+        chains=2,
+        nuts_sampler="numpyro",
+        return_inferencedata=True,
+        random_seed=SEED,
+    )

with model_htsc:
    c_mu_pred = gp_LMC.conditional("c_mu_pred", Xnew_c, Xu_c)
-    samples_htsc = pm.sample_posterior_predictive(trace_htsc, var_names=["c_mu_pred"])
+    pm.sample_posterior_predictive(
+        trace_htsc, var_names=["c_mu_pred"], extend_inferencedata=True, predictions=True
+    )
+```
+
+```{code-cell} ipython3
+sigma_samples.shape
+```
+
+```{code-cell} ipython3
+# μ_samples = samples_htsc["c_mu_pred"][:, : len(Xnew)]
+# σ_samples = np.exp(samples_htsc["c_mu_pred"][:, len(Xnew) :])
+mu_samples = az.extract(trace_htsc.predictions["c_mu_pred"])["c_mu_pred"][: len(Xnew)]
+sigma_samples = np.exp(az.extract(trace_htsc.predictions["c_mu_pred"])["c_mu_pred"])[len(Xnew) :]
+
+_, axs = plt.subplots(1, 3, figsize=(18, 4))
+plot_mean(axs[0], mu_samples.T)
+plot_inducing_points(axs[0])
+plot_var(axs[1], sigma_samples.T**2)
+axs[1].set_ylim(-0.01, 0.2)
+axs[1].legend(loc="upper left")
+plot_inducing_points(axs[1])
+plot_total(axs[2], mu_samples.T, sigma_samples.T**2)
+plot_inducing_points(axs[2])
```

```{code-cell} ipython3
@@ -478,13 +510,17 @@ with model_htsc:
    B_samples = pm.sample_posterior_predictive(trace_htsc, var_names=["W", "kappa"])
```

+```{code-cell} ipython3
+kappa.shape
+```
+
```{code-cell} ipython3
# Keep in mind that the first dimension in all arrays is the sampling dimension
-W = B_samples["W"]
+W = az.extract(B_samples.posterior_predictive["W"])["W"].values.T
W_T = np.swapaxes(W, 1, 2)
WW_T = np.matmul(W, W_T)

-kappa = B_samples["kappa"]
+kappa = az.extract(B_samples.posterior_predictive["kappa"])["kappa"].values.T
I = np.tile(np.identity(2), [kappa.shape[0], 1, 1])
# einsum is just a concise way of doing multiplication and summation over arbitrary axes
diag_kappa = np.einsum("ij,ijk->ijk", kappa, I)
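
A small self-contained check of the einsum in the lines above: `np.einsum("ij,ijk->ijk", kappa, I)` builds one diagonal matrix per posterior draw, i.e. `np.diag(kappa[s])` for each sample `s` (the values below are hypothetical):

```python
import numpy as np

kappa = np.array([[0.5, 2.0], [1.0, 3.0]])           # (n_samples, 2) hypothetical draws
I = np.tile(np.identity(2), [kappa.shape[0], 1, 1])  # (n_samples, 2, 2)

diag_kappa = np.einsum("ij,ijk->ijk", kappa, I)
np.testing.assert_allclose(diag_kappa[0], np.diag(kappa[0]))
```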