|
| 1 | +# Copyright Contributors to the Pyro project. |
| 2 | +# SPDX-License-Identifier: Apache-2.0 |
| 3 | + |
| 4 | +""" |
| 5 | +Pareto Smoothed Importance Sampling (PSIS) diagnostics for variational inference. |
| 6 | +
|
| 7 | +Implements the k-hat diagnostic from: |
| 8 | + Yao, Y., Vehtari, A., Simpson, D., and Gelman, A. (2018). |
| 9 | + Yes, but Did It Work?: Evaluating Variational Inference. |
| 10 | + International Conference on Machine Learning. |
| 11 | +
|
| 12 | + Vehtari, A., Simpson, D., Gelman, A., Yao, Y., and Gabry, J. (2024). |
| 13 | + Pareto smoothed importance sampling. |
| 14 | + Journal of Machine Learning Research, 25(72):1-58. |
| 15 | +""" |
| 16 | + |
| 17 | +from __future__ import annotations |
| 18 | + |
| 19 | +from collections.abc import Callable |
| 20 | +import math |
| 21 | +import warnings |
| 22 | + |
| 23 | +import numpy as np |
| 24 | + |
| 25 | +import jax |
| 26 | +from jax import device_get, random |
| 27 | + |
| 28 | +from numpyro.handlers import seed |
| 29 | +from numpyro.infer.elbo import get_importance_log_probs |
| 30 | + |
| 31 | +__all__ = ["psis_diagnostic"] |
| 32 | + |
| 33 | + |
| 34 | +def _fit_generalized_pareto(x: np.ndarray) -> tuple[float, float]: |
| 35 | + """Estimate parameters of the Generalized Pareto Distribution (GPD). |
| 36 | +
|
| 37 | + Returns empirical Bayes estimates for the shape (k) and scale (sigma) |
| 38 | + parameters of the two-parameter GPD, using the method of Zhang and |
| 39 | + Stephens (2009) with the prior regularization from Vehtari et al. (2024). |
| 40 | +
|
| 41 | + References: |
| 42 | + Zhang, J. and Stephens, M.A. (2009). A new and efficient estimation |
| 43 | + method for the generalized Pareto distribution. Technometrics, |
| 44 | + 51(3):316-325. |
| 45 | +
|
| 46 | + Vehtari, A., Simpson, D., Gelman, A., Yao, Y., and Gabry, J. (2024). |
| 47 | + Pareto smoothed importance sampling. Journal of Machine Learning |
| 48 | + Research, 25(72):1-58. |
| 49 | +
|
| 50 | + :param numpy.ndarray x: one-dimensional array of positive exceedances (tail samples). |
| 51 | + :return: tuple of (k, sigma) where k is the shape parameter and sigma |
| 52 | + is the scale parameter. |
| 53 | + """ |
| 54 | + if x.ndim != 1 or len(x) <= 1: |
| 55 | + raise ValueError( |
| 56 | + f"Expected 1-D array with at least 2 elements, got shape {x.shape}." |
| 57 | + ) |
| 58 | + |
| 59 | + # Broad errstate is needed because degenerate inputs (e.g. zeros or |
| 60 | + # identical values) cause cascading numerical issues at multiple points: |
| 61 | + # divide: 1/x[quartile], 1/x[-1], -k/b when tail values are zero |
| 62 | + # over: exp(L - L') when profile log-likelihood differences are large |
| 63 | + # invalid: downstream ops on nan/inf from earlier divide-by-zero |
| 64 | + # The resulting nan/inf propagate correctly through the algorithm, |
| 65 | + # matching the reference implementation behavior. |
| 66 | + with np.errstate(divide="ignore", over="ignore", invalid="ignore"): |
| 67 | + return _fit_generalized_pareto_impl(x) |
| 68 | + |
| 69 | + |
| 70 | +def _fit_generalized_pareto_impl(x: np.ndarray) -> tuple[float, float]: |
| 71 | + x = np.sort(x) |
| 72 | + n = len(x) |
| 73 | + PRIOR = 3 |
| 74 | + m = 30 + int(np.sqrt(n)) |
| 75 | + |
| 76 | + # Candidate shape parameters (Zhang & Stephens grid) |
| 77 | + bs = np.arange(1, m + 1, dtype=float) |
| 78 | + bs -= 0.5 |
| 79 | + np.divide(m, bs, out=bs) |
| 80 | + np.sqrt(bs, out=bs) |
| 81 | + np.subtract(1, bs, out=bs) |
| 82 | + bs /= PRIOR * x[int(n / 4 + 0.5) - 1] |
| 83 | + bs += 1 / x[-1] |
| 84 | + |
| 85 | + # Profile log-likelihood for each candidate |
| 86 | + ks = np.negative(bs) |
| 87 | + temp = ks[:, None] * x |
| 88 | + np.log1p(temp, out=temp) |
| 89 | + np.mean(temp, axis=1, out=ks) |
| 90 | + |
| 91 | + L = bs / ks |
| 92 | + np.negative(L, out=L) |
| 93 | + np.log(L, out=L) |
| 94 | + L -= ks |
| 95 | + L -= 1 |
| 96 | + L *= n |
| 97 | + |
| 98 | + # Posterior weights (overflow in exp is expected and harmless; |
| 99 | + # overflowed values get negligible weight after normalization) |
| 100 | + temp = L - L[:, None] |
| 101 | + np.exp(temp, out=temp) |
| 102 | + w = np.sum(temp, axis=1) |
| 103 | + np.divide(1, w, out=w) |
| 104 | + |
| 105 | + # Remove negligible weights |
| 106 | + dii = w >= 10 * np.finfo(float).eps |
| 107 | + if not np.all(dii): |
| 108 | + w = w[dii] |
| 109 | + bs = bs[dii] |
| 110 | + w /= w.sum() |
| 111 | + |
| 112 | + # Posterior mean for b |
| 113 | + b = np.sum(bs * w) |
| 114 | + |
| 115 | + # Estimate for k (note: negated relative to Zhang & Stephens) |
| 116 | + temp = (-b) * x |
| 117 | + np.log1p(temp, out=temp) |
| 118 | + k = np.mean(temp) |
| 119 | + |
| 120 | + # Estimate for sigma |
| 121 | + sigma = -k / b |
| 122 | + |
| 123 | + # Weakly informative prior for k (Vehtari et al. 2024, Appendix G) |
| 124 | + # Prior: mean=0.5, effective sample size a=10 |
| 125 | + a = 10 |
| 126 | + k = k * n / (n + a) + a * 0.5 / (n + a) |
| 127 | + |
| 128 | + return float(k), float(sigma) |
| 129 | + |
| 130 | + |
def _compute_log_weights(
    rng_key: jax.Array,
    param_map: dict[str, jax.Array],
    model: Callable,
    guide: Callable,
    args: tuple,
    kwargs: dict,
) -> jax.Array:
    """Return the log importance weight log p(x,z) - log q(z) for one particle."""
    # The guide consumes randomness to sample its latent sites; the model
    # receives an independent key in case it carries stochasticity beyond
    # the sites replayed from the guide (e.g. stochastic control flow).
    model_key, guide_key = random.split(rng_key)
    model_lps, guide_lps = get_importance_log_probs(
        seed(model, model_key), seed(guide, guide_key), args, kwargs, param_map
    )
    log_joint = sum(site.sum() for site in model_lps.values())
    log_q = sum(site.sum() for site in guide_lps.values())
    return log_joint - log_q
| 152 | + |
| 153 | + |
| 154 | +def _psis_khat(log_weights: np.ndarray) -> float: |
| 155 | + """Compute PSIS k-hat from an array of raw log importance weights.""" |
| 156 | + log_weights = log_weights.copy() |
| 157 | + log_weights -= log_weights.max() |
| 158 | + log_weights = np.sort(log_weights) |
| 159 | + |
| 160 | + # S matches notation in Vehtari et al. (2024), Algorithm 1 |
| 161 | + S = len(log_weights) |
| 162 | + |
| 163 | + # Tail extraction (Vehtari et al. 2024, Algorithm 1) |
| 164 | + M = math.ceil(min(0.2 * S, 3 * math.sqrt(S))) |
| 165 | + cutoff_ind = -(M + 1) |
| 166 | + lw_cutoff = max(np.log(np.finfo(float).tiny), log_weights[cutoff_ind]) |
| 167 | + |
| 168 | + lw_tail = log_weights[log_weights > lw_cutoff] |
| 169 | + |
| 170 | + if len(lw_tail) < 5: |
| 171 | + warnings.warn( |
| 172 | + "Not enough tail samples for reliable PSIS diagnostic.", |
| 173 | + stacklevel=3, |
| 174 | + ) |
| 175 | + return float("inf") |
| 176 | + |
| 177 | + # Shift to exceedances |
| 178 | + tail = np.exp(lw_tail) - np.exp(lw_cutoff) |
| 179 | + |
| 180 | + # Fit GPD to the tail |
| 181 | + k, sigma = _fit_generalized_pareto(tail) |
| 182 | + |
| 183 | + return float(k) |
| 184 | + |
| 185 | + |
| 186 | +def psis_diagnostic( |
| 187 | + rng_key: jax.Array, |
| 188 | + param_map: dict[str, jax.Array], |
| 189 | + model: Callable, |
| 190 | + guide: Callable, |
| 191 | + *args, |
| 192 | + num_particles: int = 1000, |
| 193 | + chunk_size: int | None = None, |
| 194 | + **kwargs, |
| 195 | +) -> float: |
| 196 | + r"""Compute the PSIS k-hat diagnostic for a model/guide pair. |
| 197 | +
|
| 198 | + The k-hat statistic measures the reliability of importance sampling |
| 199 | + estimates. It is the shape parameter of a Generalized Pareto Distribution |
| 200 | + (GPD) fitted to the upper tail of the importance weights. |
| 201 | +
|
| 202 | + Interpretation (Vehtari et al. 2024): |
| 203 | +
|
| 204 | + - k < 0.5: finite variance, classical CLT applies |
| 205 | + - 0.5 <= k < 0.7: finite mean, generalized CLT may apply |
| 206 | + - k >= 0.7: unreliable importance sampling estimates |
| 207 | +
|
| 208 | + **Example usage**:: |
| 209 | +
|
| 210 | + >>> from jax import random |
| 211 | + >>> from numpyro.infer import SVI, Trace_ELBO, psis_diagnostic |
| 212 | + >>> svi = SVI(model, guide, optimizer, Trace_ELBO()) |
| 213 | + >>> svi_result = svi.run(random.PRNGKey(0), num_steps, *args) |
| 214 | + >>> khat = psis_diagnostic( |
| 215 | + ... random.PRNGKey(1), svi_result.params, model, guide, *args |
| 216 | + ... ) |
| 217 | +
|
| 218 | + .. note:: |
| 219 | +
|
| 220 | + For reliable results, use at least several hundred particles |
| 221 | + (the default of 1000 is usually sufficient). Very few particles |
| 222 | + may not provide enough tail samples for GPD fitting. |
| 223 | +
|
| 224 | + :param jax.random.PRNGKey rng_key: random number generator key. |
| 225 | + :param dict param_map: dictionary of current parameter values |
| 226 | + (e.g. ``svi_result.params``). |
| 227 | + :param Callable model: NumPyro model. |
| 228 | + :param Callable guide: NumPyro guide. |
| 229 | + :param args: positional arguments to model and guide. |
| 230 | + :param int num_particles: number of importance weight samples to draw. |
| 231 | + :param int chunk_size: maximum particles to evaluate at once (for memory |
| 232 | + control). If None, all particles are evaluated together. |
| 233 | + :param kwargs: keyword arguments to model and guide. |
| 234 | + :return: the estimated k-hat statistic. |
| 235 | + :rtype: float |
| 236 | + """ |
| 237 | + if num_particles < 2: |
| 238 | + raise ValueError("num_particles must be at least 2.") |
| 239 | + |
| 240 | + if chunk_size is None: |
| 241 | + chunk_size = num_particles |
| 242 | + |
| 243 | + rng_keys = random.split(rng_key, num_particles) |
| 244 | + |
| 245 | + # Compute log weights in batches |
| 246 | + def compute_fn(key): |
| 247 | + return _compute_log_weights(key, param_map, model, guide, args, kwargs) |
| 248 | + |
| 249 | + log_weights_list = [] |
| 250 | + for batch_start in range(0, num_particles, chunk_size): |
| 251 | + batch_keys = rng_keys[batch_start : batch_start + chunk_size] |
| 252 | + batch_lw = jax.vmap(compute_fn)(batch_keys) |
| 253 | + log_weights_list.append(batch_lw) |
| 254 | + |
| 255 | + log_weights = np.concatenate( |
| 256 | + [np.asarray(device_get(lw)) for lw in log_weights_list] |
| 257 | + ) |
| 258 | + |
| 259 | + return _psis_khat(log_weights) |
0 commit comments