Add Inverse Wishart Distribution (#2103)

juanitorduz · web-flow · commit 212bccd114f6 · 2025-11-30T23:37:34.000+01:00
* init

* docs and simplify

* rtol

* feedback 1

* add comment

* improve docstrings

* feedback

* reuse existing code

* feedback flip operator

* simplify flip operator
diff --git a/docs/source/distributions.rst b/docs/source/distributions.rst
@@ -256,6 +256,22 @@ InverseGamma
     :show-inheritance:
     :member-order: bysource
 
+InverseWishart
+^^^^^^^^^^^^^^
+.. autoclass:: numpyro.distributions.continuous.InverseWishart
+    :members:
+    :undoc-members:
+    :show-inheritance:
+    :member-order: bysource
+
+InverseWishartCholesky
+^^^^^^^^^^^^^^^^^^^^^^
+.. autoclass:: numpyro.distributions.continuous.InverseWishartCholesky
+    :members:
+    :undoc-members:
+    :show-inheritance:
+    :member-order: bysource
+
 Kumaraswamy
 ^^^^^^^^^^^
 .. autoclass:: numpyro.distributions.continuous.Kumaraswamy
diff --git a/numpyro/distributions/__init__.py b/numpyro/distributions/__init__.py
@@ -38,6 +38,8 @@
     HalfCauchy,
     HalfNormal,
     InverseGamma,
+    InverseWishart,
+    InverseWishartCholesky,
     Kumaraswamy,
     Laplace,
     Levy,
@@ -169,6 +171,8 @@
     "ImproperUniform",
     "Independent",
     "InverseGamma",
+    "InverseWishart",
+    "InverseWishartCholesky",
     "Kumaraswamy",
     "Laplace",
     "LeftTruncatedDistribution",
diff --git a/numpyro/distributions/continuous.py b/numpyro/distributions/continuous.py
@@ -3265,6 +3265,304 @@ def infer_shapes(
                 return batch_shape, event_shape
 
 
+class InverseWishart(TransformedDistribution):
+    r"""
+    Inverse Wishart distribution for covariance matrices.
+
+    The Inverse Wishart distribution is the conjugate prior for the covariance matrix
+    of a multivariate normal distribution. If :math:`\mathbf{X} \sim W^{-1}(\mathbf{\Psi}, \nu)`,
+    then :math:`\mathbf{X}^{-1} \sim W(\mathbf{\Psi}^{-1}, \nu)` (Wishart distribution).
+
+    .. math::
+
+        p(\mathbf{X} \mid \mathbf{\Psi}, \nu) =
+        \frac{|\mathbf{\Psi}|^{\nu/2}}{2^{\nu p/2} \Gamma_p(\nu/2)}
+        |\mathbf{X}|^{-(\nu + p + 1)/2}
+        \exp\left( -\frac{1}{2} \mathrm{tr}(\mathbf{\Psi} \mathbf{X}^{-1}) \right)
+
+    where :math:`p` is the dimension of the matrix, :math:`\nu > p - 1` is the degrees
+    of freedom, and :math:`\mathbf{\Psi}` is the positive definite scale matrix.
+
+    :param concentration: Degrees of freedom parameter (often denoted :math:`\nu`).
+        Must be greater than `p - 1` where `p` is the dimension of the scale matrix.
+    :param scale_matrix: Positive definite scale matrix :math:`\mathbf{\Psi}`, analogous
+        to the inverse rate of a :class:`Gamma` distribution.
+    :param rate_matrix: Inverse of the scale matrix, analogous to the rate of a
+        :class:`Gamma` distribution.
+    :param scale_tril: Cholesky decomposition of the scale matrix.
+
+    **Properties**
+
+    - **Mean**: :math:`\frac{\mathbf{\Psi}}{\nu - p - 1}` for :math:`\nu > p + 1`
+    - **Mode**: :math:`\frac{\mathbf{\Psi}}{\nu + p + 1}`
+
+    **References**
+
+    [1] https://en.wikipedia.org/wiki/Inverse-Wishart_distribution
+    """
+
+    arg_constraints = {
+        "concentration": constraints.dependent(is_discrete=False, event_dim=0),
+        "scale_matrix": constraints.positive_definite,
+        "rate_matrix": constraints.positive_definite,
+        "scale_tril": constraints.lower_cholesky,
+    }
+    support = constraints.positive_definite
+    reparametrized_params = [
+        "scale_matrix",
+        "rate_matrix",
+        "scale_tril",
+    ]
+
+    def __init__(
+        self,
+        concentration: ArrayLike,
+        scale_matrix: Optional[Array] = None,
+        rate_matrix: Optional[Array] = None,
+        scale_tril: Optional[Array] = None,
+        *,
+        validate_args: Optional[bool] = None,
+    ) -> None:
+        base_dist = InverseWishartCholesky(
+            concentration,
+            scale_matrix,
+            rate_matrix,
+            scale_tril,
+            validate_args=validate_args,
+        )
+        super().__init__(
+            base_dist, CholeskyTransform().inv, validate_args=validate_args
+        )
+
+    @lazy_property
+    def concentration(self):
+        return self.base_dist.concentration
+
+    @lazy_property
+    def scale_matrix(self):
+        return self.base_dist.scale_matrix
+
+    @lazy_property
+    def rate_matrix(self):
+        return self.base_dist.rate_matrix
+
+    @lazy_property
+    def scale_tril(self):
+        return self.base_dist.scale_tril
+
+    @lazy_property
+    def mean(self) -> ArrayLike:
+        # Mean exists only when concentration > p + 1
+        p = self.scale_matrix.shape[-1]
+        return jnp.where(
+            self.concentration[..., None, None] > p + 1,
+            self.scale_matrix / (self.concentration[..., None, None] - p - 1),
+            jnp.full_like(self.scale_matrix, jnp.nan),
+        )
+
+    @lazy_property
+    def mode(self) -> ArrayLike:
+        p = self.scale_matrix.shape[-1]
+        return self.scale_matrix / (self.concentration[..., None, None] + p + 1)
+
+    @lazy_property
+    def variance(self) -> ArrayLike:
+        # Variance of entry (i,j) for nu > p + 3
+        # Var(X_ij) = (Psi_ij^2 + Psi_ii * Psi_jj) / ((nu - p - 1)^2 * (nu - p - 3))
+        p = self.scale_matrix.shape[-1]
+        nu = jnp.expand_dims(self.concentration, axis=(-1, -2))
+        psi = self.scale_matrix
+        denom = (nu - p - 1) ** 2 * (nu - p - 3)
+        psi_ii = jnp.diagonal(psi, axis1=-2, axis2=-1)[..., :, None]
+        psi_jj = jnp.diagonal(psi, axis1=-2, axis2=-1)[..., None, :]
+        var = (psi**2 + psi_ii * psi_jj) / denom
+        return jnp.where(nu > p + 3, var, jnp.full_like(var, jnp.nan))
+
+    @staticmethod
+    def infer_shapes(
+        concentration=(), scale_matrix=None, rate_matrix=None, scale_tril=None
+    ):
+        return InverseWishartCholesky.infer_shapes(
+            concentration, scale_matrix, rate_matrix, scale_tril
+        )
+
+
+class InverseWishartCholesky(Distribution):
+    r"""
+    Cholesky factor of an Inverse Wishart distribution for covariance matrices.
+
+    This distribution samples the Cholesky factor :math:`\mathbf{L}` such that
+    :math:`\mathbf{X} = \mathbf{L} \mathbf{L}^T \sim W^{-1}(\mathbf{\Psi}, \nu)`.
+
+    :param concentration: Degrees of freedom parameter (often denoted :math:`\nu`).
+        Must be greater than `p - 1` where `p` is the dimension of the scale matrix.
+    :param scale_matrix: Positive definite scale matrix :math:`\mathbf{\Psi}`, analogous
+        to the inverse rate of a :class:`Gamma` distribution.
+    :param rate_matrix: Inverse of the scale matrix, analogous to the rate of a
+        :class:`Gamma` distribution.
+    :param scale_tril: Cholesky decomposition of the scale matrix.
+
+    **References**
+
+    [1] https://en.wikipedia.org/wiki/Inverse-Wishart_distribution
+    """
+
+    arg_constraints = {
+        "concentration": constraints.dependent(is_discrete=False, event_dim=0),
+        "scale_matrix": constraints.positive_definite,
+        "rate_matrix": constraints.positive_definite,
+        "scale_tril": constraints.lower_cholesky,
+    }
+    support = constraints.lower_cholesky
+    reparametrized_params = [
+        "scale_matrix",
+        "rate_matrix",
+        "scale_tril",
+    ]
+
+    def __init__(
+        self,
+        concentration: ArrayLike,
+        scale_matrix: Optional[Array] = None,
+        rate_matrix: Optional[Array] = None,
+        scale_tril: Optional[Array] = None,
+        *,
+        validate_args: Optional[bool] = None,
+    ) -> None:
+        assert_one_of(
+            scale_matrix=scale_matrix,
+            rate_matrix=rate_matrix,
+            scale_tril=scale_tril,
+        )
+        concentration = jnp.asarray(concentration)[..., None, None]
+        if scale_matrix is not None:
+            concentration, self.scale_matrix = promote_shapes(
+                concentration, scale_matrix
+            )
+            self.scale_tril = jnp.linalg.cholesky(self.scale_matrix)
+        elif rate_matrix is not None:
+            concentration, self.rate_matrix = promote_shapes(concentration, rate_matrix)
+            self.scale_tril = cholesky_of_inverse(self.rate_matrix)
+        elif scale_tril is not None:
+            concentration, self.scale_tril = promote_shapes(
+                concentration, jnp.asarray(scale_tril)
+            )
+        batch_shape = lax.broadcast_shapes(
+            jnp.shape(concentration)[:-2], jnp.shape(self.scale_tril)[:-2]
+        )
+        event_shape = jnp.shape(self.scale_tril)[-2:]
+        self.concentration = concentration[..., 0, 0]
+        super().__init__(
+            batch_shape=batch_shape,
+            event_shape=event_shape,
+            validate_args=validate_args,
+        )
+
+    @validate_sample
+    def log_prob(self, value: ArrayLike) -> ArrayLike:
+        # L = value (Cholesky factor), X = L @ L^T ~ InverseWishart(Psi, nu)
+        # log p(X) = (nu/2) log|Psi| - (nu*p/2) log(2) - log Gamma_p(nu/2)
+        #            - ((nu+p+1)/2) log|X| - tr(Psi @ X^{-1}) / 2
+        # Trace trick: tr(Psi @ X^{-1}) = ||L^{-1} @ scale_tril||_F^2
+        x = solve_triangular(*jnp.broadcast_arrays(value, self.scale_tril), lower=True)
+        trace = jnp.square(x).sum(axis=(-1, -2))
+
+        p = value.shape[-1]
+        log_diag = jnp.log(jnp.diagonal(value, axis1=-2, axis2=-1))
+        return (
+            self.concentration * tri_logabsdet(self.scale_tril)  # (nu/2) log|Psi|
+            + p * (1 - self.concentration / 2) * jnp.log(2)  # normalization
+            - multigammaln(self.concentration / 2, p)
+            + jnp.sum(
+                (-self.concentration[..., None] - 1 - jnp.arange(p)) * log_diag,
+                axis=-1,
+            )
+            - trace / 2
+        )
+
+    @lazy_property
+    def scale_matrix(self):
+        return jnp.matmul(self.scale_tril, self.scale_tril.mT)
+
+    @lazy_property
+    def rate_matrix(self):
+        identity = jnp.broadcast_to(
+            jnp.eye(self.scale_tril.shape[-1]), self.scale_tril.shape
+        )
+        return cho_solve((self.scale_tril, True), identity)
+
+    def sample(
+        self, key: jax.dtypes.prng_key, sample_shape: tuple[int, ...] = ()
+    ) -> ArrayLike:
+        assert is_prng_key(key)
+        # Sample from standard InverseWishartCholesky using Bartlett decomposition
+        # Ref: https://nbviewer.org/gist/fehiepsi/5ef8e09e61604f10607380467eb82006#Precision-to-scale_tril
+        rng_diag, rng_offdiag = random.split(key)
+        latent = jnp.zeros(sample_shape + self.batch_shape + self.event_shape)
+        p = self.event_shape[-1]
+        # Inverse Wishart Bartlett: nu - p + 1, nu - p + 2, ..., nu - 1, nu
+        i = jnp.arange(p)
+        latent = latent.at[..., i, i].set(
+            jnp.sqrt(
+                random.chisquare(
+                    rng_diag,
+                    self.concentration[..., None] + i - p + 1,
+                    latent.shape[:-1],
+                )
+            )
+        )
+        i, j = jnp.tril_indices(p, -1)
+        latent = latent.at[..., i, j].set(
+            random.normal(rng_offdiag, latent.shape[:-2] + (i.size,))
+        )
+        # Get Cholesky of InverseWishart(I) by inverting latent
+        identity = jnp.broadcast_to(jnp.eye(p), latent.shape)
+        L_inv_std = solve_triangular(latent, identity, lower=True)
+
+        # Transform to InverseWishart(Psi): L = scale_tril @ L_inv_std
+        return jnp.matmul(self.scale_tril, L_inv_std)
+
+    @lazy_property
+    def mean(self) -> ArrayLike:
+        # Approximate: chol(E[X]) where E[X] = Psi / (nu - p - 1) for nu > p + 1
+        p = self.scale_tril.shape[-1]
+        mean_x = jnp.where(
+            self.concentration[..., None, None] > p + 1,
+            self.scale_matrix / (self.concentration[..., None, None] - p - 1),
+            jnp.full_like(self.scale_matrix, jnp.nan),
+        )
+        return jnp.linalg.cholesky(
+            jnp.where(jnp.isnan(mean_x), jnp.eye(p), mean_x)
+        ) * jnp.where(
+            self.concentration[..., None, None] > p + 1,
+            jnp.ones_like(mean_x),
+            jnp.full_like(mean_x, jnp.nan),
+        )
+
+    @lazy_property
+    def variance(self) -> ArrayLike:
+        # Variance of Cholesky factor is complex; return NaN for now
+        return jnp.full(self.batch_shape + self.event_shape, jnp.nan)
+
+    @staticmethod
+    def infer_shapes(
+        concentration: tuple[int, ...] = (),
+        scale_matrix: Optional[tuple[int, ...]] = None,
+        rate_matrix: Optional[tuple[int, ...]] = None,
+        scale_tril: Optional[tuple[int, ...]] = None,
+    ):
+        assert_one_of(
+            scale_matrix=scale_matrix,
+            rate_matrix=rate_matrix,
+            scale_tril=scale_tril,
+        )
+        for matrix in [scale_matrix, rate_matrix, scale_tril]:
+            if matrix is not None:
+                batch_shape = lax.broadcast_shapes(concentration, matrix[:-2])
+                event_shape = matrix[-2:]
+                return batch_shape, event_shape
+
+
 class Levy(Distribution):
     r"""Lévy distribution is a special case of Lévy alpha-stable distribution.
     Its probability density function is given by,
diff --git a/test/test_distributions.py b/test/test_distributions.py