
Commit bb67e90

breaking: parameterize MVNormalScore by inverse cholesky factor
The log_prob can be computed entirely from the inverse Cholesky factor L^{-1}. Using it directly also stabilizes the initial loss and speeds up computation. This commit additionally contains two optimizations: the precision matrix is folded into the einsum instead of being materialized, and the log determinant uses a sum of logs instead of the log of a product. Open question: is the transformation behavior "left_side_scale" still correct for the inverse matrix? As the parameterization changes, this is a breaking change. Since it resolves major stability problems for higher-dimensional problems, I think it is worth including anyway.
1 parent 47d2766 commit bb67e90
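
The two optimizations lean on standard identities for triangular factors. Below is a minimal NumPy sketch (an illustration added here, not part of the commit) that checks both; L_inv stands in for the cov_chol_inv head output, and all names are placeholders:

# Sketch of the identities behind the new log_prob (NumPy, hypothetical setup).
import numpy as np

rng = np.random.default_rng(0)
dim = 4

# Random SPD covariance and its lower Cholesky factor L, so cov = L @ L.T.
A = rng.normal(size=(dim, dim))
cov = A @ A.T + dim * np.eye(dim)
L = np.linalg.cholesky(cov)
L_inv = np.linalg.inv(L)
diff = rng.normal(size=dim)

# Sum of logs instead of log of a product:
# log det(cov) = 2 * sum(log diag(L)) = -2 * sum(log diag(L_inv)),
# since the inverse of a triangular matrix has diagonal 1 / diag(L).
assert np.allclose(np.log(np.linalg.det(cov)), -2 * np.sum(np.log(np.diag(L_inv))))

# Precision folded into the einsum: the precision matrix L_inv.T @ L_inv
# is never materialized; the quadratic term is contracted in one pass.
quad_explicit = diff @ (L_inv.T @ L_inv) @ diff
quad_fused = np.einsum("i,ji,jk,k->", diff, L_inv, L_inv, diff)
assert np.allclose(quad_explicit, quad_fused)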

File tree: 1 file changed (+15, -18 lines)


bayesflow/scores/multivariate_normal_score.py

Lines changed: 15 additions & 18 deletions
@@ -17,7 +17,7 @@ class MultivariateNormalScore(ParametricDistributionScore):
     of the materialized value.
     """

-    NOT_TRANSFORMING_LIKE_VECTOR_WARNING = ("cov_chol",)
+    NOT_TRANSFORMING_LIKE_VECTOR_WARNING = ("cov_chol_inv",)
     """
     Marks head for covariance matrix Cholesky factor as an exception for adapter transformations.

@@ -27,7 +27,7 @@ class MultivariateNormalScore(ParametricDistributionScore):
     For more information see :py:class:`ScoringRule`.
     """

-    TRANSFORMATION_TYPE: dict[str, str] = {"cov_chol": "left_side_scale"}
+    TRANSFORMATION_TYPE: dict[str, str] = {"cov_chol_inv": "left_side_scale"}
     """
     Marks covariance Cholesky factor head to handle de-standardization as for covariant rank-(0,2) tensors.

@@ -42,7 +42,7 @@ def __init__(self, dim: int = None, links: dict = None, **kwargs):
         super().__init__(links=links, **kwargs)

         self.dim = dim
-        self.links = links or {"cov_chol": CholeskyFactor()}
+        self.links = links or {"cov_chol_inv": CholeskyFactor()}

         self.config = {"dim": dim}

@@ -52,9 +52,9 @@ def get_config(self):

     def get_head_shapes_from_target_shape(self, target_shape: Shape) -> dict[str, Shape]:
         self.dim = target_shape[-1]
-        return dict(mean=(self.dim,), cov_chol=(self.dim, self.dim))
+        return dict(mean=(self.dim,), cov_chol_inv=(self.dim, self.dim))

-    def log_prob(self, x: Tensor, mean: Tensor, cov_chol: Tensor) -> Tensor:
+    def log_prob(self, x: Tensor, mean: Tensor, cov_chol_inv: Tensor) -> Tensor:
         """
         Compute the log probability density of a multivariate Gaussian distribution.

@@ -82,25 +82,21 @@ def log_prob(self, x: Tensor, mean: Tensor, cov_chol: Tensor) -> Tensor:
         """
         diff = x - mean

-        # Calculate precision from Cholesky factors of covariance matrix
-        cov_chol_inv = keras.ops.inv(cov_chol)
-        precision = keras.ops.matmul(
-            keras.ops.swapaxes(cov_chol_inv, -2, -1),
-            cov_chol_inv,
-        )
-
         # Compute log determinant, exploiting Cholesky factors
-        log_det_covariance = keras.ops.log(keras.ops.prod(keras.ops.diagonal(cov_chol, axis1=1, axis2=2), axis=1)) * 2
+        log_det_covariance = -2 * keras.ops.sum(
+            keras.ops.log(keras.ops.diagonal(cov_chol_inv, axis1=1, axis2=2)), axis=1
+        )

-        # Compute the quadratic term in the exponential of the multivariate Gaussian
-        quadratic_term = keras.ops.einsum("...i,...ij,...j->...", diff, precision, diff)
+        # Compute the quadratic term in the exponential of the multivariate Gaussian from Cholesky factors
+        # diff^T * cov_chol_inv^T * cov_chol_inv * diff
+        quadratic_term = keras.ops.einsum("...i,...ji,...jk,...k->...", diff, cov_chol_inv, cov_chol_inv, diff)

         # Compute the log probability density
         log_prob = -0.5 * (self.dim * keras.ops.log(2 * math.pi) + log_det_covariance + quadratic_term)

         return log_prob

-    def sample(self, batch_shape: Shape, mean: Tensor, cov_chol: Tensor) -> Tensor:
+    def sample(self, batch_shape: Shape, mean: Tensor, cov_chol_inv: Tensor) -> Tensor:
         """
         Generate samples from a multivariate Gaussian distribution.

@@ -123,17 +119,18 @@ def sample(self, batch_shape: Shape, mean: Tensor, cov_chol: Tensor) -> Tensor:
         Tensor
             A tensor of shape (batch_size, num_samples, D) containing the generated samples.
         """
+        cov_chol = keras.ops.inv(cov_chol_inv)
         if len(batch_shape) == 1:
             batch_shape = (1,) + tuple(batch_shape)
         batch_size, num_samples = batch_shape
         dim = keras.ops.shape(mean)[-1]
         if keras.ops.shape(mean) != (batch_size, dim):
             raise ValueError(f"mean must have shape (batch_size, {dim}), but got {keras.ops.shape(mean)}")

-        if keras.ops.shape(cov_chol) != (batch_size, dim, dim):
+        if keras.ops.shape(cov_chol_inv) != (batch_size, dim, dim):
             raise ValueError(
                 f"covariance Cholesky factor must have shape (batch_size, {dim}, {dim}),"
-                f"but got {keras.ops.shape(cov_chol)}"
+                f"but got {keras.ops.shape(cov_chol_inv)}"
             )

         # Use Cholesky decomposition to generate samples
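
On the open question above: the following toy check (again an illustration, not part of the commit, assuming de-standardization acts as a per-dimension scaling x -> S @ x with a diagonal, positive S) shows how the two factors transform, which is what any choice of transformation type has to match:

# Sketch: how the Cholesky factor and its inverse transform under x -> S @ x.
import numpy as np

rng = np.random.default_rng(1)
dim = 3
A = rng.normal(size=(dim, dim))
cov = A @ A.T + dim * np.eye(dim)
L = np.linalg.cholesky(cov)
S = np.diag(rng.uniform(0.5, 2.0, size=dim))  # positive per-dimension scales

# cov -> S @ cov @ S.T; since S @ L is lower triangular with a positive
# diagonal, uniqueness of the Cholesky factorization gives L -> S @ L,
# i.e. the covariance factor scales from the left.
assert np.allclose(np.linalg.cholesky(S @ cov @ S.T), S @ L)

# The inverse factor instead picks up inv(S) on the right:
# inv(S @ L) = inv(L) @ inv(S).
assert np.allclose(np.linalg.inv(S @ L), np.linalg.inv(L) @ np.linalg.inv(S))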
