Better parameterization of covariance matrices

han-ol · han-ol · commit 34c7f2a241ee · 2025-03-27T16:31:12.000+01:00
diff --git a/bayesflow/approximators/point_approximator.py b/bayesflow/approximators/point_approximator.py
@@ -5,7 +5,7 @@
 )
 
 from bayesflow.types import Tensor
-from bayesflow.utils import filter_kwargs, split_arrays, squeeze_inner_estimates_dict
+from bayesflow.utils import filter_kwargs, split_arrays, squeeze_inner_estimates_dict, logging
 from .continuous_approximator import ContinuousApproximator
 
 
@@ -119,6 +119,7 @@ def sample(
     def _prepare_conditions(self, conditions: dict[str, np.ndarray], **kwargs) -> dict[str, Tensor]:
         """Adapts and converts the conditions to tensors."""
         conditions = self.adapter(conditions, strict=False, stage="inference", **kwargs)
+        conditions.pop("inference_variables", None)
         return keras.tree.map_structure(keras.ops.convert_to_tensor, conditions)
 
     def _apply_inverse_adapter_to_estimates(
@@ -130,6 +131,12 @@ def _apply_inverse_adapter_to_estimates(
         for score_key, score_val in estimates.items():
             processed[score_key] = {}
             for head_key, estimate in score_val.items():
+                if head_key in self.inference_network.scores[score_key].not_transforming_like_vector:
+                    logging.warning(
+                        f"Estimate '{score_key}.{head_key}' is marked to not transform like a vector. "
+                        f"It was treated like a vector by the adapter. Handle '{head_key}' estimates with care."
+                    )
+
                 adapted = self.adapter(
                     {"inference_variables": estimate},
                     inverse=True,
diff --git a/bayesflow/links/__init__.py b/bayesflow/links/__init__.py
@@ -2,7 +2,7 @@
 
 from .ordered import Ordered
 from .ordered_quantiles import OrderedQuantiles
-from .positive_semi_definite import PositiveSemiDefinite
+from .positive_definite import PositiveDefinite
 
 from ..utils._docs import _add_imports_to_all
 
diff --git a/bayesflow/links/positive_definite.py b/bayesflow/links/positive_definite.py
@@ -0,0 +1,52 @@
+import keras
+
+# import numpy as np
+from keras.saving import register_keras_serializable as serializable
+
+from bayesflow.types import Tensor
+from bayesflow.utils import keras_kwargs, fill_triangular_matrix
+
+
+@serializable(package="bayesflow.links")
+class PositiveDefinite(keras.Layer):
+    """Activation function to link from flat elements of a lower triangular matrix to a positive definite matrix."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**keras_kwargs(kwargs))
+        self.built = True  # no weights to create, so mark the layer as built right away
+
+    def call(self, inputs: Tensor) -> Tensor:
+        """Turn flat entries of a triangular factor L into the matrix product of L with its transpose."""
+        # Build cholesky factor from inputs; positive_diag=True requests a softplus on the diagonal entries
+        L = fill_triangular_matrix(inputs, positive_diag=True)
+
+        # calculate positive definite matrix from cholesky factors
+        # (a triangular factor with strictly positive diagonal is invertible, hence L L^T is positive definite)
+        psd = keras.ops.matmul(
+            L,
+            keras.ops.moveaxis(L, -2, -1),  # L transposed
+        )
+        return psd
+
+    def compute_output_shape(self, input_shape):
+        """Returns the shape (..., n, n) for inputs with m = n * (n + 1) / 2 entries on the last axis."""
+        m = input_shape[-1]
+        # invert m = n * (n + 1) / 2 for n
+        n = int((0.25 + 2.0 * m) ** 0.5 - 0.5)
+        return input_shape[:-1] + (n, n)
+
+    def compute_input_shape(self, output_shape):
+        """
+        Returns the shape of parameterization of a cholesky factor triangular matrix.
+
+        There are m nonzero elements of a lower triangular nxn matrix with m = n * (n + 1) / 2.
+
+        Example
+        -------
+        >>> PositiveDefinite().compute_input_shape((None, 3, 3))
+        (None, 6)
+        """
+        n = output_shape[-1]
+        # n * (n + 1) is always even, so integer division is exact
+        m = n * (n + 1) // 2
+        return output_shape[:-2] + (m,)
diff --git a/bayesflow/links/positive_semi_definite.py b/bayesflow/links/positive_semi_definite.py
diff --git a/bayesflow/scores/multivariate_normal_score.py b/bayesflow/scores/multivariate_normal_score.py
@@ -4,7 +4,7 @@
 from keras.saving import register_keras_serializable as serializable
 
 from bayesflow.types import Shape, Tensor
-from bayesflow.links import PositiveSemiDefinite
+from bayesflow.links import PositiveDefinite
 from bayesflow.utils import logging
 
 from .parametric_distribution_score import ParametricDistributionScore
@@ -21,7 +21,11 @@ def __init__(self, dim: int = None, links: dict = None, **kwargs):
         super().__init__(links=links, **kwargs)
 
         self.dim = dim
-        self.links = links or {"covariance": PositiveSemiDefinite()}
+        self.links = links or {"covariance": PositiveDefinite()}
+
+        # mark head for covariance matrix as an exception for adapter transformations
+        self.not_transforming_like_vector = ["covariance"]
+
         self.config = {"dim": dim}
 
         logging.warning("MultivariateNormalScore is unstable.")
@@ -60,12 +64,12 @@ def log_prob(self, x: Tensor, mean: Tensor, covariance: Tensor) -> Tensor:
             A tensor containing the log probability densities for each sample in `x` under the
             given Gaussian distribution.
         """
-        diff = x[:, None, :] - mean
-        inv_covariance = keras.ops.inv(covariance)
+        diff = x - mean
+        precision = keras.ops.inv(covariance)
         log_det_covariance = keras.ops.slogdet(covariance)[1]  # Only take the log of the determinant part
 
         # Compute the quadratic term in the exponential of the multivariate Gaussian
-        quadratic_term = keras.ops.einsum("...i,...ij,...j->...", diff, inv_covariance, diff)
+        quadratic_term = keras.ops.einsum("...i,...ij,...j->...", diff, precision, diff)
 
         # Compute the log probability density
         log_prob = -0.5 * (self.dim * keras.ops.log(2 * math.pi) + log_det_covariance + quadratic_term)
diff --git a/bayesflow/scores/parametric_distribution_score.py b/bayesflow/scores/parametric_distribution_score.py
@@ -51,5 +51,4 @@ def score(self, estimates: dict[str, Tensor], targets: Tensor, weights: Tensor =
         """
         scores = -self.log_prob(x=targets, **estimates)
         score = self.aggregate(scores, weights)
-        # multipy to mitigate instability due to relatively high values of parametric score
-        return score * 0.01
+        return score
diff --git a/bayesflow/scores/scoring_rule.py b/bayesflow/scores/scoring_rule.py
@@ -29,6 +29,8 @@ def __init__(
         self.subnets_kwargs = subnets_kwargs or {}
         self.links = links or {}
 
+        self.not_transforming_like_vector = []
+
         self.config = {"subnets_kwargs": self.subnets_kwargs}
 
     def get_config(self):
@@ -95,14 +97,14 @@ def get_link(self, key: str) -> keras.Layer:
         else:
             return self.links[key]
 
-    def get_head(self, key: str, shape: Shape) -> keras.Sequential:
+    def get_head(self, key: str, output_shape: Shape) -> keras.Sequential:
         """For a specified head key and shape, request corresponding head network.
 
         Parameters
         ----------
         key : str
             Name of head for which to request a link.
-        shape: Shape
+        output_shape: Shape
             The necessary shape for the point estimators.
 
         Returns
@@ -111,10 +113,19 @@ def get_head(self, key: str, shape: Shape) -> keras.Sequential:
             Head network consisting of a learnable projection, a reshape and a link operation
             to parameterize estimates.
         """
-        subnet = self.get_subnet(key)
-        dense = keras.layers.Dense(units=math.prod(shape))
-        reshape = keras.layers.Reshape(target_shape=shape)
+        # initialize head components back to front
         link = self.get_link(key)
+
+        # link input shape can differ from output shape
+        if hasattr(link, "compute_input_shape"):
+            link_input_shape = link.compute_input_shape(output_shape)
+        else:
+            link_input_shape = output_shape
+
+        reshape = keras.layers.Reshape(target_shape=link_input_shape)
+        dense = keras.layers.Dense(units=math.prod(link_input_shape))
+        subnet = self.get_subnet(key)
+
         return keras.Sequential([subnet, dense, reshape, link])
 
     def score(self, estimates: dict[str, Tensor], targets: Tensor, weights: Tensor) -> Tensor:
diff --git a/bayesflow/utils/__init__.py b/bayesflow/utils/__init__.py
@@ -66,6 +66,7 @@
     tile_axis,
     tree_concatenate,
     tree_stack,
+    fill_triangular_matrix,
 )
 from .validators import check_lengths_same
 from .workflow_utils import find_inference_network, find_summary_network
diff --git a/bayesflow/utils/tensor_utils.py b/bayesflow/utils/tensor_utils.py
@@ -277,3 +277,80 @@ def stack(*items):
             return keras.ops.stack(items, axis=axis)
 
     return keras.tree.map_structure(stack, *structures)
+
+
+def fill_triangular_matrix(x: Tensor, upper: bool = False, positive_diag: bool = False):
+    """
+    Reshapes a batch of matrix entries into a triangular matrix (either upper or lower).
+
+    Note: If final axis has length 1, this simply reshapes to (batch_size, 1, 1) and optionally applies softplus.
+
+    Parameters
+    ----------
+    x : Tensor of shape (batch_size, m)
+        Batch of flattened nonzero matrix elements for triangular matrix.
+    upper : bool
+        Return upper triangular matrix if True, else lower triangular matrix. Default is False.
+    positive_diag : bool
+        Whether to apply a softplus operation to diagonal elements. Default is False.
+
+    Returns
+    -------
+    Tensor of shape (batch_size, n, n)
+        Batch of triangular matrices with m = n * (n + 1) / 2 unique nonzero elements.
+
+    Raises
+    ------
+    ValueError
+        If provided nonzero elements do not correspond to possible triangular matrix shape
+        (n,n) with n = sqrt( 1/4 + 2 * m) - 1/2 due to m = n * (n + 1) / 2.
+
+    Notes
+    -----
+    The entries are not placed row by row. Without softplus, each row [a, b, c] of x (m = 3, n = 2)
+    becomes [[c, 0], [b, a]] for upper=False and [[a, b], [0, c]] for upper=True.
+    """
+    batch_shape = x.shape[:-1]
+    m = x.shape[-1]
+
+    if m == 1:
+        # A single entry is a 1x1 matrix that consists of its diagonal only.
+        y = keras.ops.reshape(x, (-1, 1, 1))
+        if positive_diag:
+            y = keras.activations.softplus(y)
+        return y
+
+    # Calculate matrix shape
+    n = (0.25 + 2 * m) ** 0.5 - 0.5
+    if not np.isclose(np.floor(n), n):
+        raise ValueError(f"Input right-most shape ({m}) does not correspond to a triangular matrix.")
+    n = int(n)
+
+    # Trick: Create triangular matrix by concatenating with a flipped version of its tail, then reshape.
+    # The tail holds the entries after the first n, i.e. indices n, ..., m - 1 (note m - (n**2 - m) == n).
+    x_tail = keras.ops.take(x, indices=list(range(n, m)), axis=-1)
+    if upper:
+        pieces = [x, keras.ops.flip(x_tail, axis=-1)]
+        keep_triangle = keras.ops.triu
+        offdiag_k = 1
+    else:
+        pieces = [x_tail, keras.ops.flip(x, axis=-1)]
+        keep_triangle = keras.ops.tril  # TODO: fails with tensorflow
+        offdiag_k = -1
+
+    # Both layouts hold n * n values; the values landing in the unwanted triangle are zeroed out.
+    y = keras.ops.concatenate(pieces, axis=len(batch_shape))
+    y = keras.ops.reshape(y, (-1, n, n))
+    y = keep_triangle(y)
+
+    # Replace the diagonal by its softplus while leaving the off-diagonal entries untouched.
+    if positive_diag:
+        y_offdiag = keep_triangle(y, k=offdiag_k)  # strictly triangular part, diagonal excluded
+        y_diag = keras.ops.tril(
+            keras.ops.triu(  # carve out diagonal, by setting upper and lower offdiagonals to zero
+                keras.activations.softplus(y)
+            ),  # apply softplus to enforce positivity
+        )
+        y = y_diag + y_offdiag
+
+    return y
diff --git a/tests/test_links/conftest.py b/tests/test_links/conftest.py
@@ -15,7 +15,7 @@ def num_variables():
 
 @pytest.fixture()
 def generic_preactivation(batch_size):
-    return keras.ops.ones((batch_size, 4, 4))
+    return keras.ops.ones((batch_size, 6))
 
 
 @pytest.fixture()
@@ -33,18 +33,18 @@ def ordered_quantiles():
 
 
 @pytest.fixture()
-def positive_semi_definite():
-    from bayesflow.links import PositiveSemiDefinite
+def positive_definite():
+    from bayesflow.links import PositiveDefinite
 
-    return PositiveSemiDefinite()
+    return PositiveDefinite()
 
 
 @pytest.fixture()
 def linear():
     return keras.layers.Activation("linear")
 
 
-@pytest.fixture(params=["ordered", "ordered_quantiles", "positive_semi_definite", "linear"], scope="function")
+@pytest.fixture(params=["ordered", "ordered_quantiles", "positive_definite", "linear"], scope="function")
 def link(request):
     return request.getfixturevalue(request.param)
 
@@ -84,6 +84,6 @@ def unordered(batch_size, num_quantiles, num_variables):
     return keras.random.normal((batch_size, num_quantiles, num_variables))
 
 
-@pytest.fixture()
-def random_matrix_batch(batch_size, num_variables):
-    return keras.random.normal((batch_size, num_variables, num_variables))
+# @pytest.fixture()
+# def random_matrix_batch(batch_size, num_variables):
+#     return keras.random.normal((batch_size, num_variables, num_variables))
diff --git a/tests/test_links/test_links.py b/tests/test_links/test_links.py
@@ -3,13 +3,6 @@
 import pytest
 
 
-def test_link_output(link, generic_preactivation):
-    output_shape = link.compute_output_shape(generic_preactivation.shape)
-    output = link(generic_preactivation)
-
-    assert output_shape == output.shape
-
-
 def test_invalid_shape_for_ordered_quantiles(ordered_quantiles, batch_size, num_quantiles, num_variables):
     with pytest.raises(AssertionError) as excinfo:
         ordered_quantiles.build((batch_size, batch_size, num_quantiles, num_variables))
@@ -59,16 +52,17 @@ def test_quantile_ordering(quantiles, unordered):
     check_ordering(output, axis)
 
 
-def test_positive_semi_definite(random_matrix_batch):
-    from bayesflow.links import PositiveSemiDefinite
-
-    activation = PositiveSemiDefinite()
-
-    output = activation(random_matrix_batch)
+def test_positive_definite(positive_definite, batch_size, num_variables):
+    psd = positive_definite
+    input_shape = psd.compute_input_shape((batch_size, num_variables, num_variables))
+    print(input_shape)
+    random_preactivation = keras.random.normal(input_shape, seed=12)
+    output = psd(random_preactivation)
 
     output = keras.ops.convert_to_numpy(output)
     eigenvalues = np.linalg.eig(output).eigenvalues
 
     assert np.all(eigenvalues.real > 0) and np.all(np.isclose(eigenvalues.imag, 0)), (
-        f"output is not positive semi-definite: real={eigenvalues.real}, imag={eigenvalues.imag}"
+        f"output is not positive definite: min(real)={np.min(eigenvalues.real)}, "
+        f"max(abs(imag))={np.max(np.abs(eigenvalues.imag))}"
     )
diff --git a/tests/test_scores/test_scores.py b/tests/test_scores/test_scores.py

Original file line number	Diff line number	Diff line change
`@@ -66,6 +66,7 @@`
`66`	`66`	`tile_axis,`
`67`	`67`	`tree_concatenate,`
`68`	`68`	`tree_stack,`
	`69`	`+ fill_triangular_matrix,`
`69`	`70`	`)`
`70`	`71`	`from .validators import check_lengths_same`
`71`	`72`	`from .workflow_utils import find_inference_network, find_summary_network`