
Commit dd24941

Merge branch 'standardize_in_approx' of https://github.com/bayesflow-org/bayesflow into standardize_in_approx

2 parents: 5773d28 + caf0491
File tree

4 files changed: +130 -43 lines


bayesflow/networks/standardization/standardization.py

Lines changed: 61 additions & 32 deletions
@@ -3,51 +3,49 @@
 import keras
 
 from bayesflow.types import Tensor, Shape
-from bayesflow.utils.serialization import serialize, deserialize, serializable
+from bayesflow.utils.serialization import serializable
 from bayesflow.utils import expand_left_as, layer_kwargs
 from bayesflow.utils.tree import flatten_shape
 
 
 @serializable("bayesflow.networks")
 class Standardization(keras.Layer):
-    def __init__(self, momentum: float = 0.95, epsilon: float = 1e-6, **kwargs):
+    def __init__(self, **kwargs):
         """
         Initializes a Standardization layer that will keep track of the running mean and
         running standard deviation across a batch of potentially nested tensors.
 
+        The layer computes and stores running estimates of the mean and variance using a numerically
+        stable online algorithm, allowing for consistent normalization during both training and inference,
+        regardless of batch composition.
+
         Parameters
         ----------
-        momentum : float, optional
-            Momentum for the exponential moving average used to update the mean and
-            standard deviation during training. Must be between 0 and 1.
-            Default is 0.95.
-        epsilon: float, optional
-            Stability parameter to avoid division by zero.
+        **kwargs
+            Additional keyword arguments passed to the base Keras Layer.
+
+        Notes
+        -----
         """
         super().__init__(**layer_kwargs(kwargs))
 
-        self.momentum = momentum
-        self.epsilon = epsilon
         self.moving_mean = None
-        self.moving_std = None
+        self.moving_m2 = None
+        self.count = None
+
+    def moving_std(self, index: int) -> Tensor:
+        return keras.ops.sqrt(self.moving_m2[index] / self.count)
 
     def build(self, input_shape: Shape):
         flattened_shapes = flatten_shape(input_shape)
+
         self.moving_mean = [
             self.add_weight(shape=(shape[-1],), initializer="zeros", trainable=False) for shape in flattened_shapes
         ]
-        self.moving_std = [
-            self.add_weight(shape=(shape[-1],), initializer="ones", trainable=False) for shape in flattened_shapes
+        self.moving_m2 = [
+            self.add_weight(shape=(shape[-1],), initializer="zeros", trainable=False) for shape in flattened_shapes
         ]
-
-    def get_config(self) -> dict:
-        base_config = super().get_config()
-        config = {"momentum": self.momentum, "epsilon": self.epsilon}
-        return base_config | serialize(config)
-
-    @classmethod
-    def from_config(cls, config, custom_objects=None):
-        return cls(**deserialize(config, custom_objects=custom_objects))
+        self.count = self.add_weight(shape=(), initializer="zeros", trainable=False)
 
     def call(
         self,
@@ -80,23 +78,25 @@ def call(
         flattened = keras.tree.flatten(x)
         outputs, log_det_jacs = [], []
 
-        for i, val in enumerate(flattened):
+        for idx, val in enumerate(flattened):
            if stage == "training":
-                self._update_moments(val, i)
+                self._update_moments(val, idx)
 
-            mean = expand_left_as(self.moving_mean[i], val)
-            std = expand_left_as(self.moving_std[i], val)
+            mean = expand_left_as(self.moving_mean[idx], val)
+            std = expand_left_as(self.moving_std(idx), val)
 
            if forward:
                out = (val - mean) / std
+                # if the std is zero, out will become nan. As val - mean(val) = 0 if std(val) = 0,
+                # we can just replace them with zeros.
+                out = keras.ops.nan_to_num(out, nan=0.0)
            else:
                out = mean + std * val
 
            outputs.append(out)
 
            if log_det_jac:
                ldj = keras.ops.sum(keras.ops.log(keras.ops.abs(std)), axis=-1)
-                # For convenience, tile to batch shape of val
                ldj = keras.ops.tile(ldj, keras.ops.shape(val)[:-1])
                log_det_jacs.append(-ldj if forward else ldj)
 
@@ -108,9 +108,38 @@ def call(
        return outputs
 
    def _update_moments(self, x: Tensor, index: int):
-        mean = keras.ops.mean(x, axis=tuple(range(keras.ops.ndim(x) - 1)))
-        std = keras.ops.std(x, axis=tuple(range(keras.ops.ndim(x) - 1)))
-        std = keras.ops.maximum(std, self.epsilon)
+        """
+        Incrementally updates the running mean and variance (M2) per feature using a numerically
+        stable online algorithm.
+
+        Parameters
+        ----------
+        x : Tensor
+            Input tensor of shape (..., features), where all axes except the last are treated as batch/sample axes.
+            The method computes batch-wise statistics by aggregating over all non-feature axes and updates the
+            running totals (mean, M2, and sample count) accordingly.
+        index : int
+            The index of the corresponding running statistics to be updated.
+        """
+
+        reduce_axes = tuple(range(x.ndim - 1))
+        batch_count = keras.ops.cast(keras.ops.shape(x)[0], self.count.dtype)
+
+        # Compute batch mean and M2 per feature
+        batch_mean = keras.ops.mean(x, axis=reduce_axes)
+        batch_m2 = keras.ops.sum((x - expand_left_as(batch_mean, x)) ** 2, axis=reduce_axes)
+
+        # Read current totals
+        mean = self.moving_mean[index]
+        m2 = self.moving_m2[index]
+        count = self.count
+
+        total_count = count + batch_count
+        delta = batch_mean - mean
+
+        new_mean = mean + delta * (batch_count / total_count)
+        new_m2 = m2 + batch_m2 + (delta**2) * (count * batch_count / total_count)
 
-        self.moving_mean[index].assign(self.momentum * self.moving_mean[index] + (1.0 - self.momentum) * mean)
-        self.moving_std[index].assign(self.momentum * self.moving_std[index] + (1.0 - self.momentum) * std)
+        self.moving_mean[index].assign(new_mean)
+        self.moving_m2[index].assign(new_m2)
+        self.count.assign(total_count)
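
The update above is the classic parallel combination of (count, mean, M2) statistics: merging the stored summary with a fresh batch is exact, so streaming over batches reproduces the pooled moments. Below is a minimal standalone NumPy sketch of the same update, written under the assumption of 2-D inputs of shape (batch, features); the helper name merge_moments is illustrative and not part of bayesflow.

import numpy as np


def merge_moments(count, mean, m2, batch):
    # Merge running (count, mean, M2) with a new batch of shape (n, features).
    # M2 accumulates summed squared deviations, so variance = M2 / count.
    batch_count = batch.shape[0]
    batch_mean = batch.mean(axis=0)
    batch_m2 = ((batch - batch_mean) ** 2).sum(axis=0)

    total_count = count + batch_count
    delta = batch_mean - mean

    new_mean = mean + delta * (batch_count / total_count)
    new_m2 = m2 + batch_m2 + delta**2 * (count * batch_count / total_count)
    return total_count, new_mean, new_m2


# Streaming over two batches reproduces the pooled statistics exactly.
rng = np.random.default_rng(0)
x1, x2 = rng.normal(size=(8, 4)), rng.normal(size=(16, 4))

count, mean, m2 = 0, np.zeros(4), np.zeros(4)
for batch in (x1, x2):
    count, mean, m2 = merge_moments(count, mean, m2, batch)

pooled = np.concatenate([x1, x2], axis=0)
assert np.allclose(mean, pooled.mean(axis=0))
assert np.allclose(np.sqrt(m2 / count), pooled.std(axis=0))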

bayesflow/scores/multivariate_normal_score.py

Lines changed: 11 additions & 0 deletions
@@ -26,6 +26,17 @@ class MultivariateNormalScore(ParametricDistributionScore):
     For more information see :py:class:`ScoringRule`.
     """
 
+    RANK: dict[str, int] = {"covariance": 2}
+    """
+    The covariance matrix is a rank-2 tensor, and as such the inverse of the standardization operation is
+
+        x = x' * sigma ^ 2
+
+    Accordingly, covariance is also included in :py:attr:`NO_SHIFT`.
+    """
+
+    NO_SHIFT: tuple[str] = ("covariance",)
+
     def __init__(self, dim: int = None, links: dict = None, **kwargs):
         super().__init__(links=links, **kwargs)
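
As a sanity check on the rank-2 convention: standardizing data divides covariance entry (i, j) by sigma_i * sigma_j, so undoing standardization multiplies by sigma_i * sigma_j (sigma squared on the diagonal) and applies no mean shift. A small NumPy sketch under that assumption follows; the helper name is illustrative and not the library API.

import numpy as np


def inverse_standardize_covariance(cov_std, sigma):
    # Covariance estimated in standardized space, mapped back to the data scale:
    # entry (i, j) is scaled by sigma_i * sigma_j, with no mean shift.
    return cov_std * np.outer(sigma, sigma)


rng = np.random.default_rng(1)
x = rng.normal(size=(1000, 3)) * np.array([1.0, 5.0, 0.1])
mean, sigma = x.mean(axis=0), x.std(axis=0)

cov_in_standardized_space = np.cov((x - mean) / sigma, rowvar=False)
recovered = inverse_standardize_covariance(cov_in_standardized_space, sigma)
assert np.allclose(recovered, np.cov(x, rowvar=False))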

bayesflow/scores/scoring_rule.py

Lines changed: 29 additions & 3 deletions
@@ -26,10 +26,9 @@ class ScoringRule:
     and covariance simultaneously.
     """
 
-    NOT_TRANSFORMING_LIKE_VECTOR_WARNING = tuple()
+    NOT_TRANSFORMING_LIKE_VECTOR_WARNING: tuple[str] = tuple()
     """
-    This variable contains names of prediction heads that should lead to a warning when the adapter is applied
-    in inverse direction to them.
+    Names of prediction heads for which to warn if the adapter is called on their estimates in inverse direction.
 
     Prediction heads can output estimates in spaces other than the target distribution space.
     To such estimates the adapter cannot be straightforwardly applied in inverse direction,
@@ -38,6 +37,33 @@ class ScoringRule:
     with a type of estimate whenever the adapter is applied to them in inverse direction.
     """
 
+    RANK: dict[str, int] = {}
+    """
+    Mapping of prediction head names to their tensor rank for inverse standardization.
+
+    The rank indicates the power to which the standard deviation is raised before being multiplied to some estimate
+    in standardized space.
+
+        x = x' * sigma ^ rank [ + mean ]
+
+    If a head is not present in this mapping, a default rank of 1 is assumed.
+
+    Typically, if :py:attr:`RANK` is modified for an estimate, it is also included in :py:attr:`NO_SHIFT`.
+    """
+
+    NO_SHIFT: tuple[str] = tuple()
+    """
+    Names of prediction heads whose estimates should not be shifted when applying inverse standardization.
+
+    During inverse standardization, point estimates are typically shifted by the stored mean vector. Any head
+    listed in this tuple will skip the shift step and only be scaled. By default, this tuple is empty,
+    meaning all heads will be shifted to undo standardization.
+
+        x = x' * sigma ^ rank + mean
+
+    See also :py:attr:`RANK`.
+    """
+
     def __init__(
         self,
         subnets: dict[str, str | type] = None,
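
Taken together, RANK and NO_SHIFT describe how inverse standardization treats each prediction head: scale by sigma raised to the head's rank, then shift by the mean unless the head opts out. One possible way to realize this dispatch is sketched below in NumPy, applying sigma once along each of the last rank axes; the helper is hypothetical and not the bayesflow implementation.

import numpy as np


def inverse_standardize(estimates, mean, std, rank=None, no_shift=()):
    # estimates: dict of head name -> array in standardized space
    # mean, std: per-feature statistics of the original data, shape (d,)
    # rank:      head name -> power of sigma (default 1), see RANK
    # no_shift:  heads that are only scaled, never shifted, see NO_SHIFT
    rank = rank or {}
    out = {}
    for head, value in estimates.items():
        r = rank.get(head, 1)
        scaled = value
        # Apply sigma once per feature axis (the last r axes), so a rank-2 head
        # such as a covariance matrix is scaled by sigma_i * sigma_j overall.
        for axis in range(-r, 0):
            shape = [1] * value.ndim
            shape[axis] = std.shape[0]
            scaled = scaled * std.reshape(shape)
        out[head] = scaled if head in no_shift else scaled + mean
    return out


# Usage with the MultivariateNormalScore conventions: the mean head is shifted
# and scaled once, the covariance head is scaled twice and not shifted.
mean, std = np.array([2.0, -1.0]), np.array([3.0, 0.5])
estimates = {"mean": np.zeros(2), "covariance": np.eye(2)}
restored = inverse_standardize(estimates, mean, std, rank={"covariance": 2}, no_shift=("covariance",))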

tests/test_networks/test_standardization.py

Lines changed: 29 additions & 8 deletions
@@ -10,21 +10,40 @@
 def test_forward_standardization_training():
     random_input = keras.random.normal((8, 4))
 
-    layer = Standardization(momentum=0.0)  # no EMA for test stability
+    layer = Standardization()
     layer.build(random_input.shape)
 
     out = layer(random_input, stage="training")
 
     moving_mean = keras.ops.convert_to_numpy(layer.moving_mean[0])
-    moving_std = keras.ops.convert_to_numpy(layer.moving_std[0])
     random_input = keras.ops.convert_to_numpy(random_input)
     out = keras.ops.convert_to_numpy(out)
 
     np.testing.assert_allclose(moving_mean, np.mean(random_input, axis=0), atol=1e-5)
-    np.testing.assert_allclose(moving_std, np.std(random_input, axis=0), atol=1e-5)
 
     assert out.shape == random_input.shape
     assert not np.any(np.isnan(out))
+    np.testing.assert_allclose(np.std(out, axis=0), 1.0, atol=1e-5)
+
+
+def test_forward_standardization_training_constant_batch():
+    constant_input = keras.ops.ones((8, 4))
+
+    layer = Standardization()
+    layer.build(constant_input.shape)
+
+    out = layer(constant_input, stage="training")
+
+    moving_mean = keras.ops.convert_to_numpy(layer.moving_mean[0])
+    constant_input = keras.ops.convert_to_numpy(constant_input)
+    out = keras.ops.convert_to_numpy(out)
+
+    np.testing.assert_allclose(moving_mean, np.mean(constant_input, axis=0), atol=1e-5)
+
+    assert out.shape == constant_input.shape
+    assert not np.any(np.isnan(out))
+    np.testing.assert_allclose(out, 0.0, atol=1e-5)
+    np.testing.assert_allclose(np.std(out, axis=0), 0.0, atol=1e-5)
 
 
 def test_inverse_standardization_ldj():
@@ -42,9 +61,10 @@ def test_inverse_standardization_ldj():
 
 def test_consistency_forward_inverse():
     random_input = keras.random.normal((4, 20, 5))
-    layer = Standardization(momentum=0.0)
-    layer.build((5,))
-    standardized = layer(random_input, stage="training", forward=True)
+    layer = Standardization()
+    _ = layer(random_input, stage="training", forward=True)
+
+    standardized = layer(random_input, stage="inference", forward=True)
     recovered = layer(standardized, stage="inference", forward=False)
 
     random_input = keras.ops.convert_to_numpy(random_input)
@@ -58,9 +78,10 @@ def test_nested_consistency_forward_inverse():
     random_input_b = keras.random.normal((4, 3))
     random_input = {"a": random_input_a, "b": random_input_b}
 
-    layer = Standardization(momentum=0.0)
+    layer = Standardization()
 
-    standardized = layer(random_input, stage="training", forward=True)
+    _ = layer(random_input, stage="training", forward=True)
+    standardized = layer(random_input, stage="inference", forward=True)
     recovered = layer(standardized, stage="inference", forward=False)
 
     random_input = keras.tree.map_structure(keras.ops.convert_to_numpy, random_input)
