+from math import pi
+
 import keras
 from keras import ops
 
-import numpy as np
-
 from bayesflow.networks import MLP
 from bayesflow.types import Tensor
 from bayesflow.utils import (
+    logging,
     jvp,
     concatenate_valid,
     find_network,
     expand_right_as,
     expand_right_to,
-    model_kwargs,
+    layer_kwargs,
 )
 from bayesflow.utils.serialization import deserialize, serializable, serialize
 
-
 from bayesflow.networks import InferenceNetwork
 from bayesflow.networks.embeddings import FourierEmbedding
 
 
 # disable module check, use potential module after moving from experimental
 @serializable("bayesflow.networks", disable_module_check=True)
-class ContinuousTimeConsistencyModel(InferenceNetwork):
-    """(IN) Implements an sCM (simple, stable, and scalable Consistency Model)
-    with continous-time Consistency Training (CT) as described in [1].
-    The sampling procedure is taken from [2].
+class StableConsistencyModel(InferenceNetwork):
+    """(IN) Implements an sCM (simple, stable, and scalable Consistency Model) with continuous-time Consistency Training
+    (CT) as described in [1]. The sampling procedure is taken from [2].
 
     [1] Lu, C., & Song, Y. (2024).
     Simplifying, Stabilizing and Scaling Continuous-Time Consistency Models
     arXiv preprint arXiv:2410.11081
 
     [2] Song, Y., Dhariwal, P., Chen, M. & Sutskever, I. (2023).
-    Consistency Models.
-    arXiv preprint arXiv:2303.01469
+    Consistency Models. arXiv preprint arXiv:2303.01469
     """
 
+    MLP_DEFAULT_CONFIG = {
+        "widths": (256, 256, 256, 256, 256),
+        "activation": "mish",
+        "kernel_initializer": "he_normal",
+        "residual": True,
+        "dropout": 0.05,
+        "spectral_normalization": False,
+    }
+
+    WEIGHT_MLP_DEFAULT_CONFIG = {
+        "widths": (256,),
+        "activation": "mish",
+        "kernel_initializer": "he_normal",
+        "residual": False,
+        "dropout": 0.05,
+        "spectral_normalization": False,
+    }
+
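+    # used in _discretize_time: warn if the smallest non-zero time step exceeds this threshold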
+    EPS_WARN = 0.1
+
     def __init__(
         self,
-        subnet: str | keras.Layer = "mlp",
-        sigma_data: float = 1.0,
+        subnet: str | type | keras.Layer = "mlp",
+        sigma: float = 1.0,
         subnet_kwargs: dict[str, any] = None,
+        weight_mlp_kwargs: dict[str, any] = None,
         embedding_kwargs: dict[str, any] = None,
         **kwargs,
     ):
4766 """Creates an instance of an sCM to be used for consistency training (CT).
4867
4968 Parameters
5069 ----------
51- subnet : str or type, optional, default: "mlp"
52- A neural network type for the consistency model, will be
53- instantiated using subnet_kwargs.
54- sigma_data : float, optional, default: 1.0
55- Standard deviation of the target distribution
70+ subnet : str, type, or keras.Layer, optional, default="mlp"
71+ The neural network architecture used for the consistency model.
72+ If a string is provided, it should be a registered name (e.g., "mlp").
73+ If a type or keras.Layer is provided, it will be directly instantiated
74+ with the given ``subnet_kwargs``.
75+ sigma : float, optional, default=1.0
76+ Standard deviation of the target distribution for the consistency loss.
77+ Controls the scale of the noise injected during training.
78+ subnet_kwargs : dict[str, any], optional, default=None
79+ Keyword arguments passed to the constructor of the chosen ``subnet``. For example, number of hidden units,
80+ activation functions, or dropout settings.
81+ weight_mlp_kwargs : dict[str, any], optional, default=None
82+ Keyword arguments for an auxiliary MLP used to generate weights within the consistency model. Typically
83+ includes depth, hidden sizes, and non-linearity choices.
84+ embedding_kwargs : dict[str, any], optional, default=None
85+ Keyword arguments for the time embedding layer(s) used in the model
5686 **kwargs
57- Additional keyword arguments to the layer.
87+ Additional keyword arguments passed to the parent ``InferenceNetwork`` initializer
88+ (e.g., ``name``, ``dtype``, or ``trainable``).
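+
+        Examples
+        --------
+        A minimal construction sketch; the widths and dropout values below are
+        illustrative, not recommended settings:
+
+        >>> scm = StableConsistencyModel(
+        ...     subnet="mlp",
+        ...     sigma=1.0,
+        ...     subnet_kwargs={"widths": (128, 128), "dropout": 0.0},
+        ... )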
5889 """
5990 super ().__init__ (base_distribution = "normal" , ** kwargs )
6091
6192 subnet_kwargs = subnet_kwargs or {}
62-
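+        # for the default MLP, merge user-supplied kwargs over the defaults (user values take precedence)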
+        if subnet == "mlp":
+            subnet_kwargs = StableConsistencyModel.MLP_DEFAULT_CONFIG | subnet_kwargs
         self.subnet = find_network(subnet, **subnet_kwargs)
+
         self.subnet_projector = keras.layers.Dense(
             units=None, bias_initializer="zeros", kernel_initializer="zeros", name="subnet_projector"
         )
 
-        self.weight_fn = MLP([256], dropout=0.0)
+        weight_mlp_kwargs = weight_mlp_kwargs or {}
+        weight_mlp_kwargs = StableConsistencyModel.WEIGHT_MLP_DEFAULT_CONFIG | weight_mlp_kwargs
+        self.weight_fn = MLP(**weight_mlp_kwargs)
+
         self.weight_fn_projector = keras.layers.Dense(
             units=1, bias_initializer="zeros", kernel_initializer="zeros", name="weight_fn_projector"
         )
@@ -74,8 +110,7 @@ def __init__(
         self.time_emb = FourierEmbedding(**embedding_kwargs)
         self.time_emb_dim = self.time_emb.embed_dim
 
-        self.sigma_data = sigma_data
-
+        self.sigma = sigma
         self.seed_generator = keras.random.SeedGenerator()
 
     @classmethod
@@ -84,29 +119,33 @@ def from_config(cls, config, custom_objects=None):
 
     def get_config(self):
         base_config = super().get_config()
-        base_config = model_kwargs(base_config)
+        base_config = layer_kwargs(base_config)
 
         config = {
             "subnet": self.subnet,
-            "sigma_data": self.sigma_data,
+            "sigma": self.sigma,
         }
 
         return base_config | serialize(config)
 
     def _discretize_time(self, num_steps: int, rho: float = 3.5, **kwargs):
-        t = np.linspace(0.0, np.pi / 2, num_steps)
-        times = np.exp((t - np.pi / 2) * rho) * np.pi / 2
-        times[0] = 0.0
+        t = keras.ops.linspace(0.0, pi / 2, num_steps)
+        times = keras.ops.exp((t - pi / 2) * rho) * pi / 2
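+        # e.g. num_steps=5, rho=3.5 yields roughly [0.006, 0.025, 0.100, 0.397, 1.571];
+        # the first entry is then set to exactly zero below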
+        times = keras.ops.concatenate([keras.ops.zeros((1,), dtype=keras.ops.dtype(times)), times[1:]], axis=0)
 
         # if rho is set too low, bad schedules can occur
-        EPS_WARN = 0.1
-        if times[1] > EPS_WARN:
-            print("Warning: The last time step is large.")
-            print(f"Increasing rho (was {rho}) or n_steps (was {num_steps}) might improve results.")
-        return ops.convert_to_tensor(times)
+        if times[1] > StableConsistencyModel.EPS_WARN:
+            logging.warning("The last time step is large.")
+            logging.warning(f"Increasing rho (was {rho}) or num_steps (was {num_steps}) might improve results.")
+        return times
 
     def build(self, xz_shape, conditions_shape=None):
-        super().build(xz_shape)
+        if self.built:
+            # building when the network is already built can cause issues with serialization
+            # see https://github.com/keras-team/keras/issues/21147
+            return
+
+        self.base_distribution.build(xz_shape)
         self.subnet_projector.units = xz_shape[-1]
 
         # construct input shape for subnet and subnet projector
@@ -134,17 +173,6 @@ def build(self, xz_shape, conditions_shape=None):
         input_shape = self.weight_fn.compute_output_shape(input_shape)
         self.weight_fn_projector.build(input_shape)
 
-    def call(
-        self,
-        xz: Tensor,
-        conditions: Tensor = None,
-        inverse: bool = False,
-        **kwargs,
-    ):
-        if inverse:
-            return self._inverse(xz, conditions=conditions, **kwargs)
-        return self._forward(xz, conditions=conditions, **kwargs)
-
     def _forward(self, x: Tensor, conditions: Tensor = None, **kwargs) -> Tensor:
         # Consistency Models only learn the direction from noise distribution
         # to target distribution, so we cannot implement this function.
@@ -172,8 +200,8 @@ def _inverse(self, z: Tensor, conditions: Tensor = None, **kwargs) -> Tensor:
         steps = kwargs.get("steps", 15)
         rho = kwargs.get("rho", 3.5)
 
-        # noise distribution has variance sigma_data
-        x = keras.ops.copy(z) * self.sigma_data
+        # noise distribution has standard deviation sigma
+        x = keras.ops.copy(z) * self.sigma
         discretized_time = keras.ops.flip(self._discretize_time(steps, rho=rho), axis=-1)
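+        # start from the largest time in the flipped schedule, t = pi / 2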
         t = keras.ops.full((*keras.ops.shape(x)[:-1], 1), discretized_time[0], dtype=x.dtype)
         x = self.consistency_function(x, t, conditions=conditions)
@@ -207,9 +235,9 @@ def consistency_function(
         **kwargs : dict, optional, default: {}
             Additional keyword arguments passed to the inner network.
         """
-        xtc = concatenate_valid([x / self.sigma_data, self.time_emb(t), conditions], axis=-1)
+        xtc = concatenate_valid([x / self.sigma, self.time_emb(t), conditions], axis=-1)
         f = self.subnet_projector(self.subnet(xtc, training=training, **kwargs))
-        out = ops.cos(t) * x - ops.sin(t) * self.sigma_data * f
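+        # parameterization from [1]: f(x, t) = cos(t) * x - sin(t) * sigma * F(x / sigma, t),
+        # which satisfies the boundary condition f(x, 0) = x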
+        out = ops.cos(t) * x - ops.sin(t) * self.sigma * f
         return out
 
     def compute_metrics(
@@ -226,17 +254,14 @@ def compute_metrics(
         c = 0.1
 
         # generate noise vector
-        z = (
-            keras.random.normal(keras.ops.shape(x), dtype=keras.ops.dtype(x), seed=self.seed_generator)
-            * self.sigma_data
-        )
+        z = keras.random.normal(keras.ops.shape(x), dtype=keras.ops.dtype(x), seed=self.seed_generator) * self.sigma
 
         # sample time
         tau = (
             keras.random.normal(keras.ops.shape(x)[:1], dtype=keras.ops.dtype(x), seed=self.seed_generator) * p_std
             + p_mean
         )
-        t_ = ops.arctan(ops.exp(tau) / self.sigma_data)
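+        # map the log-normal scale exp(tau) to an angle t = arctan(exp(tau) / sigma) in (0, pi / 2)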
+        t_ = ops.arctan(ops.exp(tau) / self.sigma)
         t = expand_right_as(t_, x)
 
         # generate noisy sample
@@ -251,23 +276,23 @@ def f_teacher(x, t):
             o = self.subnet(concatenate_valid([x, self.time_emb(t), conditions], axis=-1), training=stage == "training")
             return self.subnet_projector(o)
 
-        primals = (xt / self.sigma_data, t)
+        primals = (xt / self.sigma, t)
         tangents = (
             ops.cos(t) * ops.sin(t) * dxtdt,
-            ops.cos(t) * ops.sin(t) * self.sigma_data,
+            ops.cos(t) * ops.sin(t) * self.sigma,
         )
 
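+        # a single forward-mode JVP evaluates the teacher output and its
+        # directional derivative along the tangents defined above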
         teacher_output, cos_sin_dFdt = jvp(f_teacher, primals, tangents, return_output=True)
         teacher_output = ops.stop_gradient(teacher_output)
         cos_sin_dFdt = ops.stop_gradient(cos_sin_dFdt)
 
         # calculate output of the network
-        xtc = concatenate_valid([xt / self.sigma_data, self.time_emb(t), conditions], axis=-1)
+        xtc = concatenate_valid([xt / self.sigma, self.time_emb(t), conditions], axis=-1)
         student_out = self.subnet_projector(self.subnet(xtc, training=stage == "training"))
 
         # calculate the tangent
-        g = -(ops.cos(t) ** 2) * (self.sigma_data * teacher_output - dxtdt) - r * ops.cos(t) * ops.sin(t) * (
-            xt + self.sigma_data * cos_sin_dFdt
+        g = -(ops.cos(t) ** 2) * (self.sigma * teacher_output - dxtdt) - r * ops.cos(t) * ops.sin(t) * (
+            xt + self.sigma * cos_sin_dFdt
         )
 
         # apply normalization to stabilize training
@@ -277,6 +302,7 @@ def f_teacher(x, t):
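+        # learned time-dependent weight w(t); exp(w) / D rescales the per-sample loss below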
         w = self.weight_fn_projector(self.weight_fn(expand_right_to(t_, 2)))
 
         D = ops.shape(x)[-1]
+
         loss = ops.mean(
             (ops.exp(w) / D)
             * ops.mean(