Add initial value to Gaussian state space distribution (fixes #2098). (#2104)

tillahoffmann · web-flow · commit 931d67d0dc24 · 2026-01-20T11:55:41.000-05:00
diff --git a/numpyro/distributions/constraints.py b/numpyro/distributions/constraints.py
@@ -45,6 +45,7 @@
     "multinomial",
     "nonnegative",
     "nonnegative_integer",
+    "ordered_vector",
     "positive",
     "positive_definite",
     "positive_definite_circulant_vector",
diff --git a/numpyro/distributions/continuous.py b/numpyro/distributions/continuous.py
@@ -709,10 +709,11 @@ class GaussianStateSpace(TransformedDistribution):
 
     .. math::
         \mathbf{z}_{t} &= \mathbf{A} \mathbf{z}_{t - 1} + \boldsymbol{\epsilon}_t\\
-        &=\sum_{k=1} \mathbf{A}^{t-k} \boldsymbol{\epsilon}_t,
+        &= \mathbf{A}^t \mathbf{z}_0 + \sum_{k=1}^{t} \mathbf{A}^{t-k} \boldsymbol{\epsilon}_k,
 
     where :math:`\mathbf{z}_t` is the state vector at step :math:`t`, :math:`\mathbf{A}`
-    is the transition matrix, and :math:`\boldsymbol\epsilon` is the innovation noise.
+    is the transition matrix, :math:`\mathbf{z}_0` is the initial value, and
+    :math:`\boldsymbol\epsilon` is the innovation noise.
 
 
     :param num_steps: Number of steps.
@@ -723,15 +724,19 @@ class GaussianStateSpace(TransformedDistribution):
         :math:`\boldsymbol\epsilon`.
     :param scale_tril: Scale matrix of the innovation noise
         :math:`\boldsymbol\epsilon`.
+    :param initial_value: Initial state vector :math:`\mathbf{z}_0`. If ``None``,
+        defaults to zero.
     """
 
     arg_constraints = {
         "covariance_matrix": constraints.positive_definite,
         "precision_matrix": constraints.positive_definite,
         "scale_tril": constraints.lower_cholesky,
         "transition_matrix": constraints.real_matrix,
+        "initial_value": constraints.real_vector,
     }
     support = constraints.real_matrix
+    pytree_data_fields = ("transition_matrix", "_initial_value", "scale_tril")
     pytree_aux_fields = ("num_steps",)
 
     def __init__(
@@ -741,6 +746,7 @@ def __init__(
         covariance_matrix: Optional[Array] = None,
         precision_matrix: Optional[Array] = None,
         scale_tril: Optional[Array] = None,
+        initial_value: Optional[Array] = None,
         *,
         validate_args: Optional[bool] = None,
     ) -> None:
@@ -752,6 +758,7 @@ def __init__(
             "`transition_matrix` argument should be a square matrix"
         )
         self.transition_matrix = transition_matrix
+        self._initial_value = initial_value
         # Expand the covariance/precision/scale matrices to the right number of steps.
         args = {
             "covariance_matrix": covariance_matrix,
@@ -766,23 +773,51 @@ def __init__(
         base_distribution = MultivariateNormal(**args)
         self.scale_tril = base_distribution.scale_tril[..., 0, :, :]
         base_distribution = base_distribution.to_event(1)
-        transform = RecursiveLinearTransform(transition_matrix)
+
+        # The base distribution must have at least the same batch shape as the initial
+        # value.
+        if initial_value is not None:
+            batch_shape = initial_value.shape[:-1]
+            base_distribution = base_distribution.expand(batch_shape)
+
+        transform = RecursiveLinearTransform(
+            transition_matrix, initial_value=initial_value
+        )
         super().__init__(base_distribution, transform, validate_args=validate_args)
 
+    @property
+    def initial_value(self) -> Array:
+        if self._initial_value is None:
+            return jnp.zeros(self.transition_matrix.shape[-1:])
+        return self._initial_value
+
     @property
     def mean(self) -> ArrayLike:
-        # The mean of the base distribution is zero and it has the right shape.
-        return self.base_dist.mean
+        # If there's no initial value, the mean is zero (base distribution mean).
+        if self._initial_value is None:
+            return self.base_dist.mean
+
+        # Otherwise, compute A^t @ z_0 for each time step t.
+        # z_t = A @ z_{t-1} for the deterministic part with z_0 = initial_value
+        def propagate(z, _):
+            z_next = jnp.einsum("...ij,...j->...i", self.transition_matrix, z)
+            return z_next, z_next
+
+        _, means = scan(propagate, self.initial_value, jnp.arange(self.num_steps))
+        # means has shape (num_steps, ..., state_dim)
+        # We need to move num_steps to axis -2 to match base_dist.mean shape
+        return jnp.moveaxis(means, 0, -2)
 
     @property
     def variance(self) -> ArrayLike:
-        # Given z_t = \sum_{k=1}^t A^{t-k} \epsilon_t, the covariance of the state
+        # Given z_t = z_0 + \sum_{k=1}^t A^{t-k} \epsilon_t, the covariance of the state
         # vector at step t is E[z_t transpose(z_t)] = \sum_{k,k'}^t A^{t-k}
         # E[\epsilon_k transpose(\epsilon_{k'})] transpose(A^{t-k'}). We only have
         # contributions for k = k' because innovations at different steps are
         # independent such that E[z_t transpose(z_t)] = \sum_k^t A^{t-k} @
-        # @ covariance_matrix @ transpose(A^{t-k}). Using `scan` is an easy way to
-        # evaluate this expression.
+        # @ covariance_matrix @ transpose(A^{t-k}). The initial value is deterministic,
+        # and we don't need to consider it here. Using `scan` is an easy way to evaluate
+        # this expression.
         _, scale_tril = scan(
             lambda carry, _: (self.transition_matrix @ carry, carry),
             self.scale_tril,
diff --git a/test/test_distributions.py b/test/test_distributions.py
@@ -631,6 +631,33 @@ def get_sp_dist(jax_dist):
         np.array([[0.8, 0.2], [-0.1, 1.1]]),
         np.array([0.1, 0.3, 0.25])[:, None, None] * np.array([[0.8, 0.2], [0.2, 0.7]]),
     ),
+    T(
+        dist.GaussianStateSpace,
+        5,
+        np.array([[0.8, 0.1], [-0.1, 0.9]]),
+        None,
+        None,
+        np.array([[0.5, 0.0], [0.0, 0.5]]),
+        np.array([1.0, 2.0]),
+    ),
+    T(
+        dist.GaussianStateSpace,
+        5,
+        np.array([[0.8, 0.1], [-0.1, 0.9]]),
+        None,
+        None,
+        np.array([[0.5, 0.0], [0.0, 0.5]]),
+        np.array([[1.0, 2.0], [0.5, 1.5], [-1.0, 0.0]]),
+    ),
+    T(
+        dist.GaussianStateSpace,
+        4,
+        np.array([[0.9, 0.0], [0.0, 0.9]]),
+        None,
+        None,
+        np.array([[0.3, 0.0], [0.0, 0.3]]),
+        np.array([[[1.0, 0.0]], [[0.0, 1.0]]]),
+    ),
     pytest.param(
         *T(
             dist.GaussianCopulaBeta,