Commit c3bc365 (parent: aedd3a7)

Use square root filter equations in SquareRootFilter

3 files changed: 147 additions, 59 deletions

pymc_experimental/statespace/filters/kalman_filter.py (123 additions, 34 deletions)
@@ -4,6 +4,7 @@
 import pytensor
 import pytensor.tensor as pt
 
+from pymc.pytensorf import constant_fold
 from pytensor.compile.mode import get_mode
 from pytensor.graph.basic import Variable
 from pytensor.raise_op import Assert
@@ -203,8 +204,11 @@ def build_graph(
         self.missing_fill_value = missing_fill_value
         self.cov_jitter = cov_jitter
 
-        self.n_states, self.n_shocks = R.shape[-2:]
-        self.n_endog = Z.shape[-2]
+        [R_shape] = constant_fold([R.shape], raise_not_constant=False)
+        [Z_shape] = constant_fold([Z.shape], raise_not_constant=False)
+
+        self.n_states, self.n_shocks = R_shape[-2:]
+        self.n_endog = Z_shape[-2]
 
         data, a0, P0, *params = self.check_params(data, a0, P0, c, d, T, Z, R, H, Q)
 
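Why fold here: when a model is built with statically shaped design matrices, `R.shape` and `Z.shape` can be evaluated at graph-construction time, so `n_states`, `n_shocks`, and `n_endog` become concrete integers, which the SquareRootFilter further down needs for slicing QR results. A minimal sketch of the behaviour, assuming a recent pymc (the `R` here is a stand-in, not the statespace graph):

```python
import pytensor.tensor as pt
from pymc.pytensorf import constant_fold

# Static shape: the symbolic .shape folds to constants at graph-build time
R = pt.tensor(name="R", dtype="float64", shape=(5, 2))
[R_shape] = constant_fold([R.shape], raise_not_constant=False)
n_states, n_shocks = R_shape[-2:]  # concrete 5 and 2, not graph nodes

# raise_not_constant=False degrades gracefully for dynamic inputs:
# the fold returns the still-symbolic shape instead of raising
R_dyn = pt.matrix("R_dyn")
[R_dyn_shape] = constant_fold([R_dyn.shape], raise_not_constant=False)
```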
@@ -408,7 +412,7 @@ def predict(a, P, c, T, R, Q) -> tuple[TensorVariable, TensorVariable]:
 
     @staticmethod
     def update(
-        a, P, y, c, d, Z, H, all_nan_flag
+        a, P, y, d, Z, H, all_nan_flag
     ) -> tuple[TensorVariable, TensorVariable, TensorVariable, TensorVariable, TensorVariable]:
         """
         Perform the update step of the Kalman filter.
@@ -419,7 +423,7 @@ def update(
 
         .. math::
 
            \begin{align}
-                \\hat{y}_t &= Z_t a_{t | t-1} \\
+                \\hat{y}_t &= Z_t a_{t | t-1} + d_t \\
                 v_t &= y_t - \\hat{y}_t \\
                 F_t &= Z_t P_{t | t-1} Z_t^T + H_t \\
                 a_{t|t} &= a_{t | t-1} + P_{t | t-1} Z_t^T F_t^{-1} v_t \\
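For reference, the docstring's align block is describing the standard Kalman update; with the gain written out explicitly (and the `d_t` bias this commit adds), the usual form of the full recursion is:

```latex
\begin{align}
\hat{y}_t &= Z_t a_{t \mid t-1} + d_t \\
v_t &= y_t - \hat{y}_t \\
F_t &= Z_t P_{t \mid t-1} Z_t^T + H_t \\
K_t &= P_{t \mid t-1} Z_t^T F_t^{-1} \\
a_{t \mid t} &= a_{t \mid t-1} + K_t v_t \\
P_{t \mid t} &= P_{t \mid t-1} - K_t F_t K_t^T
\end{align}
```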
@@ -435,8 +439,6 @@ def update(
             The current covariance matrix estimate, conditioned on information up to time t-1.
         y : TensorVariable
             The observation data at time t.
-        c : TensorVariable
-            The matrix c.
         d : TensorVariable
             The matrix d.
         Z : TensorVariable
@@ -529,7 +531,7 @@ def kalman_step(self, *args) -> tuple:
         y_masked, Z_masked, H_masked, all_nan_flag = self.handle_missing_values(y, Z, H)
 
         a_filtered, P_filtered, obs_mu, obs_cov, ll = self.update(
-            y=y_masked, a=a, c=c, d=d, P=P, Z=Z_masked, H=H_masked, all_nan_flag=all_nan_flag
+            y=y_masked, a=a, d=d, P=P, Z=Z_masked, H=H_masked, all_nan_flag=all_nan_flag
         )
 
         P_filtered = stabilize(P_filtered, self.cov_jitter)
@@ -545,7 +547,7 @@ class StandardFilter(BaseFilter):
     Basic Kalman Filter
     """
 
-    def update(self, a, P, y, c, d, Z, H, all_nan_flag):
+    def update(self, a, P, y, d, Z, H, all_nan_flag):
         """
         Compute one-step forecasts for observed states conditioned on information up to, but not including, the current
         timestep, `y_hat`, along with the forecast covariance matrix, `F`. Marginalize over observed states to obtain
@@ -566,9 +568,6 @@ def update(self, a, P, y, c, d, Z, H, all_nan_flag):
         y : TensorVariable
             Observations at time t.
 
-        c : TensorVariable
-            Latent state bias term.
-
         d : TensorVariable
             Observed state bias term.
 
@@ -628,38 +627,128 @@ class SquareRootFilter(BaseFilter):
 
     """
 
-    # TODO: Can the entire Kalman filter process be re-written, starting from P0_chol, so it's not necessary to compute
-    # cholesky(F) at every iteration?
+    def predict(self, a, P, c, T, R, Q):
+        """
+        Compute one-step forecasts for the hidden states conditioned on information up to, but not including,
+        the current timestep, `a_hat`, along with the forecast covariance matrix, `P_hat`.
+
+        .. warning::
+            Very important -- In this function, $P$ is the **cholesky factor** of the covariance matrix, not the
+            covariance matrix itself. The name `P` is kept for consistency with the superclass.
+        """
+        # Rename P to P_chol for clarity
+        P_chol = P
+
+        a_hat = T.dot(a) + c
+        Q_chol = pt.linalg.cholesky(Q, lower=True)
+
+        M = pt.horizontal_stack(T @ P_chol, R @ Q_chol).T
+        R_decomp = pt.linalg.qr(M, mode="r")
+        P_chol_hat = R_decomp[: self.n_states, : self.n_states].T
+
+        return a_hat, P_chol_hat
+
+    def update(self, a, P, y, d, Z, H, all_nan_flag):
+        """
+        Compute posterior estimates of the hidden state distributions conditioned on the observed data, up to and
+        including the present timestep. Also compute the log-likelihood of the data given the one-step forecasts.
+
+        .. warning::
+            Very important -- In this function, $P$ is the **cholesky factor** of the covariance matrix, not the
+            covariance matrix itself. The name `P` is kept for consistency with the superclass.
+        """
+
+        # Rename P to P_chol for clarity
+        P_chol = P
 
-    def update(self, a, P, y, c, d, Z, H, all_nan_flag):
         y_hat = Z.dot(a) + d
         v = y - y_hat
 
-        PZT = P.dot(Z.T)
+        H_chol = pytensor.ifelse(pt.all(pt.eq(H, 0.0)), H, pt.linalg.cholesky(H, lower=True))
+
+        # The following notation comes from https://ipnpr.jpl.nasa.gov/progress_report/42-233/42-233A.pdf
+        # Construct the block matrix A^T = [[chol(H), Z @ L_pred],
+        #                                   [0,       L_pred    ]]
+        # The R factor of the QR decomposition of A is upper triangular; we are more
+        # interested in its transpose B:
+        # Structure of B = [[chol(F),     0               ],
+        #                   [K @ chol(F), chol(P_filtered)]]
+        zeros = pt.zeros((self.n_states, self.n_endog))
+        upper = pt.horizontal_stack(H_chol, Z @ P_chol)
+        lower = pt.horizontal_stack(zeros, P_chol)
+        A_T = pt.vertical_stack(upper, lower)
+        B = pt.linalg.qr(A_T.T, mode="r").T
+
+        F_chol = B[: self.n_endog, : self.n_endog]
+        K_F_chol = B[self.n_endog :, : self.n_endog]
+        P_chol_filtered = B[self.n_endog :, self.n_endog :]
+
+        def compute_non_degenerate(P_chol_filtered, F_chol, K_F_chol, v):
+            a_filtered = a + K_F_chol @ solve_triangular(F_chol, v, lower=True)
+
+            # F^{-1} v = L^{-T} (L^{-1} v), with L = chol(F)
+            inner_term = solve_triangular(
+                F_chol.T, solve_triangular(F_chol, v, lower=True), lower=False
+            )
+            loss = (v.T @ inner_term).ravel()
+
+            # abs is necessary because the QR decomposition does not guarantee a positive diagonal
+            logdet = 2 * pt.log(pt.abs(pt.diag(F_chol))).sum()
+
+            ll = -0.5 * (self.n_endog * MVN_CONST + logdet + loss)[0]
+
+            return [a_filtered, P_chol_filtered, ll]
+
+        def compute_degenerate(P_chol_filtered, F_chol, K_F_chol, v):
+            """
+            If F is zero (usually because there were no observations this period), then we want:
+            K = 0, a = a, P = P, ll = 0
+            """
+            return [a, P_chol, pt.zeros(())]
+
+        [a_filtered, P_chol_filtered, ll] = pytensor.ifelse(
+            pt.eq(all_nan_flag, 1.0),
+            compute_degenerate(P_chol_filtered, F_chol, K_F_chol, v),
+            compute_non_degenerate(P_chol_filtered, F_chol, K_F_chol, v),
+        )
 
-        # If everything is missing, F will be [[0]] and F_chol will raise an error, so add identity to avoid the error
-        F = Z.dot(PZT) + stabilize(H, self.cov_jitter)
-        F_chol = pt.linalg.cholesky(F)
+        a_filtered = pt.specify_shape(a_filtered, (self.n_states,))
+        P_chol_filtered = pt.specify_shape(P_chol_filtered, (self.n_states, self.n_states))
 
-        # If everything is missing, K = 0, IKZ = I
-        K = solve_triangular(F_chol.T, solve_triangular(F_chol, PZT.T)).T
-        I_KZ = pt.eye(self.n_states) - K.dot(Z)
+        return a_filtered, P_chol_filtered, y_hat, F_chol, ll
 
-        a_filtered = a + K.dot(v)
-        P_filtered = quad_form_sym(I_KZ, P) + quad_form_sym(K, H)
+    def _postprocess_scan_results(self, results, a0, P0, n) -> list[TensorVariable]:
+        """
+        Convert the Cholesky factors of the covariance matrices back to the covariance matrices themselves.
+        """
+        results = super()._postprocess_scan_results(results, a0, P0, n)
+        (
+            filtered_states,
+            predicted_states,
+            observed_states,
+            filtered_covariances_cholesky,
+            predicted_covariances_cholesky,
+            observed_covariances_cholesky,
+            loglike_obs,
+        ) = results
 
-        inner_term = solve_triangular(F_chol.T, solve_triangular(F_chol, v))
-        n = y.shape[0]
+        def square_sequence(L):
+            X = pt.einsum("...ij,...kj->...ik", L, L.copy())
+            X = pt.specify_shape(X, (n, self.n_states, self.n_states))
+            return X
 
-        ll = pt.switch(
-            all_nan_flag,
-            0.0,
-            (
-                -0.5 * (n * MVN_CONST + (v.T @ inner_term).ravel()) - pt.log(pt.diag(F_chol)).sum()
-            ).ravel()[0],
-        )
+        filtered_covariances = square_sequence(filtered_covariances_cholesky)
+        predicted_covariances = square_sequence(predicted_covariances_cholesky)
+        observed_covariances = square_sequence(observed_covariances_cholesky)
 
-        return a_filtered, P_filtered, y_hat, F, ll
+        return [
+            filtered_states,
+            predicted_states,
+            observed_states,
+            filtered_covariances,
+            predicted_covariances,
+            observed_covariances,
+            loglike_obs,
+        ]
 
 
 class SingleTimeseriesFilter(BaseFilter):
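The `predict` method above leans on a standard square-root identity: if `M` stacks the transposed, propagated factors `T @ L_P` and `R @ L_Q`, then `M.T @ M = T P T^T + R Q R^T`, so the transposed R factor of a QR decomposition of `M` is a valid Cholesky factor of the predicted covariance, and `P` itself is never formed. A NumPy check of that identity (names are illustrative, not from the codebase):

```python
import numpy as np

rng = np.random.default_rng(0)
n_states, n_shocks = 4, 2

def random_spd(k):
    # A well-conditioned symmetric positive-definite matrix
    A = rng.normal(size=(k, k))
    return A @ A.T + k * np.eye(k)

T = rng.normal(size=(n_states, n_states))
R = rng.normal(size=(n_states, n_shocks))
P, Q = random_spd(n_states), random_spd(n_shocks)
L_P, L_Q = np.linalg.cholesky(P), np.linalg.cholesky(Q)

# Stack the propagated factors and triangularize with one QR call
M = np.hstack([T @ L_P, R @ L_Q]).T        # (n_states + n_shocks, n_states)
R_decomp = np.linalg.qr(M, mode="r")       # upper triangular
L_pred = R_decomp[:n_states, :n_states].T  # candidate chol(P_pred)

# L_pred squares back to the textbook predicted covariance
assert np.allclose(L_pred @ L_pred.T, T @ P @ T.T + R @ Q @ R.T)
```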
@@ -679,7 +768,7 @@ def check_params(self, data, a0, P0, c, d, T, Z, R, H, Q):
 
         return data, a0, P0, c, d, T, Z, R, H, Q
 
-    def update(self, a, P, y, c, d, Z, H, all_nan_flag):
+    def update(self, a, P, y, d, Z, H, all_nan_flag):
         y_hat = d + Z.dot(a)
         v = y - y_hat.ravel()
 
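The same trick drives `SquareRootFilter.update` above: a single QR call on the block pre-array produces `chol(F)`, the Kalman gain premultiplied by `chol(F)`, and `chol(P_filtered)` all at once. A NumPy sketch verifying the block structure the code comments describe (again with illustrative names):

```python
import numpy as np

rng = np.random.default_rng(1)
n_states, n_endog = 4, 2

def random_spd(k):
    A = rng.normal(size=(k, k))
    return A @ A.T + k * np.eye(k)

Z = rng.normal(size=(n_endog, n_states))
P, H = random_spd(n_states), random_spd(n_endog)
L_pred, H_chol = np.linalg.cholesky(P), np.linalg.cholesky(H)

# Pre-array: one QR factorization triangularizes the whole measurement update
A_T = np.block([
    [H_chol, Z @ L_pred],
    [np.zeros((n_states, n_endog)), L_pred],
])
B = np.linalg.qr(A_T.T, mode="r").T  # lower-triangular post-array

F_chol = B[:n_endog, :n_endog]
K_F_chol = B[n_endog:, :n_endog]
L_filtered = B[n_endog:, n_endog:]

# Compare against the covariance-form quantities
F = Z @ P @ Z.T + H
K = P @ Z.T @ np.linalg.inv(F)
assert np.allclose(F_chol @ F_chol.T, F)
assert np.allclose(K_F_chol, K @ F_chol)
assert np.allclose(L_filtered @ L_filtered.T, P - K @ F @ K.T)
```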
tests/statespace/test_kalman_filter.py (13 additions, 14 deletions)
@@ -64,7 +64,7 @@
 
 def test_base_class_update_raises():
     filter = BaseFilter()
-    inputs = [None] * 8
+    inputs = [None] * 7
     with pytest.raises(NotImplementedError):
         filter.update(*inputs)
 
@@ -214,6 +214,7 @@ def test_output_with_multiple_observed(filter_func, filter_name, rng):
 def test_missing_data(filter_func, filter_name, p, rng):
     m, r, n = 5, 1, 10
     inputs = make_test_inputs(p, m, r, n, rng, missing_data=1)
+
     if p > 1 and filter_name == "SingleTimeSeriesFilter":
         with pytest.raises(
             AssertionError,
@@ -243,11 +244,16 @@ def test_last_smoother_is_last_filtered(filter_func, output_idx, rng):
     assert_allclose(filtered[-1], smoothed[-1])
 
 
-@pytest.mark.parametrize("filter_func", filter_funcs, ids=filter_names)
+@pytest.mark.parametrize(
+    "filter_func, filter_name", zip(filter_funcs, filter_names), ids=filter_names
+)
 @pytest.mark.parametrize("n_missing", [0, 5], ids=["n_missing=0", "n_missing=5"])
 @pytest.mark.skipif(floatX == "float32", reason="Tests are too sensitive for float32")
-def test_filters_match_statsmodel_output(filter_func, n_missing, rng):
-    fit_sm_mod, inputs = nile_test_test_helper(rng, n_missing)
+def test_filters_match_statsmodel_output(filter_func, filter_name, n_missing, rng):
+    fit_sm_mod, [data, a0, P0, c, d, T, Z, R, H, Q] = nile_test_test_helper(rng, n_missing)
+    if filter_name == "CholeskyFilter":
+        P0 = np.linalg.cholesky(P0)
+    inputs = [data, a0, P0, c, d, T, Z, R, H, Q]
     outputs = filter_func(*inputs)
 
     for output_idx, name in enumerate(output_names):
@@ -294,6 +300,8 @@ def test_all_covariance_matrices_are_PSD(filter_func, filter_name, n_missing, ob
         pytest.skip("Univariate filter not stable at half precision without measurement error")
 
     fit_sm_mod, [data, a0, P0, c, d, T, Z, R, H, Q] = nile_test_test_helper(rng, n_missing)
+    if filter_name == "CholeskyFilter":
+        P0 = np.linalg.cholesky(P0)
 
     H *= int(obs_noise)
     inputs = [data, a0, P0, c, d, T, Z, R, H, Q]
@@ -325,16 +333,7 @@ def test_kalman_filter_jax(filter):
     # TODO: Add UnivariateFilter to test; need to figure out the broadcasting issue when 2nd data dim is defined
 
     p, m, r, n = 1, 5, 1, 10
-    inputs, outputs = initialize_filter(filter(), mode="JAX")
-
-    # Shape of the data must be static for jax to know how long the scan is
-    data = inputs.pop(0)
-    data_specified = pt.specify_shape(data, (n, None))
-    data_specified.name = "data"
-    inputs = [data, *inputs]
-
-    outputs = pytensor.graph.clone_replace(outputs, {data: data_specified})
-
+    inputs, outputs = initialize_filter(filter(), mode="JAX", p=p, m=m, r=r, n=n)
     inputs_np = make_test_inputs(p, m, r, n, rng)
 
     f_jax = get_jaxified_graph(inputs, outputs)
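The deleted lines remain a useful pattern whenever a helper cannot take static shapes: JAX needs the scan length to be known at trace time, so the data variable's shape is pinned with `specify_shape` and the graph is rebuilt against the pinned variable with `clone_replace`. A rough standalone sketch, assuming JAX is installed (the toy scan here stands in for the filter graph):

```python
import pytensor
import pytensor.tensor as pt

# A scan whose length is the leading dimension of `data`
data = pt.matrix("data")
rows_summed, _ = pytensor.scan(
    lambda row, acc: acc + row.sum(),
    sequences=[data],
    outputs_info=[pt.zeros(())],
)
total = rows_summed[-1]

# Pin the number of rows so the JAX backend knows how long the scan is
data_static = pt.specify_shape(data, (10, None))
[total_static] = pytensor.graph.clone_replace([total], {data: data_static})

fn = pytensor.function([data], total_static, mode="JAX")
```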

tests/statespace/utilities/test_helpers.py (11 additions, 11 deletions)
@@ -34,18 +34,18 @@ def load_nile_test_data():
     return nile
 
 
-def initialize_filter(kfilter, mode=None):
+def initialize_filter(kfilter, mode=None, p=None, m=None, r=None, n=None):
     ksmoother = KalmanSmoother()
-    data = pt.matrix(name="data", dtype=floatX)
-    a0 = pt.vector(name="a0", dtype=floatX)
-    P0 = pt.matrix(name="P0", dtype=floatX)
-    c = pt.vector(name="c", dtype=floatX)
-    d = pt.vector(name="d", dtype=floatX)
-    Q = pt.matrix(name="Q", dtype=floatX)
-    H = pt.matrix(name="H", dtype=floatX)
-    T = pt.matrix(name="T", dtype=floatX)
-    R = pt.matrix(name="R", dtype=floatX)
-    Z = pt.matrix(name="Z", dtype=floatX)
+    data = pt.tensor(name="data", dtype=floatX, shape=(n, p))
+    a0 = pt.tensor(name="x0", dtype=floatX, shape=(m,))
+    P0 = pt.tensor(name="P0", dtype=floatX, shape=(m, m))
+    c = pt.tensor(name="c", dtype=floatX, shape=(m,))
+    d = pt.tensor(name="d", dtype=floatX, shape=(p,))
+    Q = pt.tensor(name="Q", dtype=floatX, shape=(r, r))
+    H = pt.tensor(name="H", dtype=floatX, shape=(p, p))
+    T = pt.tensor(name="T", dtype=floatX, shape=(m, m))
+    R = pt.tensor(name="R", dtype=floatX, shape=(m, r))
+    Z = pt.tensor(name="Z", dtype=floatX, shape=(p, m))
 
     inputs = [data, a0, P0, c, d, T, Z, R, H, Q]
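What the switch from `pt.matrix`/`pt.vector` to `pt.tensor(..., shape=...)` buys: the static shape lives on the variable's type, where `constant_fold` and the JAX backend can see it. Entries of `None` (the default for the new keyword arguments) keep a dimension dynamic, so callers of `initialize_filter` that pass nothing get the old fully dynamic behaviour. A small sketch:

```python
import pytensor.tensor as pt

Z_static = pt.tensor(name="Z", dtype="float64", shape=(1, 5))
print(Z_static.type.shape)   # (1, 5) -- known at graph-construction time

Z_dynamic = pt.matrix("Z", dtype="float64")
print(Z_dynamic.type.shape)  # (None, None) -- nothing known statically

# None entries stay dynamic and mix freely with static ones
data = pt.tensor(name="data", dtype="float64", shape=(None, 1))
print(data.type.shape)       # (None, 1)
```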