import numpy as np
import pandas as pd
import pymc as pm
import pytensor
import pytensor.tensor as pt
import pytest

from numpy.testing import assert_allclose
from statsmodels.tsa.statespace.dynamic_factor import DynamicFactor

from pymc_extras.statespace.models.DFM import BayesianDynamicFactor

floatX = pytensor.config.floatX


@pytest.fixture(scope="session")
def data():
    df = pd.read_csv(
        "tests/statespace/_data/statsmodels_macrodata_processed.csv",
        index_col=0,
        parse_dates=True,
    ).astype(floatX)
    # Give the index an explicit frequency so statsmodels does not warn
    df.index.freq = df.index.inferred_freq
    return df


@pytest.mark.parametrize("k_factors", [1, 2])
@pytest.mark.parametrize("factor_order", [1, 2, 3, 4])
@pytest.mark.parametrize("error_order", [1, 2, 3, 4])
def test_dfm_matrices_match_statsmodels(data, k_factors, factor_order, error_order):
    y = data
    n_obs, k_endog = y.shape

    with pm.Model():
        dfm = BayesianDynamicFactor(
            k_factors=k_factors,
            factor_order=factor_order,
            k_endog=k_endog,
            error_order=error_order,
            error_var=False,
            verbose=True,
        )
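        # The state vector stacks max(1, factor_order) lags of each latent
        # factor, followed by error_order lags of each series' AR error term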
        factor_part = max(1, factor_order) * k_factors
        error_part = error_order * k_endog if error_order > 0 else 0
        k_states = factor_part + error_part

        pm.Normal("x0", mu=0.0, sigma=1.0, shape=(k_states,))
        pm.HalfNormal("P0", sigma=1.0, shape=(k_states, k_states))

        pm.Normal("factor_loadings", mu=0.0, sigma=1.0, shape=(k_endog, k_factors))
        pm.Normal("factor_ar", mu=0.0, sigma=1.0, shape=(k_factors, factor_order))
        pm.HalfNormal("factor_sigma", sigma=1.0, shape=(k_factors,))

        pm.Normal("error_ar", mu=0.0, sigma=1.0, shape=(k_endog, error_order))
        pm.HalfNormal("error_sigma", sigma=1.0, shape=(k_endog,))
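        # build_statespace_graph collects these parameters from the model by
        # name, so the RV names and shapes above must match what
        # BayesianDynamicFactor expects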

        dfm.build_statespace_graph(data=data)
        ssm_pymc = dfm.unpack_statespace()

    # Build the statsmodels DynamicFactor model for comparison
    sm_dfm = DynamicFactor(
        endog=y,
        k_factors=k_factors,
        factor_order=factor_order,
        error_order=error_order,
    )

    # Fix the initial state and covariance so both implementations start from
    # identical initial moments (statsmodels otherwise defaults to the
    # stationary initialization)
    x0 = np.zeros(k_states)
    P0 = np.eye(k_states)
    sm_dfm.initialize_known(initial_state=x0, initial_state_cov=P0)

    # Compile the PyMC statespace tensors into a function of the graph inputs
    pymc_inputs = list(pytensor.graph.basic.explicit_graph_inputs(ssm_pymc))
    f_ssm = pytensor.function(pymc_inputs, ssm_pymc)

    # Dummy inputs: the known x0 and an identity P0 to match the statsmodels
    # initialization, ones for factor_sigma (statsmodels fixes the factor
    # innovation variance to 1 for identification), and 0.5 everywhere else
    test_vals = {}
    for v in pymc_inputs:
        shape = v.type.shape
        if len(shape) == 0:
            test_vals[v.name] = 0.5
        elif v.name == "x0":
            test_vals[v.name] = x0.astype(floatX)
        elif v.name == "P0" or "cov" in v.name:
            test_vals[v.name] = np.eye(shape[0], shape[1], dtype=floatX)
        elif v.name == "factor_sigma":
            test_vals[v.name] = np.ones(shape, dtype=floatX)
        else:
            test_vals[v.name] = np.full(shape, 0.5, dtype=floatX)

    pymc_matrices = f_ssm(**test_vals)
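    # Put statsmodels at the parameter values implied by the dummy inputs
    # before reading off its matrices. This assumes the PyMC parameterization
    # maps one-to-one onto statsmodels': loadings and AR coefficients of 0.5,
    # error variances of 0.5 ** 2 (error_sigma is a standard deviation), and
    # the factor innovation variance fixed to 1.
    sm_params = np.array(
        [0.25 if name.startswith("sigma2") else 0.5 for name in sm_dfm.param_names]
    )
    sm_dfm.update(sm_params, transformed=True)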

    # Both models order their matrices the same way; unpack_statespace is
    # expected to return them in the order listed here
    matrix_names = [
        "initial_state",
        "initial_state_cov",
        "state_intercept",
        "obs_intercept",
        "transition",
        "design",
        "selection",
        "obs_cov",
        "state_cov",
    ]
    # x0 and P0 come from the known initialization set above; the remaining
    # matrices are stored by statsmodels with a trailing time dimension, which
    # is dropped here since the model is time-invariant
    sm_matrices = [x0, P0] + [sm_dfm.ssm[name][..., 0] for name in matrix_names[2:]]

    for pm_mat, sm_mat, name in zip(pymc_matrices, sm_matrices, matrix_names):
        assert_allclose(pm_mat, sm_mat, atol=1e-5, err_msg=f"Mismatch in matrix {name}")


@pytest.mark.parametrize("k_factors", [1])
@pytest.mark.parametrize("factor_order", [2])
@pytest.mark.parametrize("error_order", [1])
def test_loglike_matches_statsmodels(data, k_factors, factor_order, error_order):
    """
    Test that the log-likelihood computed by the PyMC statespace model matches
    the statsmodels log-likelihood when both are given the same parameters.
    """
    k_endog = data.shape[1]
    endog_names = data.columns.tolist()  # ['realgdp', 'realcons', 'realinv']

    sm_dfm = DynamicFactor(
        endog=data,
        k_factors=k_factors,
        factor_order=factor_order,
        error_order=error_order,
    )
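    # The PyMC model below fixes x0 = 0 and P0 = I, so give statsmodels the
    # same known initialization; by default it would use the stationary
    # initialization and the two log-likelihoods would not be comparable
    k_states = max(1, factor_order) * k_factors + error_order * k_endog
    sm_dfm.initialize_known(
        initial_state=np.zeros(k_states), initial_state_cov=np.eye(k_states)
    )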

    test_params = {
        # One loading per observed series per factor
        "loading.f1.realgdp": 0.9,
        "loading.f1.realcons": -0.8,
        "loading.f1.realinv": 0.7,
        # Factor AR coefficients
        "L1.f1.f1": 0.5,
        "L2.f1.f1": 0.2,
        # Error AR coefficient names use parentheses
        "L1.e(realgdp).e(realgdp)": 0.4,
        "L1.e(realcons).e(realcons)": 0.2,
        "L1.e(realinv).e(realinv)": -0.1,
        # Error variances
        "sigma2.realgdp": 0.5,
        "sigma2.realcons": 0.4,
        "sigma2.realinv": 0.3,
        # NOTE: 'sigma2.f1' is not included; statsmodels fixes the factor
        # innovation variance to 1 for identification.
    }

    assert set(test_params.keys()) == set(sm_dfm.param_names)
    # Assemble the vector in statsmodels' own parameter ordering
    params_vector = np.array([test_params[name] for name in sm_dfm.param_names])

    # Calculate the log-likelihood in statsmodels
    sm_loglike = sm_dfm.loglike(params_vector)
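    # sm_dfm.loglike runs the Kalman filter at exactly these parameter values
    # and returns the joint log density of the observed sample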

    # Build the PyMC model and compile its logp function
    with pm.Model() as model:
        factor_part = max(1, factor_order) * k_factors
        error_part = error_order * k_endog if error_order > 0 else 0
        k_states = factor_part + error_part

        pm.Normal("x0", mu=0.0, sigma=1.0, shape=(k_states,))
        pm.Deterministic("P0", pt.eye(k_states, k_states))

        pm.Normal("factor_loadings", mu=0.0, sigma=1.0, shape=(k_endog, k_factors))
        pm.Normal("factor_ar", mu=0.0, sigma=1.0, shape=(k_factors, factor_order))
        pm.Deterministic("factor_sigma", pt.ones(k_factors))

        pm.Normal("error_ar", mu=0.0, sigma=1.0, shape=(k_endog, error_order))
        pm.HalfNormal("error_sigma", sigma=1.0, shape=(k_endog,))

        dfm = BayesianDynamicFactor(
            k_factors=k_factors, factor_order=factor_order, k_endog=k_endog, error_order=error_order
        )

        dfm.build_statespace_graph(data=data)
        # Restrict logp to the observed statespace variable so prior terms and
        # transform Jacobians do not enter the comparison with statsmodels
        logp_fn = model.compile_logp(vars=model.observed_RVs, jacobian=False)

    # Evaluate the PyMC log-likelihood at the same parameter values
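    # compile_logp consumes the model's value variables, so transformed
    # parameters are keyed by their transformed names: error_sigma has a
    # HalfNormal prior, so its value variable is "error_sigma_log__"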
    pymc_param_values = {
        "x0": np.zeros(k_states),
        "factor_loadings": np.array(
            [
                test_params["loading.f1.realgdp"],
                test_params["loading.f1.realcons"],
                test_params["loading.f1.realinv"],
            ]
        ).reshape(k_endog, k_factors),
        "factor_ar": np.array([test_params["L1.f1.f1"], test_params["L2.f1.f1"]]).reshape(
            k_factors, factor_order
        ),
        "error_ar": np.array(
            [
                test_params["L1.e(realgdp).e(realgdp)"],
                test_params["L1.e(realcons).e(realcons)"],
                test_params["L1.e(realinv).e(realinv)"],
            ]
        ).reshape(k_endog, error_order),
        # statsmodels' sigma2 values are variances, while error_sigma is a
        # standard deviation on the log scale: log(sqrt(sigma2))
        "error_sigma_log__": np.log(
            np.sqrt(
                np.array(
                    [
                        test_params["sigma2.realgdp"],
                        test_params["sigma2.realcons"],
                        test_params["sigma2.realinv"],
                    ]
                )
            )
        ),
    }
    pymc_loglike = logp_fn(pymc_param_values)

    assert_allclose(
        pymc_loglike,
        sm_loglike,
        rtol=1e-5,
        err_msg="PyMC log-likelihood does not match statsmodels for manually specified parameters",
    )