Skip to content

Commit dcf285e

Browse files
committed
Update following suggestions by Jesse
1 parent 20793cc commit dcf285e

File tree

2 files changed

+101
-19
lines changed

2 files changed

+101
-19
lines changed

pymc_extras/statespace/models/DFM.py

Lines changed: 100 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
ALL_STATE_AUX_DIM,
1212
ALL_STATE_DIM,
1313
AR_PARAM_DIM,
14+
ERROR_AR_PARAM_DIM,
1415
FACTOR_DIM,
1516
OBS_STATE_AUX_DIM,
1617
OBS_STATE_DIM,
@@ -31,11 +32,13 @@ class BayesianDynamicFactor(PyMCStateSpace):
3132
factor_order : int
3233
Order of the VAR process for the latent factors.
3334
34-
k_endog : int
35-
Number of observed time series.
35+
k_endog : int, optional
36+
Number of observed time series. If not provided, the number of observed series will be inferred from `endog_names`.
37+
At least one of `k_endog` or `endog_names` must be provided.
3638
37-
endog_names : Sequence[str], optional
38-
Names of the observed time series. If not provided, default names will be generated as `endog_1`, `endog_2`, ..., `endog_k`.
39+
endog_names : list of str, optional
40+
Names of the observed time series. If not provided, default names will be generated as `endog_1`, `endog_2`, ..., `endog_k` based on `k_endog`.
41+
At least one of `k_endog` or `endog_names` must be provided.
3942
4043
exog : array_like, optional
4144
Array of exogenous regressors for the observation equation (nobs x k_exog).
@@ -60,14 +63,90 @@ class BayesianDynamicFactor(PyMCStateSpace):
6063
verbose: bool, default True
6164
If true, a message will be logged to the terminal explaining the variable names, dimensions, and supports.
6265
66+
6367
Notes
6468
-----
65-
This model implements a dynamic factor model in the spirit of
66-
statsmodels.tsa.statespace.dynamic_factor.DynamicFactor. The model assumes that
67-
the observed time series are driven by a set of latent factors that evolve
68-
according to a VAR process, possibly along with an autoregressive error term.
69-
69+
The Dynamic Factor Model (DFM) is a multivariate state-space model used to represent high-dimensional time series
70+
as driven by a smaller set of unobserved dynamic factors. Given a set of observed time series
71+
:math:`\{y_t\}_{t=0}^T`, with :math:`y_t = \begin{bmatrix} y_{1,t} & y_{2,t} & \cdots & y_{k_endog,t} \end{bmatrix}^T`,
72+
the DFM assumes that each series is a linear combination of a few latent factors and optional autoregressive errors.
73+
74+
Specifically, denoting the number of dynamic factors as :math:`k_factors`, the order of the latent factor
75+
process as :math:`p = \text{factor\_order}`, and the order of the observation error as
76+
:math:`q = \text{error\_order}`, the model is written as:
77+
78+
.. math::
79+
y_t & = \Lambda f_t + B x_t + u_t \\
80+
f_t & = A_1 f_{t-1} + \dots + A_p f_{t-p} + \eta_t \\
81+
u_t & = C_1 u_{t-1} + \dots + C_q u_{t-q} + \varepsilon_t
82+
83+
84+
Where:
85+
- :math:`f_t` is a vector of latent factors following a VAR(p) process:
86+
- :math:`\x_t` are optional exogenous vectors (Not implemented yet).
87+
- :math:`u_t` is a vector of observation errors, possibly VAR(q) if error_var = True otherwise treated as individual autoregressions.
88+
- :math:`\eta_t` and :math:`\varepsilon_t` are white noise error terms. In order to identify the factors, :math:`Var(\eta_t) = I`.
89+
Denote :math:`Var(\varepsilon_t) \equiv \Sigma`.
90+
91+
92+
Internally, this model is represented in state-space form by stacking all current and lagged latent factors and,
93+
if present, autoregressive observation errors into a single state vector. The full state vector has dimension
94+
:math:`k_factors \cdot factor_order + k_endog \cdot error_order`, where :math:`k_endog` is the number of observed time series.
95+
96+
The number of independent shocks in the system (i.e., the number of nonzero diagonal elements in the state noise
97+
covariance matrix) is equal to the number of latent factors plus the number of observed series if AR errors are present.
98+
99+
As in other high-dimensional models, identification can be an issue, especially when many observed series load on few
100+
factors. Careful prior specification is typically required for good estimation.
101+
102+
Currently, the implementation assumes same factor order for all the factors,
103+
does not yet support measurement error, exogenous variables and joint (VAR) error modeling.
104+
105+
Examples
106+
--------
107+
The following code snippet estimates a dynamic factor model with 1 latent factors,
108+
a AR(2) structure on the factor and a AR(1) structure on the errors:
109+
110+
.. code:: python
111+
112+
import pymc_extras.statespace as pmss
113+
import pymc as pm
114+
115+
# Create DFM Statespace Model
116+
dfm_mod = pmss.BayesianDynamicFactor(
117+
k_factors=1,
118+
factor_order=2,
119+
endog_names=data.columns,
120+
error_order=1,
121+
error_var=False,
122+
error_cov_type="diagonal",
123+
filter_type="standard",
124+
verbose=True
125+
)
70126
127+
# Unpack dims and coords
128+
x0_dims, P0_dims, factor_loadings_dims, factor_sigma_dims, factor_ar_dims, error_ar_dims, error_sigma_dims = dfm_mod.param_dims.values()
129+
coords = dfm_mod.coords
130+
131+
with pm.Model(coords=coords) as pymc_mod:
132+
# Initial state
133+
x0 = pm.Normal("x0", dims=x0_dims)
134+
P0 = pm.Normal("P0", dims=P0_dims)
135+
factor_loadings = pm.Normal("factor_loadings", sigma=1, dims=factor_loadings_dims)
136+
factor_ar = pm.Normal("factor_ar", sigma=1, dims=factor_ar_dims)
137+
factor_sigma = pm.Deterministic("factor_sigma", pt.constant([1.0], dtype=float))
138+
error_ar = pm.Normal("error_ar", sigma=1, dims=error_ar_dims)
139+
sigmas = pm.HalfNormal("error_sigma", dims=error_sigma_dims)
140+
# Build symbolic graph
141+
dfm_mod.build_statespace_graph(data=data, mode="JAX")
142+
143+
with pymc_mod:
144+
idata = pm.sample(
145+
draws=500,
146+
chains=2,
147+
nuts_sampler="nutpie",
148+
nuts_sampler_kwargs={"backend": "jax", "gradient_backend": "jax"},
149+
)
71150
72151
"""
73152

@@ -95,6 +174,8 @@ def __init__(
95174
raise NotImplementedError(
96175
"Joint error modeling (error_var=True) is not yet implemented."
97176
)
177+
if exog is not None:
178+
raise NotImplementedError("Exogenous variables (exog) are not yet implemented.")
98179

99180
self.endog_names = endog_names
100181
self.k_endog = k_endog
@@ -110,7 +191,7 @@ def __init__(
110191
# Determine the dimension for the latent factor states.
111192
# For static factors, one might use k_factors.
112193
# For dynamic factors with lags, the state might include current factors and past lags.
113-
# TODO: what if we want different factor orders for different factors?
194+
# TODO: what if we want different factor orders for different factors? (follow suggestions in GitHub)
114195
k_factor_states = k_factors * factor_order
115196

116197
# Determine the dimension for the error component.
@@ -152,7 +233,7 @@ def param_names(self):
152233
names.remove("factor_ar")
153234
if self.error_order == 0:
154235
names.remove("error_ar")
155-
if self.error_cov_type in ["unstructured"]:
236+
if self.error_cov_type == "unstructured":
156237
names.remove("error_sigma")
157238
names.append("error_cov")
158239

@@ -186,7 +267,7 @@ def param_info(self) -> dict[str, dict[str, Any]]:
186267
"constraints": None,
187268
},
188269
"error_sigma": {
189-
"shape": (self.k_endog,) if self.error_cov_type in ["diagonal"] else (),
270+
"shape": (self.k_endog,) if self.error_cov_type == "diagonal" else (),
190271
"constraints": "Positive",
191272
},
192273
"error_cov": {
@@ -207,17 +288,17 @@ def state_names(self) -> list[str]:
207288
then idiosyncratic error states (with lags).
208289
"""
209290
names = []
210-
291+
# TODO adjust notation by looking at the VARMAX implementation
211292
# Factor states
212293
for i in range(self.k_factors):
213294
for lag in range(self.factor_order):
214-
names.append(f"factor_{i+1}_lag{lag}")
295+
names.append(f"L{lag}.factor_{i+1}")
215296

216297
# Idiosyncratic error states
217298
if self.error_order > 0:
218299
for i in range(self.k_endog):
219300
for lag in range(self.error_order):
220-
names.append(f"error_{i+1}_lag{lag}")
301+
names.append(f"L{lag}.error_{i+1}")
221302

222303
return names
223304

@@ -231,7 +312,7 @@ def observed_states(self) -> list[str]:
231312
@property
232313
def coords(self) -> dict[str, Sequence]:
233314
coords = make_default_coords(self)
234-
315+
# Add factor dimensions
235316
coords[FACTOR_DIM] = [f"factor_{i+1}" for i in range(self.k_factors)]
236317

237318
# AR parameter dimensions - add if needed
@@ -240,7 +321,7 @@ def coords(self) -> dict[str, Sequence]:
240321

241322
# If error_order > 0
242323
if self.error_order > 0:
243-
coords["error_ar_param"] = list(range(1, self.error_order + 1))
324+
coords[ERROR_AR_PARAM_DIM] = list(range(1, self.error_order + 1))
244325

245326
return coords
246327

@@ -272,13 +353,13 @@ def param_dims(self):
272353
coord_map["factor_ar"] = (FACTOR_DIM, AR_PARAM_DIM)
273354

274355
if self.error_order > 0:
275-
coord_map["error_ar"] = (OBS_STATE_DIM, "error_ar_param")
356+
coord_map["error_ar"] = (OBS_STATE_DIM, ERROR_AR_PARAM_DIM)
276357

277358
if self.error_cov_type in ["scalar"]:
278359
coord_map["error_sigma"] = ()
279360
elif self.error_cov_type in ["diagonal"]:
280361
coord_map["error_sigma"] = (OBS_STATE_DIM,)
281-
elif self.error_cov_type in ["unstructured"]:
362+
if self.error_cov_type == "unstructured":
282363
coord_map["error_sigma"] = (OBS_STATE_DIM, OBS_STATE_AUX_DIM)
283364

284365
return coord_map

pymc_extras/statespace/utils/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
SEASONAL_MA_PARAM_DIM = "seasonal_ma_lag"
1414
ETS_SEASONAL_DIM = "seasonal_lag"
1515
FACTOR_DIM = "factor"
16+
ERROR_AR_PARAM_DIM = "error_ar_param"
1617

1718
NEVER_TIME_VARYING = ["initial_state", "initial_state_cov", "a0", "P0"]
1819
VECTOR_VALUED = ["initial_state", "state_intercept", "obs_intercept", "a0", "c", "d"]

0 commit comments

Comments
 (0)