Tidy up SyntheticControl: pass xarray DataArrays to models, set obs_ind coords in predict

drbenvincent · drbenvincent · commit 1f4d17e7d629 · 2025-04-21T17:37:24.000+01:00
diff --git a/causalpy/experiments/synthetic_control.py b/causalpy/experiments/synthetic_control.py
@@ -85,17 +85,15 @@ def __init__(
         self.input_validation(data, treatment_time)
         self.treatment_time = treatment_time
         self.control_units = control_units
+        self.labels = control_units
         self.treated_units = treated_units
         self.expt_type = "SyntheticControl"
         # split data in to pre and post intervention
         self.datapre = data[data.index < self.treatment_time]
         self.datapost = data[data.index >= self.treatment_time]
 
-        # split data into the 4 quadrants (pre/post, control/treated) and store as xarray dataarray
-        # self.datapre_control = self.datapre[self.control_units]
-        # self.datapre_treated = self.datapre[self.treated_units]
-        # self.datapost_control = self.datapost[self.control_units]
-        # self.datapost_treated = self.datapost[self.treated_units]
+        # split data into the 4 quadrants (pre/post, control/treated) and store as
+        # xarray DataArray objects
         self.datapre_control = xr.DataArray(
             self.datapre[self.control_units],
             dims=["obs_ind", "control_units"],
@@ -137,14 +135,12 @@ def __init__(
                 "obs_ind": np.arange(self.datapre.shape[0]),
             }
             self.model.fit(
-                X=self.datapre_control.to_numpy(),
-                y=self.datapre_treated.to_numpy(),
+                X=self.datapre_control,
+                y=self.datapre_treated,
                 coords=COORDS,
             )
         elif isinstance(self.model, RegressorMixin):
-            self.model.fit(
-                X=self.datapre_control.to_numpy(), y=self.datapre_treated.to_numpy()
-            )
+            self.model.fit(X=self.datapre_control, y=self.datapre_treated)
         else:
             raise ValueError("Model type not recognized")
 
@@ -154,20 +150,10 @@ def __init__(
         )
 
         # get the model predictions of the observed (pre-intervention) data
-        self.pre_pred = self.model.predict(X=self.datapre_control.to_numpy())
+        self.pre_pred = self.model.predict(X=self.datapre_control)
 
         # calculate the counterfactual
-        self.post_pred = self.model.predict(X=self.datapost_control.to_numpy())
-        # TODO: Remove the need for this 'hack' by properly updating the coords when we
-        # run model.predict
-        # TEMPORARY HACK: --------------------------------------------------------------
-        # : set the coords (obs_ind) for self.post_pred to be the same as the datapost
-        # index. This is needed for xarray to properly do the comparison (-) between
-        # datapre_treated and self.post_pred
-        # self.post_pred["posterior_predictive"] = self.post_pred[
-        #     "posterior_predictive"
-        # ].assign_coords(obs_ind=self.datapost.index)
-        # ------------------------------------------------------------------------------
+        self.post_pred = self.model.predict(X=self.datapost_control)
         self.pre_impact = self.model.calculate_impact(
             self.datapre_treated, self.pre_pred
         )
diff --git a/causalpy/pymc_models.py b/causalpy/pymc_models.py
@@ -135,13 +135,20 @@ def predict(self, X):
         random_seed = self.sample_kwargs.get("random_seed", None)
         self._data_setter(X)
         with self:  # sample with new input data
-            post_pred = pm.sample_posterior_predictive(
+            pp = pm.sample_posterior_predictive(
                 self.idata,
                 var_names=["y_hat", "mu"],
                 progressbar=False,
                 random_seed=random_seed,
             )
-        return post_pred
+
+        # TODO: hack to copy X's obs_ind coords onto pp; do this in _data_setter?
+        if isinstance(X, xr.DataArray):
+            pp["posterior_predictive"] = pp["posterior_predictive"].assign_coords(
+                obs_ind=X.obs_ind
+            )
+
+        return pp
 
     def score(self, X, y) -> pd.Series:
         """Score the Bayesian :math:`R^2` given inputs ``X`` and outputs ``y``.
@@ -161,10 +168,13 @@ def score(self, X, y) -> pd.Series:
         return r2_score(y.flatten(), mu)
 
     def calculate_impact(
-        self, y_true: xr.DataArray, y_pred: az.InferenceData
+        self, y_true: xr.DataArray | np.ndarray, y_pred: az.InferenceData
     ) -> xr.DataArray:
+        if isinstance(y_true, np.ndarray):
+            y_true = xr.DataArray(y_true, dims=["obs_ind"])
+
         impact = y_true - y_pred["posterior_predictive"]["y_hat"]
-        return impact.transpose(..., "treated_units", "obs_ind")
+        return impact.transpose(..., "obs_ind")
 
     def calculate_cumulative_impact(self, impact):
         return impact.cumsum(dim="obs_ind")
@@ -269,9 +279,9 @@ def build_model(self, X, y, coords):
         with self:
             self.add_coords(coords)
             n_predictors = X.shape[1]
-            X = pm.Data("X", X, dims=["obs_ind", "control_units"])
+            X = pm.Data("X", X, dims=["obs_ind", "coeffs"])
             y = pm.Data("y", y[:, 0], dims="obs_ind")
-            beta = pm.Dirichlet("beta", a=np.ones(n_predictors), dims="control_units")
+            beta = pm.Dirichlet("beta", a=np.ones(n_predictors), dims="coeffs")
             sigma = pm.HalfNormal("sigma", 1)
             mu = pm.Deterministic("mu", pm.math.dot(X, beta), dims="obs_ind")
             pm.Normal("y_hat", mu, sigma, observed=y, dims="obs_ind")