Commit 7bbff4f
Commit message: initial efforts
1 parent: a39e015

File tree: 3 files changed, +143 -294 lines


causalpy/experiments/synthetic_control.py

Lines changed: 107 additions & 43 deletions
@@ -20,8 +20,8 @@
 import arviz as az
 import numpy as np
 import pandas as pd
+import xarray as xr
 from matplotlib import pyplot as plt
-from patsy import build_design_matrices, dmatrices
 from sklearn.base import RegressorMixin
 
 from causalpy.custom_exceptions import BadIndexException
@@ -41,8 +41,10 @@ class SyntheticControl(BaseExperiment):
         A pandas dataframe
     :param treatment_time:
         The time when treatment occurred, should be in reference to the data index
-    :param formula:
-        A statistical model formula
+    :param control_units:
+        A list of control units to be used in the experiment
+    :param treated_units:
+        A list of treated units to be used in the experiment
     :param model:
         A PyMC model
 
@@ -55,7 +57,8 @@ class SyntheticControl(BaseExperiment):
     >>> result = cp.SyntheticControl(
     ...     df,
     ...     treatment_time,
-    ...     formula="actual ~ 0 + a + b + c + d + e + f + g",
+    ...     control_units=["a", "b", "c", "d", "e", "f", "g"],
+    ...     treated_units=["actual"],
     ...     model=cp.pymc_models.WeightedSumFitter(
     ...         sample_kwargs={
     ...             "target_accept": 0.95,
@@ -66,63 +69,111 @@ class SyntheticControl(BaseExperiment):
     ... )
     """
 
-    expt_type = "SyntheticControl"
     supports_ols = True
     supports_bayes = True
 
     def __init__(
        self,
        data: pd.DataFrame,
        treatment_time: Union[int, float, pd.Timestamp],
-       formula: str,
+       control_units: list[str],
+       treated_units: list[str],
        model=None,
        **kwargs,
     ) -> None:
        super().__init__(model=model)
        self.input_validation(data, treatment_time)
        self.treatment_time = treatment_time
-       # set experiment type - usually done in subclasses
-       self.expt_type = "Pre-Post Fit"
+       self.control_units = control_units
+       self.treated_units = treated_units
+       self.expt_type = "SyntheticControl"
        # split data in to pre and post intervention
        self.datapre = data[data.index < self.treatment_time]
        self.datapost = data[data.index >= self.treatment_time]
 
-       self.formula = formula
-
-       # set things up with pre-intervention data
-       y, X = dmatrices(formula, self.datapre)
-       self.outcome_variable_name = y.design_info.column_names[0]
-       self._y_design_info = y.design_info
-       self._x_design_info = X.design_info
-       self.labels = X.design_info.column_names
-       self.pre_y, self.pre_X = np.asarray(y), np.asarray(X)
-       # process post-intervention data
-       (new_y, new_x) = build_design_matrices(
-           [self._y_design_info, self._x_design_info], self.datapost
+       # split data into the 4 quadrants (pre/post, control/treated) and store as xarray dataarray
+       # self.datapre_control = self.datapre[self.control_units]
+       # self.datapre_treated = self.datapre[self.treated_units]
+       # self.datapost_control = self.datapost[self.control_units]
+       # self.datapost_treated = self.datapost[self.treated_units]
+       self.datapre_control = xr.DataArray(
+           self.datapre[self.control_units],
+           dims=["obs_ind", "control_units"],
+           coords={
+               "obs_ind": self.datapre[self.control_units].index,
+               "control_units": self.control_units,
+           },
+       )
+       self.datapre_treated = xr.DataArray(
+           self.datapre[self.treated_units],
+           dims=["obs_ind", "treated_units"],
+           coords={
+               "obs_ind": self.datapre[self.treated_units].index,
+               "treated_units": self.treated_units,
+           },
+       )
+       self.datapost_control = xr.DataArray(
+           self.datapost[self.control_units],
+           dims=["obs_ind", "control_units"],
+           coords={
+               "obs_ind": self.datapost[self.control_units].index,
+               "control_units": self.control_units,
+           },
+       )
+       self.datapost_treated = xr.DataArray(
+           self.datapost[self.treated_units],
+           dims=["obs_ind", "treated_units"],
+           coords={
+               "obs_ind": self.datapost[self.treated_units].index,
+               "treated_units": self.treated_units,
+           },
        )
-       self.post_X = np.asarray(new_x)
-       self.post_y = np.asarray(new_y)
 
        # fit the model to the observed (pre-intervention) data
        if isinstance(self.model, PyMCModel):
-           COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.pre_X.shape[0])}
-           self.model.fit(X=self.pre_X, y=self.pre_y, coords=COORDS)
+           COORDS = {
+               "control_units": self.control_units,
+               "treated_units": self.treated_units,
+               "obs_indx": np.arange(self.datapre.shape[0]),
+           }
+           self.model.fit(
+               X=self.datapre_control.to_numpy(),
+               y=self.datapre_treated.to_numpy(),
+               coords=COORDS,
+           )
        elif isinstance(self.model, RegressorMixin):
-           self.model.fit(X=self.pre_X, y=self.pre_y)
+           self.model.fit(
+               X=self.datapre_control.to_numpy(), y=self.datapre_treated.to_numpy()
+           )
        else:
            raise ValueError("Model type not recognized")
 
        # score the goodness of fit to the pre-intervention data
-       self.score = self.model.score(X=self.pre_X, y=self.pre_y)
+       self.score = self.model.score(
+           X=self.datapre_control.to_numpy(), y=self.datapre_treated.to_numpy()
+       )
 
        # get the model predictions of the observed (pre-intervention) data
-       self.pre_pred = self.model.predict(X=self.pre_X)
+       self.pre_pred = self.model.predict(X=self.datapre_control.to_numpy())
 
        # calculate the counterfactual
-       self.post_pred = self.model.predict(X=self.post_X)
-       self.pre_impact = self.model.calculate_impact(self.pre_y[:, 0], self.pre_pred)
+       self.post_pred = self.model.predict(X=self.datapost_control.to_numpy())
+       # TODO: Remove the need for this 'hack' by properly updating the coords when we
+       # run model.predict
+       # TEMPORARY HACK: --------------------------------------------------------------
+       # set the coords (obs_ind) for self.post_pred to be the same as the datapost
+       # index. This is needed for xarray to properly do the comparison (-) between
+       # datapre_treated and self.post_pred
+       # self.post_pred["posterior_predictive"] = self.post_pred[
+       #     "posterior_predictive"
+       # ].assign_coords(obs_ind=self.datapost.index)
+       # ------------------------------------------------------------------------------
+       self.pre_impact = self.model.calculate_impact(
+           self.datapre_treated, self.pre_pred
+       )
 
        self.post_impact = self.model.calculate_impact(
-           self.post_y[:, 0], self.post_pred
+           self.datapost_treated, self.post_pred
        )
        self.post_impact_cumulative = self.model.calculate_cumulative_impact(
            self.post_impact
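A minimal sketch of the quadrant split introduced in the hunk above, on toy data. The DataFrame, its column names ("a", "b" as controls, "actual" as treated) and the treatment time are assumed for illustration only; they are not taken from the package's tests.

import numpy as np
import pandas as pd
import xarray as xr

rng = np.random.default_rng(0)
index = pd.date_range("2020-01-01", periods=10, freq="D")
df = pd.DataFrame(
    {
        "a": rng.normal(size=10),
        "b": rng.normal(size=10),
        "actual": rng.normal(size=10),
    },
    index=index,
)
treatment_time = pd.Timestamp("2020-01-06")
control_units, treated_units = ["a", "b"], ["actual"]

# pre-intervention rows, control columns, wrapped as a 2-D labelled array
datapre = df[df.index < treatment_time]
datapre_control = xr.DataArray(
    datapre[control_units],
    dims=["obs_ind", "control_units"],
    coords={"obs_ind": datapre.index, "control_units": control_units},
)
print(datapre_control.dims)   # ('obs_ind', 'control_units')
print(datapre_control.shape)  # (5, 2)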
@@ -150,7 +201,11 @@ def summary(self, round_to=None) -> None:
            Number of decimals used to round results. Defaults to 2. Use "None" to return raw numbers
        """
        print(f"{self.expt_type:=^80}")
-       print(f"Formula: {self.formula}")
+       print(f"Control units: {self.control_units}")
+       if len(self.treated_units) > 1:
+           print(f"Treated units: {self.treated_units}")
+       else:
+           print(f"Treated unit: {self.treated_units[0]}")
        self.print_coefficients(round_to)
 
    def _bayesian_plot(
@@ -176,7 +231,9 @@ def _bayesian_plot(
        handles = [(h_line, h_patch)]
        labels = ["Pre-intervention period"]
 
-       (h,) = ax[0].plot(self.datapre.index, self.pre_y, "k.", label="Observations")
+       (h,) = ax[0].plot(
+           self.datapre.index, self.datapre_treated, "k.", label="Observations"
+       )
        handles.append(h)
        labels.append("Observations")
 
@@ -190,14 +247,14 @@ def _bayesian_plot(
        handles.append((h_line, h_patch))
        labels.append(counterfactual_label)
 
-       ax[0].plot(self.datapost.index, self.post_y, "k.")
+       ax[0].plot(self.datapost.index, self.datapost_treated, "k.")
        # Shaded causal effect
        h = ax[0].fill_between(
            self.datapost.index,
            y1=az.extract(
                self.post_pred, group="posterior_predictive", var_names="mu"
            ).mean("sample"),
-           y2=np.squeeze(self.post_y),
+           y2=np.squeeze(self.datapost_treated),
            color="C0",
            alpha=0.25,
        )
@@ -214,20 +271,20 @@ def _bayesian_plot(
        # MIDDLE PLOT -----------------------------------------------
        plot_xY(
            self.datapre.index,
-           self.pre_impact,
+           self.pre_impact.sel(treated_units="actual"),
            ax=ax[1],
            plot_hdi_kwargs={"color": "C0"},
        )
        plot_xY(
            self.datapost.index,
-           self.post_impact,
+           self.post_impact.sel(treated_units="actual"),
            ax=ax[1],
            plot_hdi_kwargs={"color": "C1"},
        )
        ax[1].axhline(y=0, c="k")
        ax[1].fill_between(
            self.datapost.index,
-           y1=self.post_impact.mean(["chain", "draw"]),
+           y1=self.post_impact.mean(["chain", "draw"]).sel(treated_units="actual"),
            color="C0",
            alpha=0.25,
            label="Causal impact",
@@ -238,7 +295,7 @@ def _bayesian_plot(
        ax[2].set(title="Cumulative Causal Impact")
        plot_xY(
            self.datapost.index,
-           self.post_impact_cumulative,
+           self.post_impact_cumulative.sel(treated_units="actual"),
            ax=ax[2],
            plot_hdi_kwargs={"color": "C1"},
        )
@@ -259,15 +316,22 @@ def _bayesian_plot(
            fontsize=LEGEND_FONT_SIZE,
        )
 
-       # code above: same as `PrePostFit._bayesian_plot` -------------------------------
-       # code below: additional for the synthetic control experiment ------------------
-
        plot_predictors = kwargs.get("plot_predictors", False)
        if plot_predictors:
            # plot control units as well
-           ax[0].plot(self.datapre.index, self.pre_X, "-", c=[0.8, 0.8, 0.8], zorder=1)
            ax[0].plot(
-               self.datapost.index, self.post_X, "-", c=[0.8, 0.8, 0.8], zorder=1
+               self.datapre.index,
+               self.datapre_control,
+               "-",
+               c=[0.8, 0.8, 0.8],
+               zorder=1,
+           )
+           ax[0].plot(
+               self.datapost.index,
+               self.datapost_control,
+               "-",
+               c=[0.8, 0.8, 0.8],
+               zorder=1,
            )
 
        return fig, ax
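The plotting hunks above select one treated unit with the hardcoded coordinate label "actual". A minimal sketch of what that selection does, using assumed toy dimensions rather than real posterior output:

import numpy as np
import xarray as xr

# impact with the dims produced by calculate_impact in this commit
impact = xr.DataArray(
    np.zeros((2, 10, 1, 5)),
    dims=["chain", "draw", "treated_units", "obs_ind"],
    coords={"treated_units": ["actual"], "obs_ind": np.arange(5)},
)
single_unit = impact.sel(treated_units="actual")   # selecting a scalar label drops the dim
mean_impact = single_unit.mean(["chain", "draw"])  # 1-D series over obs_ind, ready to plot
print(mean_impact.dims)  # ('obs_ind',)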

causalpy/pymc_models.py

Lines changed: 9 additions & 11 deletions
@@ -89,6 +89,7 @@ def _data_setter(self, X) -> None:
        prediction.
        """
        with self:
+           # TODO: update coords
            pm.set_data({"X": X})
 
    def fit(self, X, y, coords: Optional[Dict[str, Any]] = None) -> None:
@@ -150,10 +151,11 @@ def score(self, X, y) -> pd.Series:
        # Note: First argument must be a 1D array
        return r2_score(y.flatten(), mu)
 
-   def calculate_impact(self, y_true, y_pred):
-       pre_data = xr.DataArray(y_true, dims=["obs_ind"])
-       impact = pre_data - y_pred["posterior_predictive"]["y_hat"]
-       return impact.transpose(..., "obs_ind")
+   def calculate_impact(
+       self, y_true: xr.DataArray, y_pred: az.InferenceData
+   ) -> xr.DataArray:
+       impact = y_true - y_pred["posterior_predictive"]["y_hat"]
+       return impact.transpose(..., "treated_units", "obs_ind")
 
    def calculate_cumulative_impact(self, impact):
        return impact.cumsum(dim="obs_ind")
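The new calculate_impact leans on xarray broadcasting: y_true carries (obs_ind, treated_units) dims while y_hat from the posterior predictive carries (chain, draw, obs_ind), so their difference carries all four. A standalone sketch with assumed toy shapes, not the package's test code:

import numpy as np
import xarray as xr

obs_ind = np.arange(5)
y_true = xr.DataArray(
    np.ones((5, 1)),
    dims=["obs_ind", "treated_units"],
    coords={"obs_ind": obs_ind, "treated_units": ["actual"]},
)
y_hat = xr.DataArray(
    np.zeros((2, 10, 5)),
    dims=["chain", "draw", "obs_ind"],
    coords={"obs_ind": obs_ind},
)
# broadcasting aligns on obs_ind and keeps chain, draw, treated_units
impact = (y_true - y_hat).transpose(..., "treated_units", "obs_ind")
print(impact.dims)  # ('chain', 'draw', 'treated_units', 'obs_ind')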
@@ -255,17 +257,13 @@ def build_model(self, X, y, coords):
        """
        Defines the PyMC model
        """
+       print(coords)
        with self:
            self.add_coords(coords)
            n_predictors = X.shape[1]
-           X = pm.Data("X", X, dims=["obs_ind", "coeffs"])
+           X = pm.Data("X", X, dims=["obs_ind", "control_units"])
            y = pm.Data("y", y[:, 0], dims="obs_ind")
-           # TODO: There we should allow user-specified priors here
-           beta = pm.Dirichlet("beta", a=np.ones(n_predictors), dims="coeffs")
-           # beta = pm.Dirichlet(
-           #     name="beta", a=(1 / n_predictors) * np.ones(n_predictors),
-           #     dims="coeffs"
-           # )
+           beta = pm.Dirichlet("beta", a=np.ones(n_predictors), dims="control_units")
            sigma = pm.HalfNormal("sigma", 1)
            mu = pm.Deterministic("mu", pm.math.dot(X, beta), dims="obs_ind")
            pm.Normal("y_hat", mu, sigma, observed=y, dims="obs_ind")
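For context, a self-contained sketch of the weighted-sum structure defined in the hunk above: Dirichlet weights over the control units, their linear combination as mu, and a Normal likelihood on the treated series. The toy data and unit names are assumed for illustration; they are not from the repository.

import numpy as np
import pymc as pm

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))              # 50 time points, 3 control units
true_beta = np.array([0.2, 0.5, 0.3])     # weights summing to 1
y = X @ true_beta + rng.normal(0, 0.1, size=50)

coords = {"obs_ind": np.arange(50), "control_units": ["a", "b", "c"]}
with pm.Model(coords=coords) as model:
    X_ = pm.Data("X", X, dims=["obs_ind", "control_units"])
    # simplex-constrained weights over the control units
    beta = pm.Dirichlet("beta", a=np.ones(3), dims="control_units")
    sigma = pm.HalfNormal("sigma", 1)
    mu = pm.Deterministic("mu", pm.math.dot(X_, beta), dims="obs_ind")
    pm.Normal("y_hat", mu, sigma, observed=y, dims="obs_ind")
    idata = pm.sample(1000, tune=1000, target_accept=0.95, random_seed=0)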
