first stab at regression kink design

drbenvincent · drbenvincent · commit 15ef4eca20d3 · 2023-10-20T13:17:42.000+01:00
diff --git a/causalpy/pymc_experiments.py b/causalpy/pymc_experiments.py
@@ -957,6 +957,219 @@ def summary(self) -> None:
         self.print_coefficients()
 
 
+class RegressionKink(ExperimentalDesign):
+    """
+    A class to analyse regression kink designs, in which the gradient of the
+    outcome with respect to the running variable may change at the kink point.
+
+    :param data:
+        A pandas dataframe. Must contain a dummy coded (0/1) ``treated`` column
+    :param formula:
+        A statistical model formula
+    :param kink_point:
+        Value of the running variable at which the gradient may change
+    :param model:
+        Model object used for fitting, scoring and prediction
+    :param running_variable_name:
+        Name of the column in ``data`` holding the running variable
+    :param epsilon:
+        Step either side of the kink point at which the model is evaluated to
+        form finite-difference estimates of the gradient. Must be positive
+    :param bandwidth:
+        If given, the model is fit only to data within ``kink_point`` plus or
+        minus ``bandwidth`` on the running variable. Must be positive
+    """
+
+    def __init__(
+        self,
+        data: pd.DataFrame,
+        formula: str,
+        kink_point: float,
+        model=None,
+        running_variable_name: str = "x",
+        epsilon: float = 0.001,
+        bandwidth: Optional[float] = None,
+        **kwargs,
+    ):
+        super().__init__(model=model, **kwargs)
+        self.expt_type = "Regression Kink"
+        self.data = data
+        self.formula = formula
+        self.running_variable_name = running_variable_name
+        self.kink_point = kink_point
+        self.epsilon = epsilon
+        self.bandwidth = bandwidth
+        self._input_validation()
+
+        if self.bandwidth is not None:
+            fmin = self.kink_point - self.bandwidth
+            fmax = self.kink_point + self.bandwidth
+            # Filter on the configured running variable, not a hard-coded `x`
+            in_window = self.data[self.running_variable_name].between(fmin, fmax)
+            filtered_data = self.data[in_window]
+            if len(filtered_data) <= 10:
+                warnings.warn(
+                    f"Choice of bandwidth parameter has led to only {len(filtered_data)} remaining datapoints. Consider increasing the bandwidth parameter.",  # noqa: E501
+                    UserWarning,
+                )
+            y, X = dmatrices(formula, filtered_data)
+        else:
+            y, X = dmatrices(formula, self.data)
+
+        self._y_design_info = y.design_info
+        self._x_design_info = X.design_info
+        self.labels = X.design_info.column_names
+        self.y, self.X = np.asarray(y), np.asarray(X)
+        self.outcome_variable_name = y.design_info.column_names[0]
+
+        COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])}
+        self.model.fit(X=self.X, y=self.y, coords=COORDS)
+
+        # score the goodness of fit to the data the model was fitted on
+        self.score = self.model.score(X=self.X, y=self.y)
+
+        # get the model predictions over a grid spanning the running variable
+        if self.bandwidth is not None:
+            xi = np.linspace(fmin, fmax, 200)
+        else:
+            xi = np.linspace(
+                np.min(self.data[self.running_variable_name]),
+                np.max(self.data[self.running_variable_name]),
+                200,
+            )
+        self.x_pred = pd.DataFrame(
+            {self.running_variable_name: xi, "treated": self._is_treated(xi)}
+        )
+        (new_x,) = build_design_matrices([self._x_design_info], self.x_pred)
+        self.pred = self.model.predict(X=np.asarray(new_x))
+
+        # Calculate the change in gradient by evaluating the function below the kink
+        # point, at the kink point, and above the kink point.
+        # NOTE: `"treated": np.array([0, 1, 1])` assumes treatment is applied at
+        # and above (not below) the kink point
+        self.x_discon = pd.DataFrame(
+            {
+                self.running_variable_name: np.array(
+                    [
+                        self.kink_point - self.epsilon,
+                        self.kink_point,
+                        self.kink_point + self.epsilon,
+                    ]
+                ),
+                "treated": np.array([0, 1, 1]),
+            }
+        )
+        (new_x,) = build_design_matrices([self._x_design_info], self.x_discon)
+        self.pred_discon = self.model.predict(X=np.asarray(new_x))
+
+        # Finite-difference gradients either side of the kink. The differences in
+        # `mu` must be divided by the step size `epsilon` to be gradients rather
+        # than raw changes in the outcome.
+        mu = self.pred_discon["posterior_predictive"]["mu"]
+        mu_left, mu_kink, mu_right = (mu.sel(obs_ind=i) for i in range(3))
+        self.gradient_left = (mu_kink - mu_left) / self.epsilon
+        self.gradient_right = (mu_right - mu_kink) / self.epsilon
+        self.gradient_change = self.gradient_right - self.gradient_left
+
+    def _input_validation(self):
+        """Validate the input data and arguments for correctness
+
+        :raises DataException: if the ``treated`` column is not dummy coded
+        :raises ValueError: if ``epsilon`` or ``bandwidth`` is not positive
+        """
+        # if "treated" not in self.formula:
+        #     raise FormulaException(
+        #         "A predictor called `treated` should be in the formula"
+        #     )
+
+        if _is_variable_dummy_coded(self.data["treated"]) is False:
+            raise DataException(
+                """The treated variable should be dummy coded. Consisting of 0's and 1's only."""  # noqa: E501
+            )
+
+        # epsilon is the finite-difference step: zero gives an undefined gradient
+        # and a negative value swaps the left and right estimates
+        if self.epsilon <= 0:
+            raise ValueError("The epsilon parameter must be greater than zero.")
+
+        if self.bandwidth is not None and self.bandwidth <= 0:
+            raise ValueError("The bandwidth parameter must be greater than zero.")
+
+    def _is_treated(self, x):
+        """Returns ``True`` if `x` is greater than or equal to the kink point.
+
+        .. warning::
+
+            Assumes treatment is given to those at or ABOVE the kink point.
+        """
+        return np.greater_equal(x, self.kink_point)
+
+    def plot(self):
+        """
+        Plot the raw data, the model fit, and the kink point
+
+        :return: the matplotlib ``(fig, ax)`` tuple
+        """
+        fig, ax = plt.subplots()
+        # Plot raw data
+        sns.scatterplot(
+            self.data,
+            x=self.running_variable_name,
+            y=self.outcome_variable_name,
+            c="k",  # hue="treated",
+            ax=ax,
+        )
+
+        # Plot model fit to data
+        h_line, h_patch = plot_xY(
+            self.x_pred[self.running_variable_name],
+            self.pred["posterior_predictive"].mu,
+            ax=ax,
+            plot_hdi_kwargs={"color": "C1"},
+        )
+        handles = [(h_line, h_patch)]
+        labels = ["Posterior mean"]
+
+        # create strings to compose title
+        title_info = f"{self.score.r2:.3f} (std = {self.score.r2_std:.3f})"
+        r2 = f"Bayesian $R^2$ on all data = {title_info}"
+        percentiles = self.gradient_change.quantile([0.03, 1 - 0.03]).values
+        ci = r"$CI_{94\%}$" + f"[{percentiles[0]:.2f}, {percentiles[1]:.2f}]"
+        # single-line f-string: a triple-quoted one would leak newlines and
+        # source indentation into the title
+        grad_change = f"Change in gradient = {self.gradient_change.mean():.2f}, "
+        ax.set(title=r2 + "\n" + grad_change + ci)
+        # Kink point line
+        ax.axvline(
+            x=self.kink_point,
+            ls="-",
+            lw=3,
+            color="r",
+            label="treatment threshold",
+        )
+        ax.legend(
+            handles=handles,
+            labels=labels,
+            fontsize=LEGEND_FONT_SIZE,
+        )
+        return (fig, ax)
+
+    def summary(self) -> None:
+        """
+        Print text output summarising the results
+        """
+
+        print(f"{self.expt_type:=^80}")
+        print(f"Formula: {self.formula}")
+        print(f"Running variable: {self.running_variable_name}")
+        print(f"Kink point on running variable: {self.kink_point}")
+        print("\nResults:")
+        print(
+            f"Change in gradient at kink point = {self.gradient_change.mean():.2f}"
+        )
+        self.print_coefficients()
+
+
 class PrePostNEGD(ExperimentalDesign):
     """
     A class to analyse data from pretest/posttest designs
diff --git a/docs/source/_static/interrogate_badge.svg b/docs/source/_static/interrogate_badge.svg
@@ -1,5 +1,5 @@
 <svg width="140" height="20" viewBox="0 0 140 20" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" xmlns:serif="http://www.serif.com/" style="fill-rule:evenodd;clip-rule:evenodd;stroke-linejoin:round;stroke-miterlimit:2;">
-    <title>interrogate: 97.3%</title>
+    <title>interrogate: 96.7%</title>
     <g transform="matrix(1,0,0,1,22,0)">
         <g id="backgrounds" transform="matrix(1.32789,0,0,1,-22.3892,0)">
             <rect x="0" y="0" width="71" height="20" style="fill:rgb(85,85,85);"/>
@@ -12,8 +12,8 @@
     <g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="110">
         <text x="590" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="610">interrogate</text>
         <text x="590" y="140" transform="scale(.1)" textLength="610">interrogate</text>
-        <text x="1160" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="370" data-interrogate="result">97.3%</text>
-        <text x="1160" y="140" transform="scale(.1)" textLength="370" data-interrogate="result">97.3%</text>
+        <text x="1160" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="370" data-interrogate="result">96.7%</text>
+        <text x="1160" y="140" transform="scale(.1)" textLength="370" data-interrogate="result">96.7%</text>
     </g>
     <g id="logo-shadow" serif:id="logo shadow" transform="matrix(0.854876,0,0,0.854876,-6.73514,1.732)">
         <g transform="matrix(0.299012,0,0,0.299012,9.70229,-6.68582)">