RegressionEstimator class to combine common elements of Linear and Logistic Regression Estimator classes.

jmafoster1 · jmafoster1 · commit 05c5499629e6 · 2024-08-08T12:19:19.000+01:00
diff --git a/causal_testing/estimation/cubic_spline_estimator.py b/causal_testing/estimation/cubic_spline_estimator.py
@@ -46,7 +46,7 @@ def __init__(
             self.formula = f"{outcome} ~ cr({'+'.join(terms)}, df={basis})"
 
     def estimate_ate_calculated(self, adjustment_config: dict = None) -> pd.Series:
-        model = self._run_linear_regression()
+        model = self._run_regression()
 
         x = {"Intercept": 1, self.treatment: self.treatment_value}
         if adjustment_config is not None:
diff --git a/causal_testing/estimation/linear_regression_estimator.py b/causal_testing/estimation/linear_regression_estimator.py
@@ -5,22 +5,22 @@
 
 import pandas as pd
 import statsmodels.formula.api as smf
-from patsy import dmatrix  # pylint: disable = no-name-in-module
-from patsy import ModelDesc
-from statsmodels.regression.linear_model import RegressionResultsWrapper
+from patsy import dmatrix, ModelDesc  # pylint: disable = no-name-in-module
 
 from causal_testing.specification.variable import Variable
 from causal_testing.estimation.gp import GP
-from causal_testing.estimation.estimator import Estimator
+from causal_testing.estimation.regression_estimator import RegressionEstimator
 
 logger = logging.getLogger(__name__)
 
 
-class LinearRegressionEstimator(Estimator):
+class LinearRegressionEstimator(RegressionEstimator):
     """A Linear Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
     combination of parameters and functions of the variables (note these functions need not be linear).
     """
 
+    regressor = smf.ols
+
     def __init__(
         # pylint: disable=too-many-arguments
         self,
@@ -35,6 +35,7 @@ def __init__(
         alpha: float = 0.05,
         query: str = "",
     ):
+        # pylint: disable=too-many-arguments
         super().__init__(
             treatment,
             treatment_value,
@@ -43,20 +44,10 @@ def __init__(
             outcome,
             df,
             effect_modifiers,
-            alpha=alpha,
-            query=query,
+            formula,
+            alpha,
+            query,
         )
-
-        self.model = None
-        if effect_modifiers is None:
-            effect_modifiers = []
-
-        if formula is not None:
-            self.formula = formula
-        else:
-            terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
-            self.formula = f"{outcome} ~ {'+'.join(terms)}"
-
         for term in self.effect_modifiers:
             self.adjustment_set.add(term)
 
@@ -118,7 +109,7 @@ def estimate_coefficient(self) -> tuple[pd.Series, list[pd.Series, pd.Series]]:
 
         :return: The unit average treatment effect and the 95% Wald confidence intervals.
         """
-        model = self._run_linear_regression()
+        model = self._run_regression()
         newline = "\n"
         patsy_md = ModelDesc.from_formula(self.treatment)
 
@@ -147,7 +138,7 @@ def estimate_ate(self) -> tuple[pd.Series, list[pd.Series, pd.Series]]:
 
         :return: The average treatment effect and the 95% Wald confidence intervals.
         """
-        model = self._run_linear_regression()
+        model = self._run_regression()
 
         # Create an empty individual for the control and treated
         individuals = pd.DataFrame(1, index=["control", "treated"], columns=model.params.index)
@@ -167,37 +158,6 @@ def estimate_ate(self) -> tuple[pd.Series, list[pd.Series, pd.Series]]:
         confidence_intervals = [pd.Series(interval) for interval in confidence_intervals]
         return ate, confidence_intervals
 
-    def estimate_control_treatment(self, adjustment_config: dict = None) -> tuple[pd.Series, pd.Series]:
-        """Estimate the outcomes under control and treatment.
-
-        :return: The estimated outcome under control and treatment in the form
-        (control_outcome, treatment_outcome).
-        """
-        if adjustment_config is None:
-            adjustment_config = {}
-        model = self._run_linear_regression()
-
-        x = pd.DataFrame(columns=self.df.columns)
-        x[self.treatment] = [self.treatment_value, self.control_value]
-        x["Intercept"] = 1  # self.intercept
-
-        print(x[self.treatment])
-        for k, v in adjustment_config.items():
-            x[k] = v
-        for k, v in self.effect_modifiers.items():
-            x[k] = v
-        x = dmatrix(self.formula.split("~")[1], x, return_type="dataframe")
-        for col in x:
-            if str(x.dtypes[col]) == "object":
-                x = pd.get_dummies(x, columns=[col], drop_first=True)
-        x = x[model.params.index]
-
-        x[self.treatment] = [self.treatment_value, self.control_value]
-
-        y = model.get_prediction(x).summary_frame()
-
-        return y.iloc[1], y.iloc[0]
-
     def estimate_risk_ratio(self, adjustment_config: dict = None) -> tuple[pd.Series, list[pd.Series, pd.Series]]:
         """Estimate the risk_ratio effect of the treatment on the outcome. That is, the change in outcome caused
         by changing the treatment variable from the control value to the treatment value.
@@ -206,7 +166,8 @@ def estimate_risk_ratio(self, adjustment_config: dict = None) -> tuple[pd.Series
         """
         if adjustment_config is None:
             adjustment_config = {}
-        control_outcome, treatment_outcome = self.estimate_control_treatment(adjustment_config=adjustment_config)
+        prediction = self._predict(adjustment_config=adjustment_config)
+        control_outcome, treatment_outcome = prediction.iloc[1], prediction.iloc[0]
         ci_low = pd.Series(treatment_outcome["mean_ci_lower"] / control_outcome["mean_ci_upper"])
         ci_high = pd.Series(treatment_outcome["mean_ci_upper"] / control_outcome["mean_ci_lower"])
         return pd.Series(treatment_outcome["mean"] / control_outcome["mean"]), [ci_low, ci_high]
@@ -221,20 +182,12 @@ def estimate_ate_calculated(self, adjustment_config: dict = None) -> tuple[pd.Se
         """
         if adjustment_config is None:
             adjustment_config = {}
-        control_outcome, treatment_outcome = self.estimate_control_treatment(adjustment_config=adjustment_config)
+        prediction = self._predict(adjustment_config=adjustment_config)
+        control_outcome, treatment_outcome = prediction.iloc[1], prediction.iloc[0]
         ci_low = pd.Series(treatment_outcome["mean_ci_lower"] - control_outcome["mean_ci_upper"])
         ci_high = pd.Series(treatment_outcome["mean_ci_upper"] - control_outcome["mean_ci_lower"])
         return pd.Series(treatment_outcome["mean"] - control_outcome["mean"]), [ci_low, ci_high]
 
-    def _run_linear_regression(self) -> RegressionResultsWrapper:
-        """Run linear regression of the treatment and adjustment set against the outcome and return the model.
-
-        :return: The model after fitting to data.
-        """
-        model = smf.ols(formula=self.formula, data=self.df).fit()
-        self.model = model
-        return model
-
     def _get_confidence_intervals(self, model, treatment):
         confidence_intervals = model.conf_int(alpha=self.alpha, cols=None)
         ci_low, ci_high = (
diff --git a/causal_testing/estimation/logistic_regression_estimator.py b/causal_testing/estimation/logistic_regression_estimator.py
@@ -1,59 +1,25 @@
 """This module contains the LogisticRegressionEstimator class for estimating categorical outcomes."""
 
 import logging
-from typing import Any
 from math import ceil
 
 import numpy as np
 import pandas as pd
 import statsmodels.formula.api as smf
-from patsy import dmatrix  # pylint: disable = no-name-in-module
-from statsmodels.regression.linear_model import RegressionResultsWrapper
 from statsmodels.tools.sm_exceptions import PerfectSeparationError
 
-from causal_testing.estimation.estimator import Estimator
+from causal_testing.estimation.regression_estimator import RegressionEstimator
 
 logger = logging.getLogger(__name__)
 
 
-class LogisticRegressionEstimator(Estimator):
+class LogisticRegressionEstimator(RegressionEstimator):
     """A Logistic Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
     combination of parameters and functions of the variables (note these functions need not be linear). It is designed
     for estimating categorical outcomes.
     """
 
-    def __init__(
-        # pylint: disable=too-many-arguments
-        self,
-        treatment: str,
-        treatment_value: float,
-        control_value: float,
-        adjustment_set: set,
-        outcome: str,
-        df: pd.DataFrame = None,
-        effect_modifiers: dict[str:Any] = None,
-        formula: str = None,
-        query: str = "",
-    ):
-        super().__init__(
-            treatment=treatment,
-            treatment_value=treatment_value,
-            control_value=control_value,
-            adjustment_set=adjustment_set,
-            outcome=outcome,
-            df=df,
-            effect_modifiers=effect_modifiers,
-            query=query,
-        )
-
-        self.model = None
-        if effect_modifiers is None:
-            effect_modifiers = []
-        if formula is not None:
-            self.formula = formula
-        else:
-            terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(self.effect_modifiers))
-            self.formula = f"{outcome} ~ {'+'.join(((terms)))}"
+    regressor = smf.logit
 
     def add_modelling_assumptions(self):
         """
@@ -68,43 +34,6 @@ def add_modelling_assumptions(self):
         self.modelling_assumptions.append("The outcome must be binary.")
         self.modelling_assumptions.append("Independently and identically distributed errors.")
 
-    def _run_logistic_regression(self, data) -> RegressionResultsWrapper:
-        """Run logistic regression of the treatment and adjustment set against the outcome and return the model.
-
-        :return: The model after fitting to data.
-        """
-        model = smf.logit(formula=self.formula, data=data).fit(disp=0)
-        self.model = model
-        return model
-
-    def estimate(self, data: pd.DataFrame, adjustment_config: dict = None) -> RegressionResultsWrapper:
-        """add terms to the dataframe and estimate the outcome from the data
-        :param data: A pandas dataframe containing execution data from the system-under-test.
-        :param adjustment_config: Dictionary containing the adjustment configuration of the adjustment set
-        """
-        if adjustment_config is None:
-            adjustment_config = {}
-        if set(self.adjustment_set) != set(adjustment_config):
-            raise ValueError(
-                f"Invalid adjustment configuration {adjustment_config}. Must specify values for {self.adjustment_set}"
-            )
-
-        model = self._run_logistic_regression(data)
-
-        x = pd.DataFrame(columns=self.df.columns)
-        x["Intercept"] = 1  # self.intercept
-        x[self.treatment] = [self.treatment_value, self.control_value]
-        for k, v in adjustment_config.items():
-            x[k] = v
-        for k, v in self.effect_modifiers.items():
-            x[k] = v
-        x = dmatrix(self.formula.split("~")[1], x, return_type="dataframe")
-        for col in x:
-            if str(x.dtypes[col]) == "object":
-                x = pd.get_dummies(x, columns=[col], drop_first=True)
-        # x = x[model.params.index]
-        return model.predict(x)
-
     def estimate_control_treatment(
         self, adjustment_config: dict = None, bootstrap_size: int = 100
     ) -> tuple[pd.Series, pd.Series]:
@@ -115,11 +44,13 @@ def estimate_control_treatment(
         """
         if adjustment_config is None:
             adjustment_config = {}
-        y = self.estimate(self.df, adjustment_config=adjustment_config)
+        y = self._predict(self.df, adjustment_config=adjustment_config)["predicted"]
 
         try:
             bootstrap_samples = [
-                self.estimate(self.df.sample(len(self.df), replace=True), adjustment_config=adjustment_config)
+                self._predict(self.df.sample(len(self.df), replace=True), adjustment_config=adjustment_config)[
+                    "predicted"
+                ]
                 for _ in range(bootstrap_size)
             ]
             control, treatment = zip(*[(x.iloc[1], x.iloc[0]) for x in bootstrap_samples])
@@ -214,5 +145,5 @@ def estimate_unit_odds_ratio(self) -> float:
 
         :return: The odds ratio. Confidence intervals are not yet supported.
         """
-        model = self._run_logistic_regression(self.df)
+        model = self._run_regression(self.df)
         return np.exp(model.params[self.treatment])
diff --git a/causal_testing/estimation/regression_estimator.py b/causal_testing/estimation/regression_estimator.py
@@ -2,16 +2,13 @@
 
 import logging
 from typing import Any
-from abc import abstractmethod, abstractmethod
+from abc import abstractmethod
 
 import pandas as pd
-import statsmodels.formula.api as smf
-from patsy import dmatrix  # pylint: disable = no-name-in-module
-from patsy import ModelDesc
 from statsmodels.regression.linear_model import RegressionResultsWrapper
+from patsy import dmatrix  # pylint: disable = no-name-in-module
 
 from causal_testing.specification.variable import Variable
-from causal_testing.estimation.gp import GP
 from causal_testing.estimation.estimator import Estimator
 
 logger = logging.getLogger(__name__)
@@ -50,17 +47,23 @@ def __init__(
         self.model = None
         if effect_modifiers is None:
             effect_modifiers = []
+        if adjustment_set is None:
+            adjustment_set = []
         if formula is not None:
             self.formula = formula
         else:
             terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
             self.formula = f"{outcome} ~ {'+'.join(terms)}"
-        for term in self.effect_modifiers:
-            self.adjustment_set.add(term)
 
     @property
     @abstractmethod
-    def regression(self):
+    def regressor(self):
+        """
+        The regressor to use, e.g. ols or logit.
+        This should be a property accessible with self.regressor.
+        Define as `regressor = ...` outside of __init__, not as `self.regressor = ...`, otherwise
+        you'll get a "cannot instantiate with abstract method" error.
+        """
         raise NotImplementedError("Subclasses must implement the 'model' property.")
 
     def add_modelling_assumptions(self):
@@ -81,6 +84,37 @@ def _run_regression(self, data=None) -> RegressionResultsWrapper:
         """
         if data is None:
             data = self.df
-        model = self.regression(formula=self.formula, data=data).fit(disp=0)
+        model = self.regressor(formula=self.formula, data=data).fit(disp=0)
         self.model = model
         return model
+
+    def _predict(self, data=None, adjustment_config: dict = None) -> tuple[pd.Series, pd.Series]:
+        """Estimate the outcomes under control and treatment.
+
+        :param data: The data to use, defaults to `self.df`. Controllable for bootstrap sampling.
+        :param adjustment_config: The values of the adjustment variables to use.
+
+        :return: The estimated outcome under control and treatment, with confidence intervals, as a dataframe whose
+                 columns depend on the regressor (e.g. "predicted" for logit; "mean", "mean_ci_lower" for ols).
+        """
+        if adjustment_config is None:
+            adjustment_config = {}
+
+        model = self._run_regression(data)
+
+        x = pd.DataFrame(columns=self.df.columns)
+        x["Intercept"] = 1  # self.intercept
+        x[self.treatment] = [self.treatment_value, self.control_value]
+
+        for k, v in adjustment_config.items():
+            x[k] = v
+        for k, v in self.effect_modifiers.items():
+            x[k] = v
+        x = dmatrix(self.formula.split("~")[1], x, return_type="dataframe")
+        for col in x:
+            if str(x.dtypes[col]) == "object":
+                x = pd.get_dummies(x, columns=[col], drop_first=True)
+
+        # This has to be here in case the treatment variable is in an I(...) block in the self.formula
+        x[self.treatment] = [self.treatment_value, self.control_value]
+        return model.get_prediction(x).summary_frame()
diff --git a/tests/estimation_tests/test_cubic_spline_estimator.py b/tests/estimation_tests/test_cubic_spline_estimator.py
@@ -27,7 +27,7 @@ def test_program_11_3_cublic_spline(self):
 
         cublic_spline_estimator = CubicSplineRegressionEstimator("treatments", 1, 0, set(), "outcomes", 3, df)
 
-        model = cublic_spline_estimator._run_linear_regression()
+        ate_1 = cublic_spline_estimator.estimate_ate_calculated()
 
         self.assertEqual(
             round(
@@ -37,7 +37,6 @@ def test_program_11_3_cublic_spline(self):
             195.6,
         )
 
-        ate_1 = cublic_spline_estimator.estimate_ate_calculated()
         cublic_spline_estimator.treatment_value = 2
         ate_2 = cublic_spline_estimator.estimate_ate_calculated()