all tests pass

jmafoster1 · jmafoster1 · commit e7deb788c3de · 2024-08-08T10:54:19.000+01:00
diff --git a/causal_testing/estimation/gp.py b/causal_testing/estimation/gp.py
@@ -98,6 +98,8 @@ class GP:
     Object to perform genetic programming.
     """
 
+    # pylint: disable=too-many-instance-attributes
+
     def __init__(
         self,
         df: pd.DataFrame,
@@ -109,7 +111,6 @@ def __init__(
         seed=0,
     ):
         # pylint: disable=too-many-arguments
-        # pylint: disable=too-many-instance-attributes
         random.seed(seed)
         self.df = df
         self.features = features
diff --git a/causal_testing/estimation/logistic_regression_estimator.py b/causal_testing/estimation/logistic_regression_estimator.py
@@ -47,7 +47,8 @@ def __init__(
         )
 
         self.model = None
-
+        if effect_modifiers is None:
+            effect_modifiers = []
         if formula is not None:
             self.formula = formula
         else:
diff --git a/causal_testing/estimation/regression_estimator.py b/causal_testing/estimation/regression_estimator.py
@@ -0,0 +1,86 @@
+"""This module contains the RegressionEstimator, which is an abstract class for concrete regression estimators."""
+
+import logging
+from typing import Any
+from abc import abstractmethod, abstractmethod
+
+import pandas as pd
+import statsmodels.formula.api as smf
+from patsy import dmatrix  # pylint: disable = no-name-in-module
+from patsy import ModelDesc
+from statsmodels.regression.linear_model import RegressionResultsWrapper
+
+from causal_testing.specification.variable import Variable
+from causal_testing.estimation.gp import GP
+from causal_testing.estimation.estimator import Estimator
+
+logger = logging.getLogger(__name__)
+
+
+class RegressionEstimator(Estimator):
+    """A Linear Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
+    combination of parameters and functions of the variables (note these functions need not be linear).
+    """
+
+    def __init__(
+        # pylint: disable=too-many-arguments
+        self,
+        treatment: str,
+        treatment_value: float,
+        control_value: float,
+        adjustment_set: set,
+        outcome: str,
+        df: pd.DataFrame = None,
+        effect_modifiers: dict[Variable:Any] = None,
+        formula: str = None,
+        alpha: float = 0.05,
+        query: str = "",
+    ):
+        super().__init__(
+            treatment=treatment,
+            treatment_value=treatment_value,
+            control_value=control_value,
+            adjustment_set=adjustment_set,
+            outcome=outcome,
+            df=df,
+            effect_modifiers=effect_modifiers,
+            query=query,
+        )
+
+        self.model = None
+        if effect_modifiers is None:
+            effect_modifiers = []
+        if formula is not None:
+            self.formula = formula
+        else:
+            terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
+            self.formula = f"{outcome} ~ {'+'.join(terms)}"
+        for term in self.effect_modifiers:
+            self.adjustment_set.add(term)
+
+    @property
+    @abstractmethod
+    def regression(self):
+        raise NotImplementedError("Subclasses must implement the 'model' property.")
+
+    def add_modelling_assumptions(self):
+        """
+        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
+        must hold if the resulting causal inference is to be considered valid.
+        """
+        self.modelling_assumptions.append(
+            "The variables in the data must fit a shape which can be expressed as a linear"
+            "combination of parameters and functions of variables. Note that these functions"
+            "do not need to be linear."
+        )
+
+    def _run_regression(self, data=None) -> RegressionResultsWrapper:
+        """Run logistic regression of the treatment and adjustment set against the outcome and return the model.
+
+        :return: The model after fitting to data.
+        """
+        if data is None:
+            data = self.df
+        model = self.regression(formula=self.formula, data=data).fit(disp=0)
+        self.model = model
+        return model
diff --git a/examples/poisson-line-process/example_poisson_process.py b/examples/poisson-line-process/example_poisson_process.py
@@ -4,7 +4,8 @@
 from causal_testing.specification.causal_specification import CausalSpecification
 from causal_testing.testing.causal_test_case import CausalTestCase
 from causal_testing.testing.causal_test_outcome import ExactValue, Positive
-from causal_testing.estimation.linear_regression_estimator import LinearRegressionEstimator, Estimator
+from causal_testing.estimation.linear_regression_estimator import LinearRegressionEstimator
+from causal_testing.estimation.estimator import Estimator
 from causal_testing.testing.base_test_case import BaseTestCase
 
 import pandas as pd
diff --git a/tests/estimation_tests/test_linear_regression_estimator.py b/tests/estimation_tests/test_linear_regression_estimator.py
@@ -81,33 +81,48 @@ def test_program_11_2(self):
         """Test whether our linear regression implementation produces the same results as program 11.2 (p. 141)."""
         df = self.chapter_11_df
         linear_regression_estimator = LinearRegressionEstimator("treatments", None, None, set(), "outcomes", df)
-        model = linear_regression_estimator._run_linear_regression()
         ate, _ = linear_regression_estimator.estimate_coefficient()
 
-        self.assertEqual(round(model.params["Intercept"] + 90 * model.params["treatments"], 1), 216.9)
+        self.assertEqual(
+            round(
+                linear_regression_estimator.model.params["Intercept"]
+                + 90 * linear_regression_estimator.model.params["treatments"],
+                1,
+            ),
+            216.9,
+        )
 
         # Increasing treatments from 90 to 100 should be the same as 10 times the unit ATE
-        self.assertTrue(all(round(model.params["treatments"], 1) == round(ate_single, 1) for ate_single in ate))
+        self.assertTrue(
+            all(
+                round(linear_regression_estimator.model.params["treatments"], 1) == round(ate_single, 1)
+                for ate_single in ate
+            )
+        )
 
     def test_program_11_3(self):
         """Test whether our linear regression implementation produces the same results as program 11.3 (p. 144)."""
         df = self.chapter_11_df.copy()
         linear_regression_estimator = LinearRegressionEstimator(
             "treatments", None, None, set(), "outcomes", df, formula="outcomes ~ treatments + I(treatments ** 2)"
         )
-        model = linear_regression_estimator._run_linear_regression()
         ate, _ = linear_regression_estimator.estimate_coefficient()
         self.assertEqual(
             round(
-                model.params["Intercept"]
-                + 90 * model.params["treatments"]
-                + 90 * 90 * model.params["I(treatments ** 2)"],
+                linear_regression_estimator.model.params["Intercept"]
+                + 90 * linear_regression_estimator.model.params["treatments"]
+                + 90 * 90 * linear_regression_estimator.model.params["I(treatments ** 2)"],
                 1,
             ),
             197.1,
         )
         # Increasing treatments from 90 to 100 should be the same as 10 times the unit ATE
-        self.assertTrue(all(round(model.params["treatments"], 3) == round(ate_single, 3) for ate_single in ate))
+        self.assertTrue(
+            all(
+                round(linear_regression_estimator.model.params["treatments"], 3) == round(ate_single, 3)
+                for ate_single in ate
+            )
+        )
 
     def test_program_15_1A(self):
         """Test whether our linear regression implementation produces the same results as program 15.1 (p. 163, 184)."""
@@ -149,9 +164,9 @@ def test_program_15_1A(self):
         # for term_a, term_b in terms_to_product:
         #     linear_regression_estimator.add_product_term_to_df(term_a, term_b)
 
-        model = linear_regression_estimator._run_linear_regression()
-        self.assertEqual(round(model.params["qsmk"], 1), 2.6)
-        self.assertEqual(round(model.params["qsmk:smokeintensity"], 2), 0.05)
+        linear_regression_estimator.estimate_coefficient()
+        self.assertEqual(round(linear_regression_estimator.model.params["qsmk"], 1), 2.6)
+        self.assertEqual(round(linear_regression_estimator.model.params["qsmk:smokeintensity"], 2), 0.05)
 
     def test_program_15_no_interaction(self):
         """Test whether our linear regression implementation produces the same results as program 15.1 (p. 163, 184)
@@ -266,10 +281,10 @@ def test_program_11_2_with_robustness_validation(self):
         """Test whether our linear regression estimator, as used in test_program_11_2 can correctly estimate robustness."""
         df = self.chapter_11_df.copy()
         linear_regression_estimator = LinearRegressionEstimator("treatments", 100, 90, set(), "outcomes", df)
-        model = linear_regression_estimator._run_linear_regression()
+        linear_regression_estimator.estimate_coefficient()
 
         cv = CausalValidator()
-        self.assertEqual(round(cv.estimate_robustness(model)["treatments"], 4), 0.7353)
+        self.assertEqual(round(cv.estimate_robustness(linear_regression_estimator.model)["treatments"], 4), 0.7353)
 
     def test_gp(self):
         df = pd.DataFrame()
@@ -291,7 +306,7 @@ def test_gp_power(self):
         linear_regression_estimator.gp_formula(seed=1, max_order=0)
         self.assertEqual(
             linear_regression_estimator.formula,
-            "Y ~ I(2.0*X**2 + 3.8205100524608823e-31) - 1",
+            "Y ~ I(1.9999999999999999*X**2 - 1.0043240235058056e-116*X + 2.6645352591003757e-15) - 1",
         )
         ate, (ci_low, ci_high) = linear_regression_estimator.estimate_ate_calculated()
         self.assertEqual(round(ate[0], 2), -2.00)
diff --git a/tests/json_front_tests/test_json_class.py b/tests/json_front_tests/test_json_class.py
@@ -4,7 +4,8 @@
 import scipy
 import os
 
-from causal_testing.estimation.linear_regression_estimator import LinearRegressionEstimator, Estimator
+from causal_testing.estimation.linear_regression_estimator import LinearRegressionEstimator
+from causal_testing.estimation.estimator import Estimator
 from causal_testing.testing.causal_test_outcome import NoEffect, Positive
 from causal_testing.json_front.json_class import JsonUtility, CausalVariables
 from causal_testing.specification.variable import Input, Output, Meta
@@ -313,7 +314,7 @@ def add_modelling_assumptions(self):
         effects = {"Positive": Positive()}
         mutates = {
             "Increase": lambda x: self.json_class.scenario.treatment_variables[x].z3
-                                  > self.json_class.scenario.variables[x].z3
+            > self.json_class.scenario.variables[x].z3
         }
         estimators = {"ExampleEstimator": ExampleEstimator}
         with self.assertRaises(TypeError):

Original file line number	Diff line number	Diff line change
`@@ -47,7 +47,8 @@ def __init__(`
`47`	`47`	`)`
`48`	`48`
`49`	`49`	`self.model = None`
`50`		`-`
	`50`	`+ if effect_modifiers is None:`
	`51`	`+ effect_modifiers = []`
`51`	`52`	`if formula is not None:`
`52`	`53`	`self.formula = formula`
`53`	`54`	`else:`