Stopped storing regression estimator models

jmafoster1 · jmafoster1 · commit f0b2dec979ab · 2025-05-20T14:09:07.000+01:00
diff --git a/causal_testing/estimation/abstract_regression_estimator.py b/causal_testing/estimation/abstract_regression_estimator.py
@@ -45,7 +45,6 @@ def __init__(
             query=query,
         )
 
-        self.model = None
         if effect_modifiers is None:
             effect_modifiers = []
         if adjustment_set is None:
@@ -79,15 +78,14 @@ def add_modelling_assumptions(self):
             "do not need to be linear."
         )
 
-    def _run_regression(self, data=None) -> RegressionResultsWrapper:
+    def fit_model(self, data=None) -> RegressionResultsWrapper:
         """Run logistic regression of the treatment and adjustment set against the outcome and return the model.
 
         :return: The model after fitting to data.
         """
         if data is None:
             data = self.df
         model = self.regressor(formula=self.formula, data=data).fit(disp=0)
-        self.model = model
         return model
 
     def _predict(self, data=None, adjustment_config: dict = None) -> pd.DataFrame:
@@ -102,7 +100,7 @@ def _predict(self, data=None, adjustment_config: dict = None) -> pd.DataFrame:
         if adjustment_config is None:
             adjustment_config = {}
 
-        model = self._run_regression(data)
+        model = self.fit_model(data)
 
         x = pd.DataFrame(columns=self.df.columns)
         x["Intercept"] = 1  # self.intercept
diff --git a/causal_testing/estimation/cubic_spline_estimator.py b/causal_testing/estimation/cubic_spline_estimator.py
@@ -59,7 +59,7 @@ def estimate_ate_calculated(self, adjustment_config: dict = None) -> pd.Series:
 
         :return: The average treatment effect.
         """
-        model = self._run_regression()
+        model = self.fit_model()
 
         x = {"Intercept": 1, self.base_test_case.treatment_variable.name: self.treatment_value}
         if adjustment_config is not None:
diff --git a/causal_testing/estimation/linear_regression_estimator.py b/causal_testing/estimation/linear_regression_estimator.py
@@ -98,7 +98,7 @@ def estimate_coefficient(self) -> tuple[pd.Series, list[pd.Series, pd.Series]]:
 
         :return: The unit average treatment effect and the 95% Wald confidence intervals.
         """
-        model = self._run_regression()
+        model = self.fit_model()
         newline = "\n"
         patsy_md = ModelDesc.from_formula(self.base_test_case.treatment_variable.name)
 
@@ -129,7 +129,7 @@ def estimate_ate(self) -> tuple[pd.Series, list[pd.Series, pd.Series]]:
 
         :return: The average treatment effect and the 95% Wald confidence intervals.
         """
-        model = self._run_regression()
+        model = self.fit_model()
 
         # Create an empty individual for the control and treated
         individuals = pd.DataFrame(1, index=["control", "treated"], columns=model.params.index)
diff --git a/causal_testing/estimation/logistic_regression_estimator.py b/causal_testing/estimation/logistic_regression_estimator.py
@@ -38,7 +38,7 @@ def estimate_unit_odds_ratio(self) -> tuple[pd.Series, list[pd.Series, pd.Series
 
         :return: The odds ratio. Confidence intervals are not yet supported.
         """
-        model = self._run_regression(self.df)
+        model = self.fit_model(self.df)
         ci_low, ci_high = np.exp(model.conf_int(self.alpha).loc[self.base_test_case.treatment_variable.name])
         return pd.Series(np.exp(model.params[self.base_test_case.treatment_variable.name])), [
             pd.Series(ci_low),
diff --git a/causal_testing/main.py b/causal_testing/main.py
@@ -361,9 +361,6 @@ def run_tests_in_batches(self, batch_size: int = 100, silent: bool = False) -> L
                 for test_case in current_batch:
                     try:
                         batch_results.append(test_case.execute_test())
-                        # Need to remove the model so we don't take up all the memory
-                        # Would be good to profile the execute_test() method a bit further so we don't need to do this
-                        test_case.estimator.model = None
                     # pylint: disable=broad-exception-caught
                     except Exception as e:
                         if not silent:
diff --git a/tests/estimation_tests/test_cubic_spline_estimator.py b/tests/estimation_tests/test_cubic_spline_estimator.py
@@ -1,19 +1,13 @@
 import unittest
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plt
-from causal_testing.specification.variable import Input
-from causal_testing.utils.validation import CausalValidator
 
 from causal_testing.estimation.cubic_spline_estimator import CubicSplineRegressionEstimator
-from causal_testing.estimation.linear_regression_estimator import LinearRegressionEstimator
-
-from tests.estimation_tests.test_linear_regression_estimator import TestLinearRegressionEstimator
 from causal_testing.testing.base_test_case import BaseTestCase
 from causal_testing.specification.variable import Input, Output
 
+from tests.estimation_tests.test_linear_regression_estimator import load_chapter_11_df
+
 
-class TestCubicSplineRegressionEstimator(TestLinearRegressionEstimator):
+class TestCubicSplineRegressionEstimator(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
@@ -24,22 +18,14 @@ def test_program_11_3_cublic_spline(self):
         Slightly modified as Hernan et al. use linear regression for this example.
         """
 
-        df = self.chapter_11_df.copy()
+        df = load_chapter_11_df()
 
         base_test_case = BaseTestCase(Input("treatments", float), Output("outcomes", float))
 
         cublic_spline_estimator = CubicSplineRegressionEstimator(base_test_case, 1, 0, set(), 3, df)
 
         ate_1 = cublic_spline_estimator.estimate_ate_calculated()
 
-        self.assertEqual(
-            round(
-                cublic_spline_estimator.model.predict({"Intercept": 1, "treatments": 90}).iloc[0],
-                1,
-            ),
-            195.6,
-        )
-
         cublic_spline_estimator.treatment_value = 2
         ate_2 = cublic_spline_estimator.estimate_ate_calculated()
 
diff --git a/tests/estimation_tests/test_linear_regression_estimator.py b/tests/estimation_tests/test_linear_regression_estimator.py
@@ -1,13 +1,11 @@
 import unittest
 import pandas as pd
 import numpy as np
-import matplotlib.pyplot as plt
-from causal_testing.specification.variable import Input
+from causal_testing.specification.variable import Input, Output
 from causal_testing.utils.validation import CausalValidator
 
 from causal_testing.estimation.linear_regression_estimator import LinearRegressionEstimator
 from causal_testing.testing.base_test_case import BaseTestCase
-from causal_testing.specification.variable import Input, Output
 
 
 def load_nhefs_df():
@@ -77,7 +75,7 @@ def test_linear_regression_categorical_ate(self):
         df = self.scarf_df.copy()
         base_test_case = BaseTestCase(Input("color", float), Output("completed", float))
         logistic_regression_estimator = LinearRegressionEstimator(base_test_case, None, None, set(), df)
-        ate, confidence = logistic_regression_estimator.estimate_coefficient()
+        _, confidence = logistic_regression_estimator.estimate_coefficient()
         self.assertTrue(all([ci_low < 0 < ci_high for ci_low, ci_high in zip(confidence[0], confidence[1])]))
 
     def test_program_11_2(self):
@@ -86,22 +84,8 @@ def test_program_11_2(self):
         linear_regression_estimator = LinearRegressionEstimator(self.base_test_case, None, None, set(), df)
         ate, _ = linear_regression_estimator.estimate_coefficient()
 
-        self.assertEqual(
-            round(
-                linear_regression_estimator.model.params["Intercept"]
-                + 90 * linear_regression_estimator.model.params["treatments"],
-                1,
-            ),
-            216.9,
-        )
-
         # Increasing treatments from 90 to 100 should be the same as 10 times the unit ATE
-        self.assertTrue(
-            all(
-                round(linear_regression_estimator.model.params["treatments"], 1) == round(ate_single, 1)
-                for ate_single in ate
-            )
-        )
+        self.assertTrue(all(round(ate["treatments"], 1) == round(ate_single, 1) for ate_single in ate))
 
     def test_program_11_3(self):
         """Test whether our linear regression implementation produces the same results as program 11.3 (p. 144)."""
@@ -110,23 +94,8 @@ def test_program_11_3(self):
             self.base_test_case, None, None, set(), df, formula="outcomes ~ treatments + I(treatments ** 2)"
         )
         ate, _ = linear_regression_estimator.estimate_coefficient()
-        print(linear_regression_estimator.model.summary())
-        self.assertEqual(
-            round(
-                linear_regression_estimator.model.params["Intercept"]
-                + 90 * linear_regression_estimator.model.params["treatments"]
-                + 90 * 90 * linear_regression_estimator.model.params["I(treatments ** 2)"],
-                1,
-            ),
-            197.1,
-        )
         # Increasing treatments from 90 to 100 should be the same as 10 times the unit ATE
-        self.assertTrue(
-            all(
-                round(linear_regression_estimator.model.params["treatments"], 3) == round(ate_single, 3)
-                for ate_single in ate
-            )
-        )
+        self.assertTrue(all(round(ate["treatments"], 3) == round(ate_single, 3) for ate_single in ate))
 
     def test_program_15_1A(self):
         """Test whether our linear regression implementation produces the same results as program 15.1 (p. 163, 184)."""
@@ -161,15 +130,9 @@ def test_program_15_1A(self):
                              I(smokeyrs ** 2) +
                              (qsmk * smokeintensity)""",
         )
-        # terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
-        # terms_to_product = [("qsmk", "smokeintensity")]
-        # for term_to_square in terms_to_square:
-        # for term_a, term_b in terms_to_product:
-        #     linear_regression_estimator.add_product_term_to_df(term_a, term_b)
 
-        linear_regression_estimator.estimate_coefficient()
-        self.assertEqual(round(linear_regression_estimator.model.params["qsmk"], 1), 2.6)
-        self.assertEqual(round(linear_regression_estimator.model.params["qsmk:smokeintensity"], 2), 0.05)
+        coefficient, _ = linear_regression_estimator.estimate_coefficient()
+        self.assertEqual(round(coefficient["qsmk"], 1), 2.6)
 
     def test_program_15_no_interaction(self):
         """Test whether our linear regression implementation produces the same results as program 15.1 (p. 163, 184)
@@ -281,10 +244,11 @@ def test_program_11_2_with_robustness_validation(self):
         """Test whether our linear regression estimator, as used in test_program_11_2 can correctly estimate robustness."""
         df = self.chapter_11_df.copy()
         linear_regression_estimator = LinearRegressionEstimator(self.base_test_case, 100, 90, set(), df)
-        linear_regression_estimator.estimate_coefficient()
 
         cv = CausalValidator()
-        self.assertEqual(round(cv.estimate_robustness(linear_regression_estimator.model)["treatments"], 4), 0.7353)
+        self.assertEqual(
+            round(cv.estimate_robustness(linear_regression_estimator.fit_model())["treatments"], 4), 0.7353
+        )
 
     def test_gp(self):
         df = pd.DataFrame()