
Commit a809a00

Nearly passing
1 parent a316203 commit a809a00

2 files changed: +87 additions, -78 deletions

causal_testing/testing/estimators.py

Lines changed: 54 additions & 55 deletions
@@ -9,6 +9,7 @@
 import statsmodels.api as sm
 import statsmodels.formula.api as smf
 from econml.dml import CausalForestDML
+from patsy import dmatrix

 from sklearn.ensemble import GradientBoostingRegressor
 from statsmodels.regression.linear_model import RegressionResultsWrapper
@@ -314,7 +315,7 @@ def __init__(
             self.formula = formula
         else:
             terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
-            self.formula = f"{outcome} ~ {'+'.join(((terms)))} + Intercept"
+            self.formula = f"{outcome} ~ {'+'.join(((terms)))}"

         for term in self.effect_modifiers:
             self.adjustment_set.add(term)
@@ -330,53 +331,53 @@ def add_modelling_assumptions(self):
             "do not need to be linear."
         )

-    def add_squared_term_to_df(self, term_to_square: str):
-        """Add a squared term to the linear regression model and df.
-
-        This enables the user to capture curvilinear relationships with a linear regression model, not just straight
-        lines, while automatically adding the modelling assumption imposed by the addition of this term.
-
-        :param term_to_square: The term (column in data and variable in DAG) which is to be squared.
-        """
-        new_term = str(term_to_square) + "^2"
-        self.df[new_term] = self.df[term_to_square] ** 2
-        self.adjustment_set.add(new_term)
-        self.modelling_assumptions += (
-            f"Relationship between {self.treatment} and {self.outcome} varies quadratically" f"with {term_to_square}."
-        )
-        self.square_terms.append(term_to_square)
-
-    def add_inverse_term_to_df(self, term_to_invert: str):
-        """Add an inverse term to the linear regression model and df.
-
-        This enables the user to capture curvilinear relationships with a linear regression model, not just straight
-        lines, while automatically adding the modelling assumption imposed by the addition of this term.
-
-        :param term_to_square: The term (column in data and variable in DAG) which is to be squared.
-        """
-        new_term = "1/" + str(term_to_invert)
-        self.df[new_term] = 1 / self.df[term_to_invert]
-        self.adjustment_set.add(new_term)
-        self.modelling_assumptions += (
-            f"Relationship between {self.treatment} and {self.outcome} varies inversely" f"with {term_to_invert}."
-        )
-        self.inverse_terms.append(term_to_invert)
-
-    def add_product_term_to_df(self, term_a: str, term_b: str):
-        """Add a product term to the linear regression model and df.
-
-        This enables the user to capture interaction between a pair of variables in the model. In other words, while
-        each covariate's contribution to the mean is assumed to be independent of the other covariates, the pair of
-        product terms term_a*term_b a are restricted to vary linearly with each other.
-
-        :param term_a: The first term of the product term.
-        :param term_b: The second term of the product term.
-        """
-        new_term = str(term_a) + "*" + str(term_b)
-        self.df[new_term] = self.df[term_a] * self.df[term_b]
-        self.adjustment_set.add(new_term)
-        self.modelling_assumptions += f"{term_a} and {term_b} vary linearly with each other."
-        self.product_terms.append((term_a, term_b))
+    # def add_squared_term_to_df(self, term_to_square: str):
+    #     """Add a squared term to the linear regression model and df.
+    #
+    #     This enables the user to capture curvilinear relationships with a linear regression model, not just straight
+    #     lines, while automatically adding the modelling assumption imposed by the addition of this term.
+    #
+    #     :param term_to_square: The term (column in data and variable in DAG) which is to be squared.
+    #     """
+    #     new_term = str(term_to_square) + "^2"
+    #     self.df[new_term] = self.df[term_to_square] ** 2
+    #     self.adjustment_set.add(new_term)
+    #     self.modelling_assumptions += (
+    #         f"Relationship between {self.treatment} and {self.outcome} varies quadratically" f"with {term_to_square}."
+    #     )
+    #     self.square_terms.append(term_to_square)
+
+    # def add_inverse_term_to_df(self, term_to_invert: str):
+    #     """Add an inverse term to the linear regression model and df.
+    #
+    #     This enables the user to capture curvilinear relationships with a linear regression model, not just straight
+    #     lines, while automatically adding the modelling assumption imposed by the addition of this term.
+    #
+    #     :param term_to_square: The term (column in data and variable in DAG) which is to be squared.
+    #     """
+    #     new_term = "1/" + str(term_to_invert)
+    #     self.df[new_term] = 1 / self.df[term_to_invert]
+    #     self.adjustment_set.add(new_term)
+    #     self.modelling_assumptions += (
+    #         f"Relationship between {self.treatment} and {self.outcome} varies inversely" f"with {term_to_invert}."
+    #     )
+    #     self.inverse_terms.append(term_to_invert)
+
+    # def add_product_term_to_df(self, term_a: str, term_b: str):
+    #     """Add a product term to the linear regression model and df.
+    #
+    #     This enables the user to capture interaction between a pair of variables in the model. In other words, while
+    #     each covariate's contribution to the mean is assumed to be independent of the other covariates, the pair of
+    #     product terms term_a*term_b a are restricted to vary linearly with each other.
+    #
+    #     :param term_a: The first term of the product term.
+    #     :param term_b: The second term of the product term.
+    #     """
+    #     new_term = str(term_a) + "*" + str(term_b)
+    #     self.df[new_term] = self.df[term_a] * self.df[term_b]
+    #     self.adjustment_set.add(new_term)
+    #     self.modelling_assumptions += f"{term_a} and {term_b} vary linearly with each other."
+    #     self.product_terms.append((term_a, term_b))

     def estimate_unit_ate(self) -> float:
         """Estimate the unit average treatment effect of the treatment on the outcome. That is, the change in outcome
@@ -430,19 +431,16 @@ def estimate_control_treatment(self, adjustment_config: dict = None) -> tuple[pd
         model = self._run_linear_regression()
         self.model = model

-        x = pd.DataFrame()
+
+
+        x = pd.DataFrame(columns=self.df.columns)
         x[self.treatment] = [self.treatment_value, self.control_value]
         x["Intercept"] = 1#self.intercept
         for k, v in adjustment_config.items():
             x[k] = v
         for k, v in self.effect_modifiers.items():
             x[k] = v
-        for t in self.square_terms:
-            x[t + "^2"] = x[t] ** 2
-        for t in self.inverse_terms:
-            x["1/" + t] = 1 / x[t]
-        for a, b in self.product_terms:
-            x[f"{a}*{b}"] = x[a] * x[b]
+        x = dmatrix(self.formula.split("~")[1], x, return_type="dataframe")
         for col in x:
             if str(x.dtypes[col]) == "object":
                 x = pd.get_dummies(x, columns=[col], drop_first=True)
@@ -534,6 +532,7 @@ def _run_linear_regression(self) -> RegressionResultsWrapper:
         )
         # model = sm.OLS(outcome_col, treatment_and_adjustments_cols).fit()
         model = smf.ols(formula=self.formula, data=self.df).fit()
+        print(model.summary())
         return model

     def _get_confidence_intervals(self, model):
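
The estimators.py change above drops the add_*_term_to_df helpers in favour of formula terms: squared, inverse, and product terms now live in the patsy formula, and dmatrix() expands the formula's right-hand side into a design matrix at prediction time instead of materialising extra dataframe columns. The following standalone sketch illustrates that pattern; the formula string and column values are illustrative examples, not taken from the commit.

import numpy as np  # np must be in scope for np.power(...) terms in the formula
import pandas as pd
from patsy import dmatrix

# Two prediction rows (treatment vs. control), mirroring the frame built in
# estimate_control_treatment.
formula = "outcomes ~ treatments + np.power(treatments, 2)"
x = pd.DataFrame({"treatments": [100, 90]})

# Everything after "~" is the right-hand side; dmatrix() adds the Intercept
# column and evaluates np.power(treatments, 2) without mutating the dataframe.
design = dmatrix(formula.split("~")[1], x, return_type="dataframe")
print(list(design.columns))  # ['Intercept', 'treatments', 'np.power(treatments, 2)']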

tests/testing_tests/test_estimators.py

Lines changed: 33 additions & 23 deletions
@@ -158,6 +158,7 @@ def test_program_11_2(self):
         model = linear_regression_estimator._run_linear_regression()
         ate, _ = linear_regression_estimator.estimate_unit_ate()

+        print(model.summary())
         self.assertEqual(round(model.params["Intercept"] + 90 * model.params["treatments"], 1), 216.9)

         # Increasing treatments from 90 to 100 should be the same as 10 times the unit ATE
@@ -166,13 +167,13 @@ def test_program_11_2(self):
     def test_program_11_3(self):
         """Test whether our linear regression implementation produces the same results as program 11.3 (p. 144)."""
         df = self.chapter_11_df.copy()
-        linear_regression_estimator = LinearRegressionEstimator("treatments", 100, 90, set(), "outcomes", df)
-        linear_regression_estimator.add_squared_term_to_df("treatments")
+        linear_regression_estimator = LinearRegressionEstimator("treatments", 100, 90, set(), "outcomes", df, formula="outcomes ~ treatments + np.power(treatments, 2)")
+        # linear_regression_estimator.add_squared_term_to_df("treatments")
         model = linear_regression_estimator._run_linear_regression()
         ate, _ = linear_regression_estimator.estimate_unit_ate()
         self.assertEqual(
             round(
-                model.params["Intercept"] + 90 * model.params["treatments"] + 90 * 90 * model.params["treatments^2"], 1
+                model.params["Intercept"] + 90 * model.params["treatments"] + 90 * 90 * model.params["np.power(treatments, 2)"], 1
             ),
             197.1,
         )
@@ -198,13 +199,19 @@ def test_program_15_1A(self):
             "smokeintensity",
             "smokeyrs",
         }
-        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df)
-        terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
-        terms_to_product = [("qsmk", "smokeintensity")]
-        for term_to_square in terms_to_square:
-            linear_regression_estimator.add_squared_term_to_df(term_to_square)
-        for term_a, term_b in terms_to_product:
-            linear_regression_estimator.add_product_term_to_df(term_a, term_b)
+        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df,
+                                                                formula="""wt82_71 ~ qsmk +
+                                                                           age + np.power(age, 2) +
+                                                                           wt71 + np.power(wt71, 2) +
+                                                                           smokeintensity + np.power(smokeintensity, 2) +
+                                                                           smokeyrs + np.power(smokeyrs, 2) +
+                                                                           (qsmk * smokeintensity)""")
+        # terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
+        # terms_to_product = [("qsmk", "smokeintensity")]
+        # for term_to_square in terms_to_square:
+        #     linear_regression_estimator.add_squared_term_to_df(term_to_square)
+        # for term_a, term_b in terms_to_product:
+        #     linear_regression_estimator.add_product_term_to_df(term_a, term_b)

         model = linear_regression_estimator._run_linear_regression()
         self.assertEqual(round(model.params["qsmk"], 1), 2.6)
@@ -230,10 +237,11 @@ def test_program_15_no_interaction(self):
             "smokeintensity",
             "smokeyrs",
         }
-        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df)
-        terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
-        for term_to_square in terms_to_square:
-            linear_regression_estimator.add_squared_term_to_df(term_to_square)
+        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df,
+                                                                formula="wt82_71 ~ qsmk + age + np.power(age, 2) + wt71 + np.power(wt71, 2) + smokeintensity + np.power(smokeintensity, 2) + smokeyrs + np.power(smokeyrs, 2)")
+        # terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
+        # for term_to_square in terms_to_square:
+        #     linear_regression_estimator.add_squared_term_to_df(term_to_square)
         ate, [ci_low, ci_high] = linear_regression_estimator.estimate_unit_ate()
         self.assertEqual(round(ate, 1), 3.5)
         self.assertEqual([round(ci_low, 1), round(ci_high, 1)], [2.6, 4.3])
@@ -258,10 +266,11 @@ def test_program_15_no_interaction_ate(self):
             "smokeintensity",
             "smokeyrs",
         }
-        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df)
-        terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
-        for term_to_square in terms_to_square:
-            linear_regression_estimator.add_squared_term_to_df(term_to_square)
+        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df,
+                                                                formula="wt82_71 ~ qsmk + age + np.power(age, 2) + wt71 + np.power(wt71, 2) + smokeintensity + np.power(smokeintensity, 2) + smokeyrs + np.power(smokeyrs, 2)")
+        # terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
+        # for term_to_square in terms_to_square:
+        #     linear_regression_estimator.add_squared_term_to_df(term_to_square)
         ate, [ci_low, ci_high] = linear_regression_estimator.estimate_ate()
         self.assertEqual(round(ate, 1), 3.5)
         self.assertEqual([round(ci_low, 1), round(ci_high, 1)], [2.6, 4.3])
@@ -286,10 +295,11 @@ def test_program_15_no_interaction_ate_calculated(self):
             "smokeintensity",
             "smokeyrs",
         }
-        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df)
-        terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
-        for term_to_square in terms_to_square:
-            linear_regression_estimator.add_squared_term_to_df(term_to_square)
+        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df,
+                                                                formula="wt82_71 ~ qsmk + age + np.power(age, 2) + wt71 + np.power(wt71, 2) + smokeintensity + np.power(smokeintensity, 2) + smokeyrs + np.power(smokeyrs, 2)")
+        # terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
+        # for term_to_square in terms_to_square:
+        #     linear_regression_estimator.add_squared_term_to_df(term_to_square)
         ate, [ci_low, ci_high] = linear_regression_estimator.estimate_ate_calculated(
             {k: self.nhefs_df.mean()[k] for k in covariates}
         )
@@ -377,7 +387,7 @@ def test_X1_effect(self):
         """When we fix the value of X2 to 0, the effect of X1 on Y should become ~2 (because X2 terms are cancelled)."""
         x2 = Input("X2", float)
         lr_model = LinearRegressionEstimator(
-            ("X1",), 1, 0, {"X2"}, ("Y",), effect_modifiers={x2: 0}, formula="Y ~ X1 + X2 + (X1 * X2)", df=self.df
+            "X1", 1, 0, {"X2"}, "Y", effect_modifiers={x2: 0}, formula="Y ~ X1 + X2 + (X1 * X2)", df=self.df
         )
         test_results = lr_model.estimate_ate()
         ate = test_results[0]
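
The updated tests depend on statsmodels naming fitted coefficients after the raw formula terms, which is why the assertions now index model.params with "np.power(treatments, 2)" rather than the old "treatments^2" column. A short self-contained sketch on synthetic data (not the chapter 11 or NHEFS datasets the tests use) shows where those parameter names come from.

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Synthetic quadratic relationship, for illustration only.
rng = np.random.default_rng(0)
treatments = rng.uniform(0, 100, 200)
outcomes = 3.0 * treatments - 0.02 * treatments**2 + rng.normal(0, 1, 200)
df = pd.DataFrame({"treatments": treatments, "outcomes": outcomes})

model = smf.ols("outcomes ~ treatments + np.power(treatments, 2)", data=df).fit()
print(list(model.params.index))  # ['Intercept', 'treatments', 'np.power(treatments, 2)']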
