Fixed pytest errors in the logistic regression (logit) estimator. Closes #25

jmafoster1 · jmafoster1 · commit 508244bb8500 · 2023-03-13T16:35:50.000Z
diff --git a/causal_testing/testing/estimators.py b/causal_testing/testing/estimators.py
@@ -16,6 +16,7 @@
 from statsmodels.tools.sm_exceptions import PerfectSeparationError
 
 from causal_testing.specification.variable import Variable
+from math import ceil
 
 logger = logging.getLogger(__name__)
 
@@ -106,7 +107,7 @@ def __init__(
         outcome: str,
         df: pd.DataFrame = None,
         effect_modifiers: dict[Variable:Any] = None,
-        formula: str = None
+        formula: str = None,
     ):
         super().__init__(treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers)
 
@@ -151,7 +152,7 @@ def _run_logistic_regression(self, data) -> RegressionResultsWrapper:
         logger.debug(reduced_df[necessary_cols])
 
         # 2. Add intercept
-        reduced_df["Intercept"] = 1#self.intercept
+        reduced_df["Intercept"] = 1  # self.intercept
 
         # 3. Estimate the unit difference in outcome caused by unit difference in treatment
         cols = [self.treatment]
@@ -164,22 +165,23 @@ def _run_logistic_regression(self, data) -> RegressionResultsWrapper:
                     treatment_and_adjustments_cols, columns=[col], drop_first=True
                 )
         # regression = sm.Logit(outcome_col, treatment_and_adjustments_cols) # This one works
-        model = smf.logit(formula=self.formula, data=self.df).fit(disp=0)
+        model = smf.logit(formula=self.formula, data=data).fit(disp=0)
         return model
 
     def estimate(self, data: pd.DataFrame, adjustment_config=None) -> RegressionResultsWrapper:
         """add terms to the dataframe and estimate the outcome from the data
         :param data: A pandas dataframe containing execution data from the system-under-test.
 
         """
+        print(data)
         if adjustment_config is None:
             adjustment_config = {}
 
         model = self._run_logistic_regression(data)
         self.model = model
 
         x = pd.DataFrame(columns=self.df.columns)
-        x["Intercept"] = 1#self.intercept
+        x["Intercept"] = 1  # self.intercept
         x[self.treatment] = [self.treatment_value, self.control_value]
         for k, v in adjustment_config.items():
             x[k] = v
@@ -235,7 +237,7 @@ def estimate_ate(self, bootstrap_size=100) -> float:
         (control_outcome, control_bootstraps), (
             treatment_outcome,
             treatment_bootstraps,
-        ) = self.estimate_control_treatment()
+        ) = self.estimate_control_treatment(bootstrap_size=bootstrap_size)
         estimate = treatment_outcome - control_outcome
 
         if control_bootstraps is None or treatment_bootstraps is None:
@@ -265,14 +267,16 @@ def estimate_risk_ratio(self, bootstrap_size=100) -> float:
         (control_outcome, control_bootstraps), (
             treatment_outcome,
             treatment_bootstraps,
-        ) = self.estimate_control_treatment()
+        ) = self.estimate_control_treatment(bootstrap_size=bootstrap_size)
         estimate = treatment_outcome / control_outcome
 
         if control_bootstraps is None or treatment_bootstraps is None:
             return estimate, (None, None)
 
         bootstraps = sorted(list(treatment_bootstraps / control_bootstraps))
-        bound = int((bootstrap_size * 0.05) / 2)
+        bound = ceil((bootstrap_size * 0.05) / 2)
+        print("bootstraps", bootstraps)
+        print("bound", bound)
         ci_low = bootstraps[bound]
         ci_high = bootstraps[bootstrap_size - bound]
 
@@ -309,7 +313,7 @@ def __init__(
         outcome: str,
         df: pd.DataFrame = None,
         effect_modifiers: dict[Variable:Any] = None,
-        formula: str = None
+        formula: str = None,
     ):
         super().__init__(treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers)
 
@@ -392,7 +396,7 @@ def estimate_control_treatment(self, adjustment_config: dict = None) -> tuple[pd
 
         x = pd.DataFrame(columns=self.df.columns)
         x[self.treatment] = [self.treatment_value, self.control_value]
-        x["Intercept"] = 1#self.intercept
+        x["Intercept"] = 1  # self.intercept
         for k, v in adjustment_config.items():
             x[k] = v
         for k, v in self.effect_modifiers.items():
@@ -443,7 +447,7 @@ def estimate_cates(self) -> tuple[float, list[float, float]]:
         ), f"Must have at least one effect modifier to compute CATE - {self.effect_modifiers}."
         x = pd.DataFrame()
         x[self.treatment] = [self.treatment_value, self.control_value]
-        x["Intercept"] = 1#self.intercept
+        x["Intercept"] = 1  # self.intercept
         for k, v in self.effect_modifiers.items():
             self.adjustment_set.add(k)
             x[k] = v
@@ -475,7 +479,7 @@ def _run_linear_regression(self) -> RegressionResultsWrapper:
         logger.debug(reduced_df[necessary_cols])
 
         # 2. Add intercept
-        reduced_df["Intercept"] = 1#self.intercept
+        reduced_df["Intercept"] = 1  # self.intercept
 
         # 3. Estimate the unit difference in outcome caused by unit difference in treatment
         cols = [self.treatment]
diff --git a/tests/testing_tests/test_causal_test_engine.py b/tests/testing_tests/test_causal_test_engine.py
@@ -218,7 +218,7 @@ def test_execute_test_observational_linear_regression_estimator_squared_term(sel
             self.minimal_adjustment_set,
             "C",
             self.causal_test_engine.scenario_execution_data_df,
-            formula=f"C ~ A + {'+'.join(self.minimal_adjustment_set)} + (D ** 2)"
+            formula=f"C ~ A + {'+'.join(self.minimal_adjustment_set)} + (D ** 2)",
         )
         causal_test_result = self.causal_test_engine.execute_test(estimation_model, self.causal_test_case)
         self.assertAlmostEqual(round(causal_test_result.test_value.value, 1), 4, delta=1)
diff --git a/tests/testing_tests/test_causal_test_outcome.py b/tests/testing_tests/test_causal_test_outcome.py
@@ -27,13 +27,17 @@ def test_None_ci(self):
 
         self.assertIsNone(ctr.ci_low())
         self.assertIsNone(ctr.ci_high())
-        self.assertEqual(ctr.to_dict(),
-            {"treatment": "A",
-            "control_value": 0,
-            "treatment_value": 1,
-            "outcome": "A",
-            "adjustment_set": set(),
-            "test_value": test_value})
+        self.assertEqual(
+            ctr.to_dict(),
+            {
+                "treatment": "A",
+                "control_value": 0,
+                "treatment_value": 1,
+                "outcome": "A",
+                "adjustment_set": set(),
+                "test_value": test_value,
+            },
+        )
 
     def test_empty_adjustment_set(self):
         test_value = TestValue(type="ate", value=0)
@@ -46,13 +50,18 @@ def test_empty_adjustment_set(self):
 
         self.assertIsNone(ctr.ci_low())
         self.assertIsNone(ctr.ci_high())
-        self.assertEqual(str(ctr), ("Causal Test Result\n==============\n"
-            "Treatment: A\n"
-            "Control value: 0\n"
-            "Treatment value: 1\n"
-            "Outcome: A\n"
-            "Adjustment set: set()\n"
-            "ate: 0\n" ))
+        self.assertEqual(
+            str(ctr),
+            (
+                "Causal Test Result\n==============\n"
+                "Treatment: A\n"
+                "Control value: 0\n"
+                "Treatment value: 1\n"
+                "Outcome: A\n"
+                "Adjustment set: set()\n"
+                "ate: 0\n"
+            ),
+        )
 
     def test_exactValue_pass(self):
         test_value = TestValue(type="ate", value=5.05)
@@ -97,20 +106,29 @@ def test_someEffect_fail(self):
         )
         ev = SomeEffect()
         self.assertFalse(ev.apply(ctr))
-        self.assertEqual(str(ctr), ("Causal Test Result\n==============\n"
-            "Treatment: A\n"
-            "Control value: 0\n"
-            "Treatment value: 1\n"
-            "Outcome: A\n"
-            "Adjustment set: set()\n"
-            "ate: 0\n"
-            "Confidence intervals: [-0.1, 0.2]\n" ))
-        self.assertEqual(ctr.to_dict(),
-            {"treatment": "A",
-            "control_value": 0,
-            "treatment_value": 1,
-            "outcome": "A",
-            "adjustment_set": set(),
-            "test_value": test_value,
-            "ci_low": -0.1,
-            "ci_high": 0.2})
+        self.assertEqual(
+            str(ctr),
+            (
+                "Causal Test Result\n==============\n"
+                "Treatment: A\n"
+                "Control value: 0\n"
+                "Treatment value: 1\n"
+                "Outcome: A\n"
+                "Adjustment set: set()\n"
+                "ate: 0\n"
+                "Confidence intervals: [-0.1, 0.2]\n"
+            ),
+        )
+        self.assertEqual(
+            ctr.to_dict(),
+            {
+                "treatment": "A",
+                "control_value": 0,
+                "treatment_value": 1,
+                "outcome": "A",
+                "adjustment_set": set(),
+                "test_value": test_value,
+                "ci_low": -0.1,
+                "ci_high": 0.2,
+            },
+        )
diff --git a/tests/testing_tests/test_estimators.py b/tests/testing_tests/test_estimators.py
@@ -167,12 +167,17 @@ def test_program_11_2(self):
     def test_program_11_3(self):
         """Test whether our linear regression implementation produces the same results as program 11.3 (p. 144)."""
         df = self.chapter_11_df.copy()
-        linear_regression_estimator = LinearRegressionEstimator("treatments", 100, 90, set(), "outcomes", df, formula="outcomes ~ treatments + np.power(treatments, 2)")
+        linear_regression_estimator = LinearRegressionEstimator(
+            "treatments", 100, 90, set(), "outcomes", df, formula="outcomes ~ treatments + np.power(treatments, 2)"
+        )
         model = linear_regression_estimator._run_linear_regression()
         ate, _ = linear_regression_estimator.estimate_unit_ate()
         self.assertEqual(
             round(
-                model.params["Intercept"] + 90 * model.params["treatments"] + 90 * 90 * model.params["np.power(treatments, 2)"], 1
+                model.params["Intercept"]
+                + 90 * model.params["treatments"]
+                + 90 * 90 * model.params["np.power(treatments, 2)"],
+                1,
             ),
             197.1,
         )
@@ -198,14 +203,21 @@ def test_program_15_1A(self):
             "smokeintensity",
             "smokeyrs",
         }
-        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df,
-        formula=f"""wt82_71 ~ qsmk +
+        linear_regression_estimator = LinearRegressionEstimator(
+            "qsmk",
+            1,
+            0,
+            covariates,
+            "wt82_71",
+            df,
+            formula=f"""wt82_71 ~ qsmk +
                              {'+'.join(sorted(list(covariates)))} +
                              np.power(age, 2) +
                              np.power(wt71, 2) +
                              np.power(smokeintensity, 2) +
                              np.power(smokeyrs, 2) +
-                             (qsmk * smokeintensity)""")
+                             (qsmk * smokeintensity)""",
+        )
         # terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
         # terms_to_product = [("qsmk", "smokeintensity")]
         # for term_to_square in terms_to_square:
@@ -236,8 +248,15 @@ def test_program_15_no_interaction(self):
             "smokeintensity",
             "smokeyrs",
         }
-        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df,
-        formula="wt82_71 ~ qsmk + age + np.power(age, 2) + wt71 + np.power(wt71, 2) + smokeintensity + np.power(smokeintensity, 2) + smokeyrs + np.power(smokeyrs, 2)")
+        linear_regression_estimator = LinearRegressionEstimator(
+            "qsmk",
+            1,
+            0,
+            covariates,
+            "wt82_71",
+            df,
+            formula="wt82_71 ~ qsmk + age + np.power(age, 2) + wt71 + np.power(wt71, 2) + smokeintensity + np.power(smokeintensity, 2) + smokeyrs + np.power(smokeyrs, 2)",
+        )
         # terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
         # for term_to_square in terms_to_square:
         ate, [ci_low, ci_high] = linear_regression_estimator.estimate_unit_ate()
@@ -264,8 +283,15 @@ def test_program_15_no_interaction_ate(self):
             "smokeintensity",
             "smokeyrs",
         }
-        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df,
-        formula="wt82_71 ~ qsmk + age + np.power(age, 2) + wt71 + np.power(wt71, 2) + smokeintensity + np.power(smokeintensity, 2) + smokeyrs + np.power(smokeyrs, 2)")
+        linear_regression_estimator = LinearRegressionEstimator(
+            "qsmk",
+            1,
+            0,
+            covariates,
+            "wt82_71",
+            df,
+            formula="wt82_71 ~ qsmk + age + np.power(age, 2) + wt71 + np.power(wt71, 2) + smokeintensity + np.power(smokeintensity, 2) + smokeyrs + np.power(smokeyrs, 2)",
+        )
         # terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
         # for term_to_square in terms_to_square:
         ate, [ci_low, ci_high] = linear_regression_estimator.estimate_ate()
@@ -292,8 +318,15 @@ def test_program_15_no_interaction_ate_calculated(self):
             "smokeintensity",
             "smokeyrs",
         }
-        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df,
-        formula="wt82_71 ~ qsmk + age + np.power(age, 2) + wt71 + np.power(wt71, 2) + smokeintensity + np.power(smokeintensity, 2) + smokeyrs + np.power(smokeyrs, 2)")
+        linear_regression_estimator = LinearRegressionEstimator(
+            "qsmk",
+            1,
+            0,
+            covariates,
+            "wt82_71",
+            df,
+            formula="wt82_71 ~ qsmk + age + np.power(age, 2) + wt71 + np.power(wt71, 2) + smokeintensity + np.power(smokeintensity, 2) + smokeyrs + np.power(smokeyrs, 2)",
+        )
         # terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
         # for term_to_square in terms_to_square:
         ate, [ci_low, ci_high] = linear_regression_estimator.estimate_ate_calculated(

Original file line number	Diff line number	Diff line change
`@@ -218,7 +218,7 @@ def test_execute_test_observational_linear_regression_estimator_squared_term(sel`
`218`	`218`	`self.minimal_adjustment_set,`
`219`	`219`	`"C",`
`220`	`220`	`self.causal_test_engine.scenario_execution_data_df,`
`221`		`- formula=f"C ~ A + {'+'.join(self.minimal_adjustment_set)} + (D ** 2)"`
	`221`	`+ formula=f"C ~ A + {'+'.join(self.minimal_adjustment_set)} + (D ** 2)",`
`222`	`222`	`)`
`223`	`223`	`causal_test_result = self.causal_test_engine.execute_test(estimation_model, self.causal_test_case)`
`224`	`224`	`self.assertAlmostEqual(round(causal_test_result.test_value.value, 1), 4, delta=1)`