Pushing so others can see - VERY BROKEN

jmafoster1 · jmafoster1 · commit 7611771a4b36 · 2023-05-18T14:17:14.000+01:00
diff --git a/causal_testing/json_front/json_class.py b/causal_testing/json_front/json_class.py
@@ -56,7 +56,7 @@ def __init__(self, output_path: str, output_overwrite: bool = False):
         self.output_path = Path(output_path)
         self.check_file_exists(self.output_path, output_overwrite)
 
-    def set_paths(self, json_path: str, dag_path: str, data_paths: str):
+    def set_paths(self, json_path: str, dag_path: str, data_paths: str=[]):
         """
         Takes a path of the directory containing all scenario specific files and creates individual paths for each file
         :param json_path: string path representation to .json file containing test specifications
@@ -73,7 +73,12 @@ def setup(self, scenario: Scenario):
         self.causal_specification = CausalSpecification(
             scenario=self.scenario, causal_dag=CausalDAG(self.input_paths.dag_path)
         )
-        self._json_parse()
+        # Parse the JSON test plan
+        with open(self.input_paths.json_path, encoding="utf-8") as f:
+            self.test_plan = json.load(f)
+        # Populate the data
+        if self.input_paths.data_paths:
+            self.data = pd.concat([pd.read_csv(data_file, header=0) for data_file in self.input_paths.data_paths])
         self._populate_metas()
 
     def _create_abstract_test_case(self, test, mutates, effects):
@@ -144,6 +149,7 @@ def run_json_tests(self, effects: dict, estimators: dict, f_flag: bool = False,
                         + "==============\n"
                         + f"  Result: {'FAILED' if result[0] else 'Passed'}"
                     )
+                    print(msg)
                 else:
                     abstract_test = self._create_abstract_test_case(test, mutates, effects)
                     concrete_tests, _ = abstract_test.generate_concrete_tests(5, 0.05)
@@ -198,15 +204,6 @@ def _execute_tests(self, concrete_tests, test, f_flag):
                 failures += 1
         return failures, details
 
-    def _json_parse(self):
-        """Parse a JSON input file into inputs, outputs, metas and a test plan"""
-        with open(self.input_paths.json_path, encoding="utf-8") as f:
-            self.test_plan = json.load(f)
-        for data_file in self.input_paths.data_paths:
-            df = pd.read_csv(data_file, header=0)
-            self.data.append(df)
-        self.data = pd.concat(self.data)
-
     def _populate_metas(self):
         """
         Populate data with meta-variable values and add distributions to Causal Testing Framework Variables
@@ -236,7 +233,7 @@ def _execute_test_case(
 
         test_passes = causal_test_case.expected_causal_effect.apply(causal_test_result)
 
-        if causal_test_result.ci_low() and causal_test_result.ci_high():
+        if causal_test_result.ci_low() is not None and causal_test_result.ci_high() is not None:
             result_string = (
                 f"{causal_test_result.ci_low()} < {causal_test_result.test_value.value} <  "
                 f"{causal_test_result.ci_high()}"
@@ -351,7 +348,6 @@ def get_args(test_args=None) -> argparse.Namespace:
         parser.add_argument(
             "--data_path",
             help="Specify path to file containing runtime data",
-            required=True,
             nargs="+",
         )
         parser.add_argument(
diff --git a/causal_testing/testing/causal_test_outcome.py b/causal_testing/testing/causal_test_outcome.py
@@ -27,7 +27,8 @@ class SomeEffect(CausalTestOutcome):
 
     def apply(self, res: CausalTestResult) -> bool:
         if res.test_value.type in {"ate", "coefficient"}:
-            return (0 < res.ci_low() < res.ci_high()) or (res.ci_low() < res.ci_high() < 0)
+            return any([0 < ci_low < ci_high or ci_low < ci_high < 0 for ci_low, ci_high in zip(res.ci_low(), res.ci_high())])
+            # return (0 < res.ci_low() < res.ci_high()) or (res.ci_low() < res.ci_high() < 0)
         if res.test_value.type == "risk_ratio":
             return (1 < res.ci_low() < res.ci_high()) or (res.ci_low() < res.ci_high() < 1)
         raise ValueError(f"Test Value type {res.test_value.type} is not valid for this TestOutcome")
@@ -36,10 +37,11 @@ def apply(self, res: CausalTestResult) -> bool:
 class NoEffect(CausalTestOutcome):
     """An extension of TestOutcome representing that the expected causal effect should be zero."""
 
-    def apply(self, res: CausalTestResult) -> bool:
+    def apply(self, res: CausalTestResult, threshold: float = 1e-10) -> bool:
         print("RESULT", res)
         if res.test_value.type in {"ate", "coefficient"}:
-            return (res.ci_low() < 0 < res.ci_high()) or (abs(res.test_value.value) < 1e-10)
+            return all([ci_low < 0< ci_high for ci_low, ci_high in zip(res.ci_low(), res.ci_high())]) or all([abs(v) < 1e-10 for v in res.test_value.value])
+            # return (res.ci_low() < 0 < res.ci_high()) or (abs(res.test_value.value) < 1e-10)
         if res.test_value.type == "risk_ratio":
             return (res.ci_low() < 1 < res.ci_high()) or np.isclose(res.test_value.value, 1.0, atol=1e-10)
         raise ValueError(f"Test Value type {res.test_value.type} is not valid for this TestOutcome")
diff --git a/causal_testing/testing/causal_test_result.py b/causal_testing/testing/causal_test_result.py
@@ -43,18 +43,20 @@ def __init__(
             self.effect_modifier_configuration = {}
 
     def __str__(self):
+        def push(s, inc="  "):
+            return inc + str(s).replace("\n", "\n"+inc)
         base_str = (
             f"Causal Test Result\n==============\n"
             f"Treatment: {self.estimator.treatment}\n"
             f"Control value: {self.estimator.control_value}\n"
             f"Treatment value: {self.estimator.treatment_value}\n"
             f"Outcome: {self.estimator.outcome}\n"
             f"Adjustment set: {self.adjustment_set}\n"
-            f"{self.test_value.type}: {self.test_value.value}\n"
+            f"{self.test_value.type}:\n{push(self.test_value.value)}\n"
         )
         confidence_str = ""
         if self.confidence_intervals:
-            confidence_str += f"Confidence intervals: {self.confidence_intervals}\n"
+            confidence_str += f"Confidence intervals:\n{push(pd.DataFrame(self.confidence_intervals).transpose().to_string(header=False))}\n"
         return base_str + confidence_str
 
     def to_dict(self):
@@ -76,14 +78,14 @@ def to_dict(self):
 
     def ci_low(self):
         """Return the lower bracket of the confidence intervals."""
-        if self.confidence_intervals and all(self.confidence_intervals):
-            return min(self.confidence_intervals)
+        if self.confidence_intervals:
+            return self.confidence_intervals[0]
         return None
 
     def ci_high(self):
         """Return the higher bracket of the confidence intervals."""
-        if self.confidence_intervals and all(self.confidence_intervals):
-            return max(self.confidence_intervals)
+        if self.confidence_intervals:
+            return self.confidence_intervals[1]
         return None
 
     def ci_valid(self) -> bool:
diff --git a/causal_testing/testing/estimators.py b/causal_testing/testing/estimators.py
@@ -335,10 +335,15 @@ def estimate_unit_ate(self) -> float:
         :return: The unit average treatment effect and the 95% Wald confidence intervals.
         """
         model = self._run_linear_regression()
-        assert self.treatment in model.params, f"{self.treatment} not in {model.params}"
-        unit_effect = model.params[[self.treatment]].values[0]  # Unit effect is the coefficient of the treatment
-        [ci_low, ci_high] = self._get_confidence_intervals(model)
-
+        newline = "\n"
+        print(model.conf_int())
+        treatment = [self.treatment]
+        if str(self.df.dtypes[self.treatment]) == "object":
+                reference = min(self.df[self.treatment])
+                treatment = [x.replace("[", "[T.") for x in dmatrix(f"{self.treatment}-1", self.df.query(f"{self.treatment} != '{reference}'"), return_type="dataframe").columns]
+        assert set(treatment).issubset(model.params.index.tolist()), f"{treatment} not in\n{'  '+str(model.params.index).replace(newline, newline+'  ')}"
+        unit_effect = model.params[treatment]  # Unit effect is the coefficient of the treatment
+        [ci_low, ci_high] = self._get_confidence_intervals(model, treatment)
         return unit_effect, [ci_low, ci_high]
 
     def estimate_ate(self) -> tuple[float, list[float, float], float]:
@@ -353,19 +358,14 @@ def estimate_ate(self) -> tuple[float, list[float, float], float]:
         # Create an empty individual for the control and treated
         individuals = pd.DataFrame(1, index=["control", "treated"], columns=model.params.index)
 
-        # This is a temporary hack
-        # for t in self.square_terms:
-        #     individuals[t + "^2"] = individuals[t] ** 2
-        # for a, b in self.product_terms:
-        #     individuals[f"{a}*{b}"] = individuals[a] * individuals[b]
-
         # It is ABSOLUTELY CRITICAL that these go last, otherwise we can't index
         # the effect with "ate = t_test_results.effect[0]"
         individuals.loc["control", [self.treatment]] = self.control_value
         individuals.loc["treated", [self.treatment]] = self.treatment_value
 
         # Perform a t-test to compare the predicted outcome of the control and treated individual (ATE)
         t_test_results = model.t_test(individuals.loc["treated"] - individuals.loc["control"])
+        print("t_test_results", t_test_results.effect)
         ate = t_test_results.effect[0]
         confidence_intervals = list(t_test_results.conf_int().flatten())
         return ate, confidence_intervals
@@ -473,21 +473,16 @@ def _run_linear_regression(self) -> RegressionResultsWrapper:
         cols = [self.treatment]
         cols += [x for x in self.adjustment_set if x not in cols]
         treatment_and_adjustments_cols = reduced_df[cols + ["Intercept"]]
-        for col in treatment_and_adjustments_cols:
-            if str(treatment_and_adjustments_cols.dtypes[col]) == "object":
-                treatment_and_adjustments_cols = pd.get_dummies(
-                    treatment_and_adjustments_cols, columns=[col], drop_first=True
-                )
         model = smf.ols(formula=self.formula, data=self.df).fit()
         return model
 
-    def _get_confidence_intervals(self, model):
+    def _get_confidence_intervals(self, model, treatment):
         confidence_intervals = model.conf_int(alpha=0.05, cols=None)
         ci_low, ci_high = (
-            confidence_intervals[0][[self.treatment]],
-            confidence_intervals[1][[self.treatment]],
+            confidence_intervals[0].loc[treatment],
+            confidence_intervals[1].loc[treatment],
         )
-        return [ci_low.values[0], ci_high.values[0]]
+        return [ci_low, ci_high]
 
 
 class InstrumentalVariableEstimator(Estimator):