Case study code now works

jmafoster1 · jmafoster1 · commit 7e443b706583 · 2023-04-14T11:45:51.000+01:00
diff --git a/causal_testing/data_collection/data_collector.py b/causal_testing/data_collection/data_collector.py
@@ -61,7 +61,7 @@ def filter_valid_data(self, data: pd.DataFrame, check_pos: bool = True) -> pd.Da
                 self.scenario.variables[var].z3
                 == self.scenario.variables[var].z3_val(self.scenario.variables[var].z3, row[var])
                 for var in self.scenario.variables
-                if var in row
+                if var in row and not pd.isnull(row[var])
             ]
             for c in model:
                 solver.assert_and_track(c, f"model: {c}")
@@ -147,7 +147,8 @@ def collect_data(self, **kwargs) -> pd.DataFrame:
 
         execution_data_df = self.data
         for meta in self.scenario.metas():
-            meta.populate(execution_data_df)
+            if meta.name not in self.data:
+                meta.populate(execution_data_df)
         scenario_execution_data_df = self.filter_valid_data(execution_data_df)
         for var_name, var in self.scenario.variables.items():
             if issubclass(var.datatype, Enum):
diff --git a/causal_testing/json_front/json_class.py b/causal_testing/json_front/json_class.py
@@ -170,11 +170,14 @@ def _populate_metas(self):
 
         for var in self.variables.metas + self.variables.outputs:
             if not var.distribution:
-                fitter = Fitter(self.data[var.name], distributions=get_common_distributions())
-                fitter.fit()
-                (dist, params) = list(fitter.get_best(method="sumsquare_error").items())[0]
-                var.distribution = getattr(scipy.stats, dist)(**params)
-                logger.info(var.name + f" {dist}({params})")
+                try:
+                    fitter = Fitter(self.data[var.name], distributions=get_common_distributions())
+                    fitter.fit()
+                    (dist, params) = list(fitter.get_best(method="sumsquare_error").items())[0]
+                    var.distribution = getattr(scipy.stats, dist)(**params)
+                    logger.info(var.name + f" {dist}({params})")
+                except:
+                    logger.warn(f"Could not fit distriubtion for {var.name}.")
 
     def _execute_test_case(
         self, causal_test_case: CausalTestCase, estimator: Estimator, f_flag: bool, conditions: list[str]
@@ -224,7 +227,7 @@ def _setup_test(
                 - estimation_model - Estimator instance for the test being run
         """
 
-        data_collector = ObservationalDataCollector(self.modelling_scenario, self.data.query(" & ".join(conditions)))
+        data_collector = ObservationalDataCollector(self.modelling_scenario, self.data.query(" & ".join(conditions)) if conditions else self.data)
         causal_test_engine = CausalTestEngine(self.causal_specification, data_collector, index_col=0)
 
         minimal_adjustment_set = self.causal_specification.causal_dag.identification(causal_test_case.base_test_case)
diff --git a/causal_testing/specification/metamorphic_relation.py b/causal_testing/specification/metamorphic_relation.py
@@ -102,6 +102,10 @@ def execute_tests(self, data_collector: ExperimentalDataCollector):
     def assertion(self, source_output, follow_up_output):
         """An assertion that should be applied to an individual metamorphic test run."""
 
+    @abstractmethod
+    def to_json_stub(self, skip=True) -> dict:
+        """Convert to a JSON frontend test stub (returned as a dict) for user customisation"""
+
     @abstractmethod
     def test_oracle(self, test_results):
         """A test oracle that assert whether the MR holds or not based on ALL test results.
@@ -129,6 +133,18 @@ def test_oracle(self, test_results):
             self.tests
         ), f"{str(self)}: {len(test_results['fail'])}/{len(self.tests)} tests failed."
 
+    def to_json_stub(self, skip=True) -> dict:
+        """Convert to a JSON frontend test stub (returned as a dict) for user customisation"""
+        return {
+                "name": str(self),
+                "estimator": "LinearRegressionEstimator",
+                "estimate_type": "coefficient",
+                "effect": "direct",
+                "mutations": [self.treatment_var],
+                "expectedEffect": {self.output_var: "SomeEffect"},
+                "skip": skip
+              }
+
     def __str__(self):
         formatted_str = f"{self.treatment_var} --> {self.output_var}"
         if self.adjustment_vars:
@@ -149,6 +165,19 @@ def test_oracle(self, test_results):
             len(test_results["fail"]) == 0
         ), f"{str(self)}: {len(test_results['fail'])}/{len(self.tests)} tests failed."
 
+
+    def to_json_stub(self, skip=True) -> dict:
+        """Convert to a JSON frontend test stub (returned as a dict) for user customisation"""
+        return {
+                "name": str(self),
+                "estimator": "LinearRegressionEstimator",
+                "estimate_type": "coefficient",
+                "effect": "direct",
+                "mutations": [self.treatment_var],
+                "expectedEffect": {self.output_var: "NoEffect"},
+                "skip": skip
+              }
+
     def __str__(self):
         formatted_str = f"{self.treatment_var} _||_ {self.output_var}"
         if self.adjustment_vars:
diff --git a/causal_testing/testing/causal_test_outcome.py b/causal_testing/testing/causal_test_outcome.py
@@ -37,7 +37,8 @@ class NoEffect(CausalTestOutcome):
     """An extension of TestOutcome representing that the expected causal effect should be zero."""
 
     def apply(self, res: CausalTestResult) -> bool:
-        if res.test_value.type == "ate":
+        print("RESULT", res)
+        if res.test_value.type == "ate" or res.test_value.type == "coefficient":
             return (res.ci_low() < 0 < res.ci_high()) or (abs(res.test_value.value) < 1e-10)
         if res.test_value.type == "risk_ratio":
             return (res.ci_low() < 1 < res.ci_high()) or np.isclose(res.test_value.value, 1.0, atol=1e-10)
diff --git a/causal_testing/testing/causal_test_result.py b/causal_testing/testing/causal_test_result.py
@@ -44,10 +44,10 @@ def __init__(
     def __str__(self):
         base_str = (
             f"Causal Test Result\n==============\n"
-            f"Treatment: {self.estimator.treatment[0]}\n"
+            f"Treatment: {self.estimator.treatment}\n"
             f"Control value: {self.estimator.control_value}\n"
             f"Treatment value: {self.estimator.treatment_value}\n"
-            f"Outcome: {self.estimator.outcome[0]}\n"
+            f"Outcome: {self.estimator.outcome}\n"
             f"Adjustment set: {self.adjustment_set}\n"
             f"{self.test_value.type}: {self.test_value.value}\n"
         )
diff --git a/causal_testing/testing/estimators.py b/causal_testing/testing/estimators.py
@@ -73,14 +73,6 @@ def add_modelling_assumptions(self):
         must hold if the resulting causal inference is to be considered valid.
         """
 
-    @abstractmethod
-    def estimate_ate(self) -> float:
-        """
-        Estimate the unit effect of the treatment on the outcome. That is, the coefficient of the treatment variable
-        in the linear regression equation.
-        :return: The intercept and coefficient of the linear regression equation
-        """
-
     def compute_confidence_intervals(self) -> list[float, float]:
         """
         Estimate the 95% Wald confidence intervals for the effect of changing the treatment from control values to
@@ -535,21 +527,26 @@ def add_modelling_assumptions(self):
             (iii) Instrument and outcome do not share causes
         """
 
-    def estimate_coefficient(self):
+    def estimate_coefficient(self, df):
         """
         Estimate the linear regression coefficient of the treatment on the outcome.
         """
         # Estimate the total effect of instrument I on outcome Y = abI + c1
-        ab = sm.OLS(self.df[self.outcome], self.df[[self.instrument]]).fit().params[self.instrument]
+        ab = sm.OLS(df[self.outcome], df[[self.instrument]]).fit().params[self.instrument]
 
         # Estimate the direct effect of instrument I on treatment X = aI + c1
-        a = sm.OLS(self.df[self.treatment], self.df[[self.instrument]]).fit().params[self.instrument]
+        a = sm.OLS(df[self.treatment], df[[self.instrument]]).fit().params[self.instrument]
 
         # Estimate the coefficient of I on X by cancelling
         return ab / a
 
-    def estimate_ate(self):
-        return (self.treatment_value - self.control_value) * self.estimate_coefficient(), (None, None)
+    def estimate_unit_ate(self, bootstrap_size=100):
+        bootstraps = sorted([self.estimate_coefficient(self.df.sample(len(self.df), replace=True)) for _ in range(bootstrap_size)])
+        bound = ceil((bootstrap_size * 0.05) / 2)
+        ci_low = bootstraps[bound]
+        ci_high = bootstraps[bootstrap_size - bound]
+
+        return self.estimate_coefficient(self.df), (ci_low, ci_high)
 
 
 class CausalForestEstimator(Estimator):