Pass a data collector, not a dataframe, to execute_test

christopher-wild · christopher-wild · commit b76179530caf · 2023-07-25T13:53:08.000+01:00
diff --git a/causal_testing/data_collection/data_collector.py b/causal_testing/data_collection/data_collector.py
@@ -140,6 +140,7 @@ class ObservationalDataCollector(DataCollector):
     def __init__(self, scenario: Scenario, data: pd.DataFrame):
         super().__init__(scenario)
         self.data = data
+        self.data_checked = False
 
     def collect_data(self, **kwargs) -> pd.DataFrame:
         """Read a pandas dataframe and filter to remove
@@ -149,7 +150,6 @@ def collect_data(self, **kwargs) -> pd.DataFrame:
 
         :return: A pandas dataframe containing execution data that is valid for the scenario-under-test.
         """
-
         execution_data_df = self.data
         for meta in self.scenario.metas():
             if meta.name not in self.data:
@@ -158,4 +158,5 @@ def collect_data(self, **kwargs) -> pd.DataFrame:
         for var_name, var in self.scenario.variables.items():
             if issubclass(var.datatype, Enum):
                 scenario_execution_data_df[var_name] = [var.datatype(x) for x in scenario_execution_data_df[var_name]]
-        return scenario_execution_data_df
+        self.data_checked = True
+        self.data = scenario_execution_data_df
diff --git a/causal_testing/testing/causal_test_case.py b/causal_testing/testing/causal_test_case.py
@@ -9,8 +9,11 @@
 from causal_testing.testing.base_test_case import BaseTestCase
 from causal_testing.testing.estimators import Estimator
 from causal_testing.testing.causal_test_result import CausalTestResult
-from causal_testing.data_collection.data_collector import DataCollector
+from causal_testing.data_collection.data_collector import ObservationalDataCollector
+from causal_testing.specification.causal_dag import CausalDAG
+from causal_testing.specification.scenario import Scenario
 
+from causal_testing.specification.causal_specification import CausalSpecification
 logger = logging.getLogger(__name__)
 
 
@@ -28,15 +31,15 @@ class CausalTestCase:
     """
 
     def __init__(
-        # pylint: disable=too-many-arguments
-        self,
-        base_test_case: BaseTestCase,
-        expected_causal_effect: CausalTestOutcome,
-        control_value: Any = None,
-        treatment_value: Any = None,
-        estimate_type: str = "ate",
-        estimate_params: dict = None,
-        effect_modifier_configuration: dict[Variable:Any] = None,
+            # pylint: disable=too-many-arguments
+            self,
+            base_test_case: BaseTestCase,
+            expected_causal_effect: CausalTestOutcome,
+            control_value: Any = None,
+            treatment_value: Any = None,
+            estimate_type: str = "ate",
+            estimate_params: dict = None,
+            effect_modifier_configuration: dict[Variable:Any] = None,
     ):
         """
         :param base_test_case: A BaseTestCase object consisting of a treatment variable, outcome variable and effect
@@ -78,35 +81,81 @@ def get_treatment_value(self):
         """Return the treatment value of the treatment variable in this causal test case."""
         return self.treatment_value
 
-    def execute_test(self, estimator: type(Estimator), dataframe: pd.DataFrame) -> CausalTestResult:
+    def execute_test(self, estimator: type(Estimator), data_collector: ObservationalDataCollector, causal_specification: CausalSpecification) -> CausalTestResult:
         """Execute a causal test case and return the causal test result.
 
         :param estimator: A reference to an Estimator class.
         :param causal_test_case: The CausalTestCase object to be tested
         :return causal_test_result: A CausalTestResult for the executed causal test case.
         """
-        if self.scenario_execution_data_df.empty:
-            raise ValueError("No data has been loaded. Please call load_data prior to executing a causal test case.")
+        if not data_collector.data_checked:
+            data_collector.collect_data()
         if estimator.df is None:
-            estimator.df = dataframe
+            estimator.df = data_collector.data
         treatment_variable = self.treatment_variable
         treatments = treatment_variable.name
         outcome_variable = self.outcome_variable
 
         logger.info("treatments: %s", treatments)
         logger.info("outcomes: %s", outcome_variable)
-        minimal_adjustment_set = self.causal_dag.identification(BaseTestCase(treatment_variable, outcome_variable))
+        minimal_adjustment_set = causal_specification.causal_dag.identification(BaseTestCase(treatment_variable, outcome_variable))
         minimal_adjustment_set = minimal_adjustment_set - set(treatment_variable.name)
         minimal_adjustment_set = minimal_adjustment_set - set(outcome_variable.name)
 
         variables_for_positivity = list(minimal_adjustment_set) + [treatment_variable.name] + [outcome_variable.name]
 
-        if self._check_positivity_violation(variables_for_positivity):
+        if self._check_positivity_violation(variables_for_positivity, causal_specification.scenario, data_collector.data):
             raise ValueError("POSITIVITY VIOLATION -- Cannot proceed.")
 
         causal_test_result = self._return_causal_test_results(estimator)
         return causal_test_result
 
+    def _return_causal_test_results(self, estimator, causal_test_case):
+        """Depending on the estimator used, calculate the 95% confidence intervals and return in a causal_test_result
+
+        :param estimator: An Estimator class object
+        :param causal_test_case: The concrete test case to be executed
+        :return: a CausalTestResult object containing the confidence intervals
+        """
+        if not hasattr(estimator, f"estimate_{causal_test_case.estimate_type}"):
+            raise AttributeError(f"{estimator.__class__} has no {causal_test_case.estimate_type} method.")
+        estimate_effect = getattr(estimator, f"estimate_{causal_test_case.estimate_type}")
+        effect, confidence_intervals = estimate_effect(**causal_test_case.estimate_params)
+        causal_test_result = CausalTestResult(
+            estimator=estimator,
+            test_value=TestValue(causal_test_case.estimate_type, effect),
+            effect_modifier_configuration=causal_test_case.effect_modifier_configuration,
+            confidence_intervals=confidence_intervals,
+        )
+
+        return causal_test_result
+
+    def _check_positivity_violation(self, variables_list, scenario: Scenario, df):
+        """Check whether the dataframe has a positivity violation relative to the specified variables list.
+
+        A positivity violation occurs when there is a stratum of the dataframe which does not have any data. Put simply,
+        if we split the dataframe into covariate sub-groups, each sub-group must contain both a treated and untreated
+        individual. If a positivity violation occurs, causal inference is still possible using a properly specified
+        parametric estimator. Therefore, we should not throw an exception upon violation but raise a warning instead.
+
+        :param variables_list: The list of variables for which positivity must be satisfied.
+        :return: True if positivity is violated, False otherwise.
+        """
+        if not (set(variables_list) - {x.name for x in scenario.hidden_variables()}).issubset(
+                df.columns
+        ):
+            missing_variables = set(variables_list) - set(df.columns)
+            logger.warning(
+                "Positivity violation: missing data for variables %s.\n"
+                "Causal inference is only valid if a well-specified parametric model is used.\n"
+                "Alternatively, consider restricting analysis to executions without the variables:"
+                ".",
+                missing_variables,
+            )
+            return True
+
+        return False
+
     def __str__(self):
         treatment_config = {self.treatment_variable.name: self.treatment_value}
         control_config = {self.treatment_variable.name: self.control_value}