Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
5495caa
Removed datacollector from surrogate assisted
jmafoster1 Feb 17, 2025
b8ace8e
Removed data collector from causal test suite
jmafoster1 Feb 17, 2025
2ceb654
Removed datacollector from testing
jmafoster1 Feb 17, 2025
bb70338
Removed data collection tests
jmafoster1 Feb 17, 2025
736cf6d
Removed data collector from metamorphic relation
jmafoster1 Feb 17, 2025
b3b5261
removed data collector classes
jmafoster1 Feb 17, 2025
73eb5f1
Removed unnecessary methods and arguments from metamorphic relation
jmafoster1 Feb 17, 2025
127a2f4
Added experimental estimator to keep functionality of experimental da…
jmafoster1 Feb 17, 2025
4b7de4c
pylint
jmafoster1 Feb 17, 2025
a80ccef
pylint
jmafoster1 Feb 17, 2025
9cfff7d
codecov
jmafoster1 Feb 17, 2025
2d85097
Clarified experimental estimator test.
jmafoster1 Feb 17, 2025
b9d2504
fixed pytest error
jmafoster1 Feb 17, 2025
1f1b8ab
Merge branch 'main' of github.com:CITCOM-project/CausalTestingFramewo…
jmafoster1 Feb 17, 2025
5b78c16
pylint
jmafoster1 Feb 17, 2025
ee98a2a
Updated docs
jmafoster1 Feb 17, 2025
7e33f80
fix: schematic diagram on homepage
f-allian Feb 17, 2025
61d1e3d
fix: remove mentions of data collector
f-allian Feb 17, 2025
9dd94f8
fix: misc typos
f-allian Feb 17, 2025
bedf769
add: note about 32-bit systems
f-allian Feb 17, 2025
3310bbe
Removed all mention of the causal test engine
jmafoster1 Feb 18, 2025
2786fdc
Fixed metamorphic relation
jmafoster1 Feb 18, 2025
c38706d
Fixed default behaviour of MR generation
jmafoster1 Feb 18, 2025
9a82172
Pylint
jmafoster1 Feb 18, 2025
1cef1fb
Examples
jmafoster1 Feb 18, 2025
78d1c3d
causal test case
jmafoster1 Feb 18, 2025
6071bd3
Reworked examples
jmafoster1 Feb 18, 2025
a15e15b
Removed causal test suite from poisson
jmafoster1 Feb 18, 2025
4b14212
Tests passing again
jmafoster1 Feb 18, 2025
31a70b1
Removed causal test suite
jmafoster1 Feb 18, 2025
7c0f0f2
Removed abstract causal test case + pylint
jmafoster1 Feb 18, 2025
e559295
Removed Z3
jmafoster1 Feb 18, 2025
0c55a82
pylint
jmafoster1 Feb 18, 2025
aaa18df
pylint
jmafoster1 Feb 18, 2025
67c973a
metamorphic relation codecov
jmafoster1 Feb 18, 2025
77ac1b8
causal dag coverage
jmafoster1 Feb 18, 2025
b86c584
All the tests pass and got rid of JSON front
jmafoster1 Feb 20, 2025
4d48785
IPCW outcome is now an output
jmafoster1 Feb 20, 2025
db5b8b8
pylint
jmafoster1 Feb 20, 2025
0af5898
pylint
jmafoster1 Feb 20, 2025
c86fcec
pylint
jmafoster1 Feb 20, 2025
26f814e
Forgot to save
jmafoster1 Feb 20, 2025
949c17d
Removed the docs for the deprecated frontends
jmafoster1 Feb 20, 2025
28c99b7
Removed test csv output
jmafoster1 Feb 20, 2025
98016cc
Merge branch 'main' into jmafoster1/remove-data-collector
f-allian Feb 26, 2025
b462aa1
Removed all mention of data collection and json front end
jmafoster1 Feb 27, 2025
15fa8ee
Merge branch 'jmafoster1/remove-data-collector' of github.com:CITCOM-…
jmafoster1 Feb 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
161 changes: 0 additions & 161 deletions causal_testing/data_collection/data_collector.py

This file was deleted.

103 changes: 103 additions & 0 deletions causal_testing/estimation/experimental_estimator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""This module contains the ExperimentalEstimator class for directly interacting with the system under test."""

from typing import Any
from abc import abstractmethod
import pandas as pd

from causal_testing.estimation.abstract_estimator import Estimator


class ExperimentalEstimator(Estimator):
    """An Experimental Estimator estimates causal effects by executing the system under test directly,
    rather than fitting a statistical model to observational data. It repeatedly runs the system with the
    control and treatment configurations and bootstraps confidence intervals from the paired run results.

    Concrete subclasses must implement :meth:`run_system` to perform a single execution of the system.
    """

    def __init__(
        # pylint: disable=too-many-arguments
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: dict[str, Any],
        outcome: str,
        effect_modifiers: dict[str, Any] = None,
        alpha: float = 0.05,
        repeats: int = 200,
    ):
        """
        :param treatment: Name of the treatment input variable.
        :param treatment_value: Value of the treatment variable in the treatment configuration.
        :param control_value: Value of the treatment variable in the control configuration.
        :param adjustment_set: Input variable names mapped to the values they are held at across all runs.
        :param outcome: Name of the outcome variable read from the system's output.
        :param effect_modifiers: Optional input names mapped to fixed values, merged into every
                                 configuration alongside the adjustment set. Defaults to empty.
        :param alpha: Significance level for the bootstrapped confidence intervals.
        :param repeats: Number of times to run the system under each configuration.
        """
        # pylint: disable=R0801
        super().__init__(
            treatment=treatment,
            treatment_value=treatment_value,
            control_value=control_value,
            adjustment_set=adjustment_set,
            outcome=outcome,
            effect_modifiers=effect_modifiers,
            alpha=alpha,
        )
        # Normalise the mutable default: never share a dict between instances.
        if effect_modifiers is None:
            self.effect_modifiers = {}
        self.repeats = repeats

    def add_modelling_assumptions(self):
        """
        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
        must hold if the resulting causal inference is to be considered valid.
        """
        self.modelling_assumptions.append(
            "The supplied number of repeats must be sufficient for statistical significance"
        )

    @abstractmethod
    def run_system(self, configuration: dict) -> dict:
        """
        Runs the system under test with the supplied configuration and supplies the outputs as a dict.
        :param configuration: The run configuration arguments.
        :returns: The resulting output as a dict.
        """

    def _bootstrap_effect(self, combine) -> tuple[pd.Series, list[pd.Series]]:
        """Run the system ``self.repeats`` times under the control and treatment configurations and
        bootstrap the effect computed by ``combine``.

        :param combine: Callable taking (treatment_outcomes, control_outcomes) pandas Series and
                        returning the per-run effect Series (e.g. difference or ratio).
        :return: The mean effect and the bootstrapped [low, high] confidence intervals.
        """
        control_configuration = self.adjustment_set | self.effect_modifiers | {self.treatment: self.control_value}
        treatment_configuration = self.adjustment_set | self.effect_modifiers | {self.treatment: self.treatment_value}

        control_outcomes = pd.DataFrame([self.run_system(control_configuration) for _ in range(self.repeats)])
        treatment_outcomes = pd.DataFrame([self.run_system(treatment_configuration) for _ in range(self.repeats)])

        effects = combine(treatment_outcomes[self.outcome], control_outcomes[self.outcome]).sort_values().reset_index()

        ci_low_index = round(self.repeats * (self.alpha / 2))
        ci_low = effects.iloc[ci_low_index]
        # Symmetric upper index; the -1 keeps it in bounds when ci_low_index rounds to 0
        # (the original `self.repeats - ci_low_index` raised IndexError in that case).
        ci_high = effects.iloc[self.repeats - ci_low_index - 1]

        return pd.Series({self.treatment: effects.mean()[self.outcome]}), [
            pd.Series({self.treatment: ci_low[self.outcome]}),
            pd.Series({self.treatment: ci_high[self.outcome]}),
        ]

    def estimate_ate(self) -> tuple[pd.Series, list[pd.Series]]:
        """Estimate the average treatment effect of the treatment on the outcome. That is, the change in outcome caused
        by changing the treatment variable from the control value to the treatment value.

        :return: The average treatment effect and the bootstrapped confidence intervals.
        """
        return self._bootstrap_effect(lambda treated, control: treated - control)

    def estimate_risk_ratio(self) -> tuple[pd.Series, list[pd.Series]]:
        """Estimate the risk ratio of the treatment on the outcome. That is, the change in outcome caused
        by changing the treatment variable from the control value to the treatment value.

        :return: The risk ratio and the bootstrapped confidence intervals.
        """
        return self._bootstrap_effect(lambda treated, control: treated / control)
20 changes: 8 additions & 12 deletions causal_testing/json_front/json_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import scipy
from fitter import Fitter, get_common_distributions

from causal_testing.data_collection.data_collector import ObservationalDataCollector
from causal_testing.generation.abstract_causal_test_case import AbstractCausalTestCase
from causal_testing.specification.causal_dag import CausalDAG
from causal_testing.specification.causal_specification import CausalSpecification
Expand Down Expand Up @@ -56,8 +55,8 @@ def __init__(self, output_path: str, output_overwrite: bool = False):
self.scenario = None
self.causal_specification = None
self.output_path = Path(output_path)
self.df = None
self.check_file_exists(self.output_path, output_overwrite)
self.data_collector = None

def set_paths(self, json_path: str, dag_path: str, data_paths: list[str] = None):
"""
Expand All @@ -70,7 +69,7 @@ def set_paths(self, json_path: str, dag_path: str, data_paths: list[str] = None)
data_paths = []
self.input_paths = JsonClassPaths(json_path=json_path, dag_path=dag_path, data_paths=data_paths)

def setup(self, scenario: Scenario, data=None, ignore_cycles=False):
def setup(self, scenario: Scenario, ignore_cycles=False):
"""Function to populate all the necessary parts of the json_class needed to execute tests"""
self.scenario = scenario
self._get_scenario_variables()
Expand All @@ -83,21 +82,20 @@ def setup(self, scenario: Scenario, data=None, ignore_cycles=False):
self.test_plan = json.load(f)
# Populate the data
if self.input_paths.data_paths:
data = pd.concat([pd.read_csv(data_file, header=0) for data_file in self.input_paths.data_paths])
if data is None or len(data) == 0:
self.df = pd.concat([pd.read_csv(data_file, header=0) for data_file in self.input_paths.data_paths])
if self.df is None or len(self.df) == 0:
raise ValueError(
"No data found. Please either provide a path to a file containing data or manually populate the .data "
"attribute with a dataframe before calling .setup()"
)
self.data_collector = ObservationalDataCollector(self.scenario, data)
self._populate_metas()

def _create_abstract_test_case(self, test, mutates, effects):
assert len(test["mutations"]) == 1
treatment_var = next(self.scenario.variables[v] for v in test["mutations"])

if not treatment_var.distribution:
fitter = Fitter(self.data_collector.data[treatment_var.name], distributions=get_common_distributions())
fitter = Fitter(self.df[treatment_var.name], distributions=get_common_distributions())
fitter.fit()
(dist, params) = list(fitter.get_best(method="sumsquare_error").items())[0]
treatment_var.distribution = getattr(scipy.stats, dist)(**params)
Expand Down Expand Up @@ -257,7 +255,7 @@ def _populate_metas(self):
Populate data with meta-variable values and add distributions to Causal Testing Framework Variables
"""
for meta in self.scenario.variables_of_type(Meta):
meta.populate(self.data_collector.data)
meta.populate(self.df)

def _execute_test_case(
self, causal_test_case: CausalTestCase, test: Mapping, f_flag: bool
Expand All @@ -273,9 +271,7 @@ def _execute_test_case(
failed = False

estimation_model = self._setup_test(causal_test_case=causal_test_case, test=test)
causal_test_result = causal_test_case.execute_test(
estimator=estimation_model, data_collector=self.data_collector
)
causal_test_result = causal_test_case.execute_test(estimator=estimation_model)
test_passes = causal_test_case.expected_causal_effect.apply(causal_test_result)

if "coverage" in test and test["coverage"]:
Expand Down Expand Up @@ -329,7 +325,7 @@ def _setup_test(self, causal_test_case: CausalTestCase, test: Mapping) -> Estima
estimator_kwargs["control_value"] = causal_test_case.control_value
estimator_kwargs["outcome"] = causal_test_case.outcome_variable.name
estimator_kwargs["effect_modifiers"] = causal_test_case.effect_modifier_configuration
estimator_kwargs["df"] = self.data_collector.collect_data()
estimator_kwargs["df"] = self.df
estimator_kwargs["alpha"] = test["alpha"] if "alpha" in test else 0.05

estimation_model = test["estimator"](**estimator_kwargs)
Expand Down
Loading