Removed data collector from causal test suite

jmafoster1 · jmafoster1 · commit b8ace8edc237 · 2025-02-17T10:05:16.000Z
diff --git a/causal_testing/json_front/json_class.py b/causal_testing/json_front/json_class.py
@@ -14,7 +14,6 @@
 import scipy
 from fitter import Fitter, get_common_distributions
 
-from causal_testing.data_collection.data_collector import ObservationalDataCollector
 from causal_testing.generation.abstract_causal_test_case import AbstractCausalTestCase
 from causal_testing.specification.causal_dag import CausalDAG
 from causal_testing.specification.causal_specification import CausalSpecification
@@ -56,8 +55,8 @@ def __init__(self, output_path: str, output_overwrite: bool = False):
         self.scenario = None
         self.causal_specification = None
         self.output_path = Path(output_path)
+        self.df = None
         self.check_file_exists(self.output_path, output_overwrite)
-        self.data_collector = None
 
     def set_paths(self, json_path: str, dag_path: str, data_paths: list[str] = None):
         """
@@ -70,7 +69,7 @@ def set_paths(self, json_path: str, dag_path: str, data_paths: list[str] = None)
             data_paths = []
         self.input_paths = JsonClassPaths(json_path=json_path, dag_path=dag_path, data_paths=data_paths)
 
-    def setup(self, scenario: Scenario, data=None, ignore_cycles=False):
+    def setup(self, scenario: Scenario, ignore_cycles=False):
         """Function to populate all the necessary parts of the json_class needed to execute tests"""
         self.scenario = scenario
         self._get_scenario_variables()
@@ -83,21 +82,20 @@ def setup(self, scenario: Scenario, data=None, ignore_cycles=False):
             self.test_plan = json.load(f)
         # Populate the data
         if self.input_paths.data_paths:
-            data = pd.concat([pd.read_csv(data_file, header=0) for data_file in self.input_paths.data_paths])
-        if data is None or len(data) == 0:
+            self.df = pd.concat([pd.read_csv(data_file, header=0) for data_file in self.input_paths.data_paths])
+        if self.df is None or len(self.df) == 0:
             raise ValueError(
                 "No data found. Please either provide a path to a file containing data or manually populate the .data "
                 "attribute with a dataframe before calling .setup()"
             )
-        self.data_collector = ObservationalDataCollector(self.scenario, data)
         self._populate_metas()
 
     def _create_abstract_test_case(self, test, mutates, effects):
         assert len(test["mutations"]) == 1
         treatment_var = next(self.scenario.variables[v] for v in test["mutations"])
 
         if not treatment_var.distribution:
-            fitter = Fitter(self.data_collector.data[treatment_var.name], distributions=get_common_distributions())
+            fitter = Fitter(self.df[treatment_var.name], distributions=get_common_distributions())
             fitter.fit()
             (dist, params) = list(fitter.get_best(method="sumsquare_error").items())[0]
             treatment_var.distribution = getattr(scipy.stats, dist)(**params)
@@ -257,7 +255,7 @@ def _populate_metas(self):
         Populate data with meta-variable values and add distributions to Causal Testing Framework Variables
         """
         for meta in self.scenario.variables_of_type(Meta):
-            meta.populate(self.data_collector.data)
+            meta.populate(self.df)
 
     def _execute_test_case(
         self, causal_test_case: CausalTestCase, test: Mapping, f_flag: bool
@@ -273,9 +271,7 @@ def _execute_test_case(
         failed = False
 
         estimation_model = self._setup_test(causal_test_case=causal_test_case, test=test)
-        causal_test_result = causal_test_case.execute_test(
-            estimator=estimation_model, data_collector=self.data_collector
-        )
+        causal_test_result = causal_test_case.execute_test(estimator=estimation_model)
         test_passes = causal_test_case.expected_causal_effect.apply(causal_test_result)
 
         if "coverage" in test and test["coverage"]:
@@ -329,7 +325,7 @@ def _setup_test(self, causal_test_case: CausalTestCase, test: Mapping) -> Estima
         estimator_kwargs["control_value"] = causal_test_case.control_value
         estimator_kwargs["outcome"] = causal_test_case.outcome_variable.name
         estimator_kwargs["effect_modifiers"] = causal_test_case.effect_modifier_configuration
-        estimator_kwargs["df"] = self.data_collector.collect_data()
+        estimator_kwargs["df"] = self.df
         estimator_kwargs["alpha"] = test["alpha"] if "alpha" in test else 0.05
 
         estimation_model = test["estimator"](**estimator_kwargs)
diff --git a/causal_testing/testing/causal_test_adequacy.py b/causal_testing/testing/causal_test_adequacy.py
@@ -105,7 +105,7 @@ def measure_adequacy(self):
             else:
                 estimator.df = estimator.df.sample(len(estimator.df), replace=True, random_state=i)
             try:
-                results.append(self.test_case.execute_test(estimator, None))
+                results.append(self.test_case.execute_test(estimator))
             except LinAlgError:
                 logger.warning("Adequacy LinAlgError")
                 continue
diff --git a/causal_testing/testing/causal_test_case.py b/causal_testing/testing/causal_test_case.py
@@ -2,13 +2,13 @@
 
 import logging
 from typing import Any
+import pandas as pd
 
 from causal_testing.specification.variable import Variable
 from causal_testing.testing.causal_test_outcome import CausalTestOutcome
 from causal_testing.testing.base_test_case import BaseTestCase
 from causal_testing.estimation.abstract_estimator import Estimator
 from causal_testing.testing.causal_test_result import CausalTestResult, TestValue
-from causal_testing.data_collection.data_collector import DataCollector
 
 
 logger = logging.getLogger(__name__)
@@ -58,25 +58,13 @@ def __init__(
         else:
             self.effect_modifier_configuration = {}
 
-    def execute_test(self, estimator: type(Estimator), data_collector: DataCollector) -> CausalTestResult:
+    def execute_test(self, estimator: type(Estimator)) -> CausalTestResult:
         """Execute a causal test case and return the causal test result.
 
-        :param estimator: A reference to an Estimator class.
-        :param data_collector: The data collector to be used which provides a dataframe for the Estimator
+        :param estimator: An Estimator class object
         :return causal_test_result: A CausalTestResult for the executed causal test case.
         """
-        if estimator.df is None:
-            estimator.df = data_collector.collect_data()
-
-        causal_test_result = self._return_causal_test_results(estimator)
-        return causal_test_result
-
-    def _return_causal_test_results(self, estimator) -> CausalTestResult:
-        """Depending on the estimator used, calculate the 95% confidence intervals and return in a causal_test_result
 
-        :param estimator: An Estimator class object
-        :return: a CausalTestResult object containing the confidence intervals
-        """
         if not hasattr(estimator, f"estimate_{self.estimate_type}"):
             raise AttributeError(f"{estimator.__class__} has no {self.estimate_type} method.")
         estimate_effect = getattr(estimator, f"estimate_{self.estimate_type}")
diff --git a/causal_testing/testing/causal_test_suite.py b/causal_testing/testing/causal_test_suite.py
@@ -2,14 +2,14 @@
 https://causal-testing-framework.readthedocs.io/en/latest/test_suite.html"""
 
 import logging
-
-from collections import UserDict
 from typing import Type, Iterable
+from collections import UserDict
+import pandas as pd
+
 from causal_testing.testing.base_test_case import BaseTestCase
 from causal_testing.testing.causal_test_case import CausalTestCase
 from causal_testing.estimation.abstract_estimator import Estimator
 from causal_testing.testing.causal_test_result import CausalTestResult
-from causal_testing.data_collection.data_collector import DataCollector
 from causal_testing.specification.causal_specification import CausalSpecification
 
 logger = logging.getLogger(__name__)
@@ -47,17 +47,14 @@ def add_test_object(
         self.data[base_test_case] = test_object
 
     def execute_test_suite(
-        self, data_collector: DataCollector, causal_specification: CausalSpecification
+        self, causal_specification: CausalSpecification, df: pd.DataFrame
     ) -> dict[str, CausalTestResult]:
         """Execute a suite of causal tests and return the results in a list
-        :param data_collector: The data collector to be used for the test_suite. Can be observational, experimental or
-         custom
-        :param causal_specification:
+        :param causal_specification: A causal specification object which wraps up the scenario and causal DAG.
+        :param df: A dataframe containing the test data.
         :return: A dictionary where each key is the name of the estimators specified and the values are lists of
                 causal_test_result objects
         """
-        if data_collector.data.empty:
-            raise ValueError("No data has been loaded. Please call load_data prior to executing a causal test case.")
         test_suite_results = {}
         for edge in self:
             logger.info("treatment: %s", edge.treatment_variable)
@@ -79,8 +76,9 @@ def execute_test_suite(
                         test.control_value,
                         minimal_adjustment_set,
                         test.outcome_variable.name,
+                        df=df,
                     )
-                    causal_test_result = test.execute_test(estimator, data_collector)
+                    causal_test_result = test.execute_test(estimator)
                     causal_test_results.append(causal_test_result)
 
                 results[estimator_class.__name__] = causal_test_results
diff --git a/examples/covasim_/doubling_beta/example_beta.py b/examples/covasim_/doubling_beta/example_beta.py
@@ -65,9 +65,7 @@ def doubling_beta_CATE_on_csv(
     )
 
     # Add squared terms for beta, since it has a quadratic relationship with cumulative infections
-    causal_test_result = causal_test_case.execute_test(
-        estimator=linear_regression_estimator, data_collector=data_collector
-    )
+    causal_test_result = causal_test_case.execute_test(estimator=linear_regression_estimator)
 
     # Repeat for association estimate (no adjustment)
     no_adjustment_linear_regression_estimator = LinearRegressionEstimator(
@@ -79,9 +77,7 @@ def doubling_beta_CATE_on_csv(
         df=past_execution_df,
         formula="cum_infections ~ beta + I(beta ** 2)",
     )
-    association_test_result = causal_test_case.execute_test(
-        estimator=no_adjustment_linear_regression_estimator, data_collector=data_collector
-    )
+    association_test_result = causal_test_case.execute_test(estimator=no_adjustment_linear_regression_estimator)
 
     # Store results for plotting
     results_dict["association"] = {
@@ -111,9 +107,7 @@ def doubling_beta_CATE_on_csv(
             df=counterfactual_past_execution_df,
             formula="cum_infections ~ beta + I(beta ** 2) + avg_age + contacts",
         )
-        counterfactual_causal_test_result = causal_test_case.execute_test(
-            estimator=linear_regression_estimator, data_collector=data_collector
-        )
+        counterfactual_causal_test_result = causal_test_case.execute_test(estimator=linear_regression_estimator)
 
         results_dict["counterfactual"] = {
             "ate": counterfactual_causal_test_result.test_value.value,
diff --git a/examples/lr91/example_max_conductances.py b/examples/lr91/example_max_conductances.py
@@ -5,7 +5,6 @@
 from causal_testing.specification.scenario import Scenario
 from causal_testing.specification.variable import Input, Output
 from causal_testing.specification.causal_specification import CausalSpecification
-from causal_testing.data_collection.data_collector import ObservationalDataCollector
 from causal_testing.testing.causal_test_case import CausalTestCase
 from causal_testing.testing.causal_test_outcome import Positive, Negative, NoEffect
 from causal_testing.estimation.linear_regression_estimator import LinearRegressionEstimator
@@ -134,17 +133,19 @@ def effects_on_APD90(observational_data_path, treatment_var, control_val, treatm
         treatment_value=treatment_val,
     )
 
-    # 7. Create a data collector
-    data_collector = ObservationalDataCollector(scenario, pd.read_csv(observational_data_path))
-
     # 8. Obtain the minimal adjustment set from the causal DAG
     minimal_adjustment_set = causal_dag.identification(base_test_case)
     linear_regression_estimator = LinearRegressionEstimator(
-        treatment_var.name, treatment_val, control_val, minimal_adjustment_set, "APD90"
+        treatment_var.name,
+        treatment_val,
+        control_val,
+        minimal_adjustment_set,
+        "APD90",
+        df=pd.read_csv(observational_data_path),
     )
 
     # 9. Run the causal test and print results
-    causal_test_result = causal_test_case.execute_test(linear_regression_estimator, data_collector)
+    causal_test_result = causal_test_case.execute_test(linear_regression_estimator)
     logger.info("%s", causal_test_result)
     return causal_test_result.test_value.value, causal_test_result.confidence_intervals
 
diff --git a/examples/lr91/example_max_conductances_test_suite.py b/examples/lr91/example_max_conductances_test_suite.py
@@ -5,7 +5,6 @@
 from causal_testing.specification.scenario import Scenario
 from causal_testing.specification.variable import Input, Output
 from causal_testing.specification.causal_specification import CausalSpecification
-from causal_testing.data_collection.data_collector import ObservationalDataCollector
 from causal_testing.testing.causal_test_case import CausalTestCase
 from causal_testing.testing.causal_test_outcome import Positive, Negative, NoEffect
 from causal_testing.estimation.linear_regression_estimator import LinearRegressionEstimator
@@ -143,11 +142,8 @@ def effects_on_APD90(observational_data_path, test_suite):
     # 5. Create a causal specification from the scenario and causal DAG
     causal_specification = CausalSpecification(scenario, causal_dag)
 
-    # 7. Create a data collector
-    data_collector = ObservationalDataCollector(scenario, pd.read_csv(observational_data_path))
-
     # 8. Run the causal test suite
-    causal_test_results = test_suite.execute_test_suite(data_collector, causal_specification)
+    causal_test_results = test_suite.execute_test_suite(causal_specification, pd.read_csv(observational_data_path))
     return causal_test_results
 
 
diff --git a/examples/poisson-line-process/example_poisson_process.py b/examples/poisson-line-process/example_poisson_process.py
@@ -116,7 +116,7 @@ def causal_test_intensity_num_shapes(
         )
 
     # 9. Execute the test
-    causal_test_result = causal_test_case.execute_test(estimator, None)
+    causal_test_result = causal_test_case.execute_test(estimator)
 
     return causal_test_result
 
diff --git a/examples/poisson/example_run_causal_tests.py b/examples/poisson/example_run_causal_tests.py
@@ -7,7 +7,6 @@
 from causal_testing.testing.causal_test_outcome import ExactValue, Positive, Negative, NoEffect, CausalTestOutcome
 from causal_testing.testing.causal_test_result import CausalTestResult
 from causal_testing.json_front.json_class import JsonUtility
-from causal_testing.estimation.abstract_estimator import Estimator
 from causal_testing.specification.scenario import Scenario
 from causal_testing.specification.variable import Input, Output, Meta
 
diff --git a/tests/testing_tests/test_causal_test_adequacy.py b/tests/testing_tests/test_causal_test_adequacy.py
@@ -138,7 +138,7 @@ def test_data_adequacy_group_by(self):
             treatment_value=treatment_strategy,
             estimate_type="hazard_ratio",
         )
-        causal_test_result = causal_test_case.execute_test(estimation_model, None)
+        causal_test_result = causal_test_case.execute_test(estimation_model)
         adequacy_metric = DataAdequacy(causal_test_case, estimation_model, group_by="id")
         adequacy_metric.measure_adequacy()
         adequacy_dict = adequacy_metric.to_dict()
diff --git a/tests/testing_tests/test_causal_test_case.py b/tests/testing_tests/test_causal_test_case.py
@@ -8,7 +8,6 @@
 from causal_testing.specification.causal_specification import CausalSpecification, Scenario
 from causal_testing.specification.variable import Input, Output
 from causal_testing.specification.causal_dag import CausalDAG
-from causal_testing.data_collection.data_collector import ObservationalDataCollector
 from causal_testing.testing.causal_test_case import CausalTestCase
 from causal_testing.testing.causal_test_outcome import ExactValue
 from causal_testing.estimation.linear_regression_estimator import LinearRegressionEstimator
@@ -83,17 +82,13 @@ def setUp(self) -> None:
 
         # 4. Create dummy test data and write to csv
         np.random.seed(1)
-        df = pd.DataFrame({"D": list(np.random.normal(60, 10, 1000))})  # D = exogenous
-        df["A"] = [1 if d > 50 else 0 for d in df["D"]]
-        df["C"] = df["D"] + (4 * (df["A"] + 2))  # C = (4*(A+2)) + D
-        self.observational_data_csv_path = os.path.join(self.temp_dir_path, "observational_data.csv")
-        df.to_csv(self.observational_data_csv_path, index=False)
-
-        # 5. Create observational data collector
-        # Obsolete?
-        self.data_collector = ObservationalDataCollector(self.scenario, df)
-        self.data_collector.collect_data()
-        self.df = self.data_collector.collect_data()
+        self.df = pd.DataFrame({"D": list(np.random.normal(60, 10, 1000))})  # D = exogenous
+        self.df["A"] = [1 if d > 50 else 0 for d in self.df["D"]]
+        self.df["C"] = self.df["D"] + (4 * (self.df["A"] + 2))  # C = (4*(A+2)) + D
+        # self.observational_data_csv_path = os.path.join(self.temp_dir_path, "observational_data.csv")
+        # self.df.to_csv(self.observational_data_csv_path, index=False)
+
+        # 5. Create minimal adjustment set
         self.minimal_adjustment_set = self.causal_dag.identification(self.base_test_case)
         # 6. Easier to access treatment and outcome values
         self.treatment_value = 1
@@ -126,7 +121,7 @@ def test_execute_test_observational_linear_regression_estimator(self):
             "C",
             self.df,
         )
-        causal_test_result = self.causal_test_case.execute_test(estimation_model, self.data_collector)
+        causal_test_result = self.causal_test_case.execute_test(estimation_model)
         pd.testing.assert_series_equal(causal_test_result.test_value.value, pd.Series(4.0), atol=1e-10)
 
     def test_execute_test_observational_linear_regression_estimator_direct_effect(self):
@@ -153,7 +148,7 @@ def test_execute_test_observational_linear_regression_estimator_direct_effect(se
             "C",
             self.df,
         )
-        causal_test_result = causal_test_case.execute_test(estimation_model, self.data_collector)
+        causal_test_result = causal_test_case.execute_test(estimation_model)
         pd.testing.assert_series_equal(causal_test_result.test_value.value, pd.Series(4.0), atol=1e-10)
 
     def test_execute_test_observational_linear_regression_estimator_coefficient(self):
@@ -168,7 +163,7 @@ def test_execute_test_observational_linear_regression_estimator_coefficient(self
             self.df,
         )
         self.causal_test_case.estimate_type = "coefficient"
-        causal_test_result = self.causal_test_case.execute_test(estimation_model, self.data_collector)
+        causal_test_result = self.causal_test_case.execute_test(estimation_model)
         pd.testing.assert_series_equal(causal_test_result.test_value.value, pd.Series({"D": 0.0}), atol=1e-1)
 
     def test_execute_test_observational_linear_regression_estimator_risk_ratio(self):
@@ -183,7 +178,7 @@ def test_execute_test_observational_linear_regression_estimator_risk_ratio(self)
             self.df,
         )
         self.causal_test_case.estimate_type = "risk_ratio"
-        causal_test_result = self.causal_test_case.execute_test(estimation_model, self.data_collector)
+        causal_test_result = self.causal_test_case.execute_test(estimation_model)
         pd.testing.assert_series_equal(causal_test_result.test_value.value, pd.Series(0.0), atol=1)
 
     def test_invalid_estimate_type(self):
@@ -199,7 +194,7 @@ def test_invalid_estimate_type(self):
         )
         self.causal_test_case.estimate_type = "invalid"
         with self.assertRaises(AttributeError):
-            self.causal_test_case.execute_test(estimation_model, self.data_collector)
+            self.causal_test_case.execute_test(estimation_model)
 
     def test_execute_test_observational_linear_regression_estimator_squared_term(self):
         """Check that executing the causal test case returns the correct results for dummy data with a squared term
@@ -213,5 +208,5 @@ def test_execute_test_observational_linear_regression_estimator_squared_term(sel
             self.df,
             formula=f"C ~ A + {'+'.join(self.minimal_adjustment_set)} + (D ** 2)",
         )
-        causal_test_result = self.causal_test_case.execute_test(estimation_model, self.data_collector)
+        causal_test_result = self.causal_test_case.execute_test(estimation_model)
         pd.testing.assert_series_equal(causal_test_result.test_value.value, pd.Series(4.0), atol=1)
diff --git a/tests/testing_tests/test_causal_test_suite.py b/tests/testing_tests/test_causal_test_suite.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -116,7 +116,7 @@ def causal_test_intensity_num_shapes(` |
| `116` | `116` | `)` |
| `117` | `117` | |
| `118` | `118` | `# 9. Execute the test` |
| `119` | | `- causal_test_result = causal_test_case.execute_test(estimator, None)` |
| | `119` | `+ causal_test_result = causal_test_case.execute_test(estimator)` |
| `120` | `120` | |
| `121` | `121` | `return causal_test_result` |
| `122` | `122` | |
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -138,7 +138,7 @@ def test_data_adequacy_group_by(self):` |
| `138` | `138` | `treatment_value=treatment_strategy,` |
| `139` | `139` | `estimate_type="hazard_ratio",` |
| `140` | `140` | `)` |
| `141` | | `- causal_test_result = causal_test_case.execute_test(estimation_model, None)` |
| | `141` | `+ causal_test_result = causal_test_case.execute_test(estimation_model)` |
| `142` | `142` | `adequacy_metric = DataAdequacy(causal_test_case, estimation_model, group_by="id")` |
| `143` | `143` | `adequacy_metric.measure_adequacy()` |
| `144` | `144` | `adequacy_dict = adequacy_metric.to_dict()` |