Merge pull request #162 from CITCOM-project/json_read_multiple_files

christopher-wild · web-flow · commit e23b9b16531f · 2023-03-16T02:24:28.000-07:00
JSON front end: read runtime data from multiple files
diff --git a/causal_testing/data_collection/data_collector.py b/causal_testing/data_collection/data_collector.py
@@ -127,22 +127,23 @@ def run_system_with_input_configuration(self, input_configuration: dict) -> pd.D
 
 
 class ObservationalDataCollector(DataCollector):
-    """A data collector that extracts data that is relevant to the specified scenario from a csv of execution data."""
+    """A data collector that extracts data that is relevant to the specified scenario from a dataframe of execution
+    data."""
 
-    def __init__(self, scenario: Scenario, csv_path: str):
+    def __init__(self, scenario: Scenario, data: pd.DataFrame):
         super().__init__(scenario)
-        self.csv_path = csv_path
+        self.data = data
 
     def collect_data(self, **kwargs) -> pd.DataFrame:
-        """Read a csv containing execution data for the system-under-test into a pandas dataframe and filter to remove
+        """Filter the supplied pandas dataframe of execution data to remove
         any data which is invalid for the scenario-under-test.
 
         Data is invalid if it does not meet the constraints outlined in the scenario-under-test (Scenario).
 
         :return: A pandas dataframe containing execution data that is valid for the scenario-under-test.
         """
 
-        execution_data_df = pd.read_csv(self.csv_path, **kwargs)
+        execution_data_df = self.data
         for meta in self.scenario.metas():
             meta.populate(execution_data_df)
         scenario_execution_data_df = self.filter_valid_data(execution_data_df)
diff --git a/causal_testing/json_front/json_class.py b/causal_testing/json_front/json_class.py
@@ -46,20 +46,20 @@ class JsonUtility(ABC):
     def __init__(self, log_path):
         self.paths = None
         self.variables = None
-        self.data = None
+        self.data = []
         self.test_plan = None
         self.modelling_scenario = None
         self.causal_specification = None
         self.setup_logger(log_path)
 
-    def set_paths(self, json_path: str, dag_path: str, data_path: str):
+    def set_paths(self, json_path: str, dag_path: str, data_paths: str):
         """
         Takes the paths of the scenario specific files and stores them as a JsonClassPaths instance
         :param json_path: string path representation to .json file containing test specifications
         :param dag_path: string path representation to the .dot file containing the Causal DAG
         :param data_paths: list of string path representations to the data files
         """
-        self.paths = JsonClassPaths(json_path=json_path, dag_path=dag_path, data_path=data_path)
+        self.paths = JsonClassPaths(json_path=json_path, dag_path=dag_path, data_paths=data_paths)
 
     def set_variables(self, inputs: list[dict], outputs: list[dict], metas: list[dict]):
         """Populate the Causal Variables
@@ -132,14 +132,15 @@ def _json_parse(self):
         """Parse a JSON input file into inputs, outputs, metas and a test plan"""
         with open(self.paths.json_path, encoding="utf-8") as f:
             self.test_plan = json.load(f)
-
-        self.data = pd.read_csv(self.paths.data_path)
+        for data_file in self.paths.data_paths:
+            df = pd.read_csv(data_file, header=0)
+            self.data.append(df)
+        self.data = pd.concat(self.data)
 
     def _populate_metas(self):
         """
         Populate data with meta-variable values and add distributions to Causal Testing Framework Variables
         """
-
         for meta in self.variables.metas:
             meta.populate(self.data)
 
@@ -193,8 +194,10 @@ def _setup_test(self, causal_test_case: CausalTestCase, estimator: Estimator) ->
                 - causal_test_engine - Test Engine instance for the test being run
                 - estimation_model - Estimator instance for the test being run
         """
-        data_collector = ObservationalDataCollector(self.modelling_scenario, self.paths.data_path)
+
+        data_collector = ObservationalDataCollector(self.modelling_scenario, self.data)
         causal_test_engine = CausalTestEngine(self.causal_specification, data_collector, index_col=0)
+
         minimal_adjustment_set = self.causal_specification.causal_dag.identification(causal_test_case.base_test_case)
         treatment_var = causal_test_case.treatment_variable
         minimal_adjustment_set = minimal_adjustment_set - {treatment_var}
@@ -252,6 +255,7 @@ def get_args(test_args=None) -> argparse.Namespace:
             "--data_path",
             help="Specify path to file containing runtime data",
             required=True,
+            nargs="+",
         )
         parser.add_argument(
             "--dag_path",
@@ -277,12 +281,12 @@ class JsonClassPaths:
 
     json_path: Path
     dag_path: Path
-    data_path: Path
+    data_paths: list[Path]
 
-    def __init__(self, json_path: str, dag_path: str, data_path: str):
+    def __init__(self, json_path: str, dag_path: str, data_paths: str):
         self.json_path = Path(json_path)
         self.dag_path = Path(dag_path)
-        self.data_path = Path(data_path)
+        self.data_paths = [Path(path) for path in data_paths]
 
 
 @dataclass()
diff --git a/causal_testing/testing/causal_test_engine.py b/causal_testing/testing/causal_test_engine.py
@@ -71,9 +71,11 @@ def execute_test_suite(self, test_suite: CausalTestSuite) -> list[CausalTestResu
             minimal_adjustment_set = minimal_adjustment_set - set(edge.treatment_variable.name)
             minimal_adjustment_set = minimal_adjustment_set - set(edge.outcome_variable.name)
 
-            variables_for_positivity = (
-                list(minimal_adjustment_set) + [edge.treatment_variable.name] + [edge.outcome_variable.name]
-            )
+            variables_for_positivity = list(minimal_adjustment_set) + [
+                edge.treatment_variable.name,
+                edge.outcome_variable.name,
+            ]
+
             if self._check_positivity_violation(variables_for_positivity):
                 raise ValueError("POSITIVITY VIOLATION -- Cannot proceed.")
 
@@ -209,13 +211,15 @@ def _check_positivity_violation(self, variables_list):
         :param variables_list: The list of variables for which positivity must be satisfied.
         :return: True if positivity is violated, False otherwise.
         """
-        if not set(variables_list).issubset(self.scenario_execution_data_df.columns):
+        if not (set(variables_list) - {x.name for x in self.scenario.hidden_variables()}).issubset(
+            self.scenario_execution_data_df.columns
+        ):
             missing_variables = set(variables_list) - set(self.scenario_execution_data_df.columns)
             logger.warning(
-                "Positivity violation: missing data for variables {missing_variables}.\n"
+                "Positivity violation: missing data for variables %s.\n"
                 "Causal inference is only valid if a well-specified parametric model is used.\n"
                 "Alternatively, consider restricting analysis to executions without the variables:"
-                " %s.",
+                ".",
                 missing_variables,
             )
             return True
diff --git a/tests/data_collection_tests/test_observational_data_collector.py b/tests/data_collection_tests/test_observational_data_collector.py
@@ -38,18 +38,18 @@ class Color(Enum):
 
     def test_not_all_variables_in_data(self):
         scenario = Scenario({self.X1, self.X2, self.X3, self.X4})
-        observational_data_collector = ObservationalDataCollector(scenario, self.observational_df_path)
+        observational_data_collector = ObservationalDataCollector(scenario, self.observational_df)
         self.assertRaises(IndexError, observational_data_collector.collect_data)
 
     def test_all_variables_in_data(self):
         scenario = Scenario({self.X1, self.X2, self.X3, self.Y1, self.Y2})
-        observational_data_collector = ObservationalDataCollector(scenario, self.observational_df_path)
+        observational_data_collector = ObservationalDataCollector(scenario, self.observational_df)
         df = observational_data_collector.collect_data(index_col=0)
         assert df.equals(self.observational_df), f"\n{df}\nwas not equal to\n{self.observational_df}"
 
     def test_data_constraints(self):
         scenario = Scenario({self.X1, self.X2, self.X3, self.Y1, self.Y2}, {self.X1.z3 > 2})
-        observational_data_collector = ObservationalDataCollector(scenario, self.observational_df_path)
+        observational_data_collector = ObservationalDataCollector(scenario, self.observational_df)
         df = observational_data_collector.collect_data(index_col=0)
         expected = self.observational_df.loc[[2, 3]]
         assert df.equals(expected), f"\n{df}\nwas not equal to\n{expected}"
@@ -60,7 +60,7 @@ def populate_m(data):
 
         meta = Meta("M", int, populate_m)
         scenario = Scenario({self.X1, meta})
-        observational_data_collector = ObservationalDataCollector(scenario, self.observational_df_path)
+        observational_data_collector = ObservationalDataCollector(scenario, self.observational_df)
         data = observational_data_collector.collect_data()
         assert all((m == 2 * x1 for x1, m in zip(data["X1"], data["M"])))
 
diff --git a/tests/json_front_tests/test_json_class.py b/tests/json_front_tests/test_json_class.py
@@ -26,9 +26,9 @@ def setUp(self) -> None:
         dag_file_name = "dag.dot"
         data_file_name = "data.csv"
         test_data_dir_path = Path("tests/resources/data")
-        self.json_path = test_data_dir_path / json_file_name
-        self.dag_path = test_data_dir_path / dag_file_name
-        self.data_path = test_data_dir_path / data_file_name
+        self.json_path = str(test_data_dir_path / json_file_name)
+        self.dag_path = str(test_data_dir_path / dag_file_name)
+        self.data_path = [str(test_data_dir_path / data_file_name)]
         self.json_class = JsonUtility("logs.log")
         self.example_distribution = scipy.stats.uniform(1, 10)
         self.input_dict_list = [{"name": "test_input", "datatype": float, "distribution": self.example_distribution}]
@@ -40,7 +40,7 @@ def setUp(self) -> None:
     def test_setting_paths(self):
         self.assertEqual(self.json_class.paths.json_path, Path(self.json_path))
         self.assertEqual(self.json_class.paths.dag_path, Path(self.dag_path))
-        self.assertEqual(self.json_class.paths.data_path, Path(self.data_path))
+        self.assertEqual(self.json_class.paths.data_paths, [Path(self.data_path[0])])  # Needs to be list of Paths
 
     def test_set_inputs(self):
         ctf_input = [Input("test_input", float, self.example_distribution)]
@@ -61,7 +61,7 @@ def test_set_metas(self):
 
     def test_argparse(self):
         args = self.json_class.get_args(["--data_path=data.csv", "--dag_path=dag.dot", "--json_path=tests.json"])
-        self.assertEqual(args.data_path, "data.csv")
+        self.assertEqual(args.data_path, ["data.csv"])
         self.assertEqual(args.dag_path, "dag.dot")
         self.assertEqual(args.json_path, "tests.json")
 
diff --git a/tests/testing_tests/test_causal_test_engine.py b/tests/testing_tests/test_causal_test_engine.py
@@ -59,7 +59,7 @@ def setUp(self) -> None:
 
         # 5. Create observational data collector
         # Obsolete?
-        self.data_collector = ObservationalDataCollector(self.scenario, self.observational_data_csv_path)
+        self.data_collector = ObservationalDataCollector(self.scenario, df)
 
         # 5. Create causal test engine
         self.causal_test_engine = CausalTestEngine(self.causal_specification, self.data_collector)
diff --git a/tests/testing_tests/test_causal_test_outcome.py b/tests/testing_tests/test_causal_test_outcome.py
@@ -27,13 +27,17 @@ def test_None_ci(self):
 
         self.assertIsNone(ctr.ci_low())
         self.assertIsNone(ctr.ci_high())
-        self.assertEqual(ctr.to_dict(),
-            {"treatment": "A",
-            "control_value": 0,
-            "treatment_value": 1,
-            "outcome": "A",
-            "adjustment_set": set(),
-            "test_value": test_value})
+        self.assertEqual(
+            ctr.to_dict(),
+            {
+                "treatment": "A",
+                "control_value": 0,
+                "treatment_value": 1,
+                "outcome": "A",
+                "adjustment_set": set(),
+                "test_value": test_value,
+            },
+        )
 
     def test_empty_adjustment_set(self):
         test_value = TestValue(type="ate", value=0)
@@ -46,13 +50,18 @@ def test_empty_adjustment_set(self):
 
         self.assertIsNone(ctr.ci_low())
         self.assertIsNone(ctr.ci_high())
-        self.assertEqual(str(ctr), ("Causal Test Result\n==============\n"
-            "Treatment: A\n"
-            "Control value: 0\n"
-            "Treatment value: 1\n"
-            "Outcome: A\n"
-            "Adjustment set: set()\n"
-            "ate: 0\n" ))
+        self.assertEqual(
+            str(ctr),
+            (
+                "Causal Test Result\n==============\n"
+                "Treatment: A\n"
+                "Control value: 0\n"
+                "Treatment value: 1\n"
+                "Outcome: A\n"
+                "Adjustment set: set()\n"
+                "ate: 0\n"
+            ),
+        )
 
     def test_exactValue_pass(self):
         test_value = TestValue(type="ate", value=5.05)
@@ -97,20 +106,29 @@ def test_someEffect_fail(self):
         )
         ev = SomeEffect()
         self.assertFalse(ev.apply(ctr))
-        self.assertEqual(str(ctr), ("Causal Test Result\n==============\n"
-            "Treatment: A\n"
-            "Control value: 0\n"
-            "Treatment value: 1\n"
-            "Outcome: A\n"
-            "Adjustment set: set()\n"
-            "ate: 0\n"
-            "Confidence intervals: [-0.1, 0.2]\n" ))
-        self.assertEqual(ctr.to_dict(),
-            {"treatment": "A",
-            "control_value": 0,
-            "treatment_value": 1,
-            "outcome": "A",
-            "adjustment_set": set(),
-            "test_value": test_value,
-            "ci_low": -0.1,
-            "ci_high": 0.2})
+        self.assertEqual(
+            str(ctr),
+            (
+                "Causal Test Result\n==============\n"
+                "Treatment: A\n"
+                "Control value: 0\n"
+                "Treatment value: 1\n"
+                "Outcome: A\n"
+                "Adjustment set: set()\n"
+                "ate: 0\n"
+                "Confidence intervals: [-0.1, 0.2]\n"
+            ),
+        )
+        self.assertEqual(
+            ctr.to_dict(),
+            {
+                "treatment": "A",
+                "control_value": 0,
+                "treatment_value": 1,
+                "outcome": "A",
+                "adjustment_set": set(),
+                "test_value": test_value,
+                "ci_low": -0.1,
+                "ci_high": 0.2,
+            },
+        )
diff --git a/tests/testing_tests/test_causal_test_suite.py b/tests/testing_tests/test_causal_test_suite.py
@@ -40,9 +40,7 @@ def setUp(self) -> None:
         df = pd.DataFrame({"D": list(np.random.normal(60, 10, 1000))})  # D = exogenous
         df["A"] = [1 if d > 50 else 0 for d in df["D"]]
         df["C"] = df["D"] + (4 * (df["A"] + 2))  # C = (4*(A+2)) + D
-        self.observational_data_csv_path = os.path.join(temp_dir_path, "observational_data.csv")
-        df.to_csv(self.observational_data_csv_path, index=False)
-
+        self.df = df
         self.causal_dag = CausalDAG(dag_dot_path)
 
         # 3. Specify data structures required for test suite
@@ -126,6 +124,6 @@ def create_causal_test_engine(self):
         """
         causal_specification = CausalSpecification(self.scenario, self.causal_dag)
 
-        data_collector = ObservationalDataCollector(self.scenario, self.observational_data_csv_path)
+        data_collector = ObservationalDataCollector(self.scenario, self.df)
         causal_test_engine = CausalTestEngine(causal_specification, data_collector)
         return causal_test_engine