@@ -4,18 +4,22 @@
 import pandas as pd
 import z3
 from scipy import stats
+import itertools
 
 from causal_testing.specification.scenario import Scenario
 from causal_testing.specification.variable import Variable
 from causal_testing.testing.causal_test_case import CausalTestCase
 from causal_testing.testing.causal_test_outcome import CausalTestOutcome
+from causal_testing.testing.base_test_case import BaseTestCase
+
+from enum import Enum
 
 logger = logging.getLogger(__name__)
 
 
 class AbstractCausalTestCase:
     """
-    An abstract test case serves as a generator for concrete test cases. Instead of having concrete conctrol
+    An abstract test case serves as a generator for concrete test cases. Instead of having concrete control
     and treatment values, we instead just specify the intervention and the treatment variables. This then
     enables potentially infinite concrete test cases to be generated between different values of the treatment.
     """
@@ -24,23 +28,26 @@ def __init__(
         self,
         scenario: Scenario,
         intervention_constraints: set[z3.ExprRef],
-        treatment_variables: set[Variable],
+        treatment_variable: Variable,
         expected_causal_effect: dict[Variable:CausalTestOutcome],
         effect_modifiers: set[Variable] = None,
         estimate_type: str = "ate",
+        effect: str = "total",
     ):
-        assert treatment_variables.issubset(scenario.variables.values()), (
-            "Treatment variables must be a subset of variables."
-            + f" Instead got:\ntreatment_variables={treatment_variables}\nvariables={scenario.variables}"
-        )
+        if treatment_variable not in scenario.variables.values():
+            raise ValueError(
+                "Treatment variables must be a subset of variables."
+                + f" Instead got:\ntreatment_variables={treatment_variable}\nvariables={scenario.variables}"
+            )
 
         assert len(expected_causal_effect) == 1, "We currently only support tests with one causal outcome"
 
         self.scenario = scenario
         self.intervention_constraints = intervention_constraints
-        self.treatment_variables = treatment_variables
+        self.treatment_variable = treatment_variable
         self.expected_causal_effect = expected_causal_effect
         self.estimate_type = estimate_type
+        self.effect = effect
 
         if effect_modifiers is not None:
             self.effect_modifiers = effect_modifiers
@@ -100,7 +107,12 @@ def _generate_concrete_tests(
             for c in self.intervention_constraints:
                 optimizer.assert_and_track(c, str(c))
 
-            optimizer.add_soft([self.scenario.variables[v].z3 == row[v] for v in run_columns])
+            for v in run_columns:
+                optimizer.add_soft(
+                    self.scenario.variables[v].z3
+                    == self.scenario.variables[v].z3_val(self.scenario.variables[v].z3, row[v])
+                )
+
             if optimizer.check() == z3.unsat:
                 logger.warning(
                     "Satisfiability of test case was unsat.\nConstraints\n%s\nUnsat core %s",
@@ -109,13 +121,19 @@ def _generate_concrete_tests(
                 )
             model = optimizer.model()
 
+            base_test_case = BaseTestCase(
+                treatment_variable=self.treatment_variable,
+                outcome_variable=list(self.expected_causal_effect.keys())[0],
+                effect=self.effect,
+            )
+
             concrete_test = CausalTestCase(
-                control_input_configuration={v: v.cast(model[v.z3]) for v in self.treatment_variables},
-                treatment_input_configuration={
-                    v: v.cast(model[self.scenario.treatment_variables[v.name].z3]) for v in self.treatment_variables
-                },
+                base_test_case=base_test_case,
+                control_value=self.treatment_variable.cast(model[self.treatment_variable.z3]),
+                treatment_value=self.treatment_variable.cast(
+                    model[self.scenario.treatment_variables[self.treatment_variable.name].z3]
+                ),
                 expected_causal_effect=list(self.expected_causal_effect.values())[0],
-                outcome_variables=list(self.expected_causal_effect.keys()),
                 estimate_type=self.estimate_type,
                 effect_modifier_configuration={v: v.cast(model[v.z3]) for v in self.effect_modifiers},
             )
@@ -128,19 +146,20 @@ def _generate_concrete_tests(
                     + f"{constraints}\nUsing value {v.cast(model[v.z3])} instead in test\n{concrete_test}"
                 )
 
-            concrete_tests.append(concrete_test)
-            # Control run
-            control_run = {
-                v.name: v.cast(model[v.z3]) for v in self.scenario.variables.values() if v.name in run_columns
-            }
-            control_run["bin"] = index
-            runs.append(control_run)
-            # Treatment run
-            if rct:
-                treatment_run = control_run.copy()
-                treatment_run.update({k.name: v for k, v in concrete_test.treatment_input_configuration.items()})
-                treatment_run["bin"] = index
-                runs.append(treatment_run)
+            if not any([vars(t) == vars(concrete_test) for t in concrete_tests]):
+                concrete_tests.append(concrete_test)
+                # Control run
+                control_run = {
+                    v.name: v.cast(model[v.z3]) for v in self.scenario.variables.values() if v.name in run_columns
+                }
+                control_run["bin"] = index
+                runs.append(control_run)
+                # Treatment run
+                if rct:
+                    treatment_run = control_run.copy()
+                    treatment_run.update({concrete_test.treatment_variable.name: concrete_test.treatment_value})
+                    treatment_run["bin"] = index
+                    runs.append(treatment_run)
 
         return concrete_tests, pd.DataFrame(runs, columns=run_columns + ["bin"])
 
@@ -176,13 +195,16 @@ def generate_concrete_tests(
         runs = pd.DataFrame()
         ks_stats = []
 
+        pre_break = False
         for i in range(hard_max):
             concrete_tests_, runs_ = self._generate_concrete_tests(sample_size, rct, seed + i)
-            concrete_tests += concrete_tests_
+            for t_ in concrete_tests_:
+                if not any([vars(t_) == vars(t) for t in concrete_tests]):
+                    concrete_tests.append(t_)
             runs = pd.concat([runs, runs_])
             assert concrete_tests_ not in concrete_tests, "Duplicate entries unlikely unless something went wrong"
 
-            control_configs = pd.DataFrame([test.control_input_configuration for test in concrete_tests])
+            control_configs = pd.DataFrame([{test.treatment_variable: test.control_value} for test in concrete_tests])
             ks_stats = {
                 var: stats.kstest(control_configs[var], var.distribution.cdf).statistic
                 for var in control_configs.columns
@@ -205,14 +227,32 @@ def generate_concrete_tests(
                         for var in effect_modifier_configs.columns
                     }
                 )
-            if target_ks_score and all((stat <= target_ks_score for stat in ks_stats.values())):
+            control_values = [test.control_value for test in concrete_tests]
+            treatment_values = [test.treatment_value for test in concrete_tests]
+
+            if self.treatment_variable.datatype is bool and set([(True, False), (False, True)]).issubset(
+                set(zip(control_values, treatment_values))
+            ):
+                pre_break = True
+                break
+            if issubclass(self.treatment_variable.datatype, Enum) and set(
+                {
+                    (x, y)
+                    for x, y in itertools.product(self.treatment_variable.datatype, self.treatment_variable.datatype)
+                    if x != y
+                }
+            ).issubset(zip(control_values, treatment_values)):
+                pre_break = True
+                break
+            elif target_ks_score and all((stat <= target_ks_score for stat in ks_stats.values())):
+                pre_break = True
                 break
 
-        if target_ks_score is not None and not all((stat <= target_ks_score for stat in ks_stats.values())):
+        if target_ks_score is not None and not pre_break:
             logger.error(
-                "Hard max of %s reached but could not achieve target ks_score of %s. Got %s.",
-                hard_max,
+                "Hard max reached but could not achieve target ks_score of %s. Got %s. Generated %s distinct tests",
                 target_ks_score,
                 ks_stats,
+                len(concrete_tests),
             )
         return concrete_tests, runs
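
A minimal usage sketch of the revised single-treatment API. The `Input`/`Output` variable subclasses, the `Positive` outcome class, the `setup_treatment_variables` helper, and the exact `generate_concrete_tests` keyword names are assumptions drawn from the wider framework, not confirmed by this change.

# Minimal sketch. Assumed names: Input/Output (Variable subclasses), Positive
# (a CausalTestOutcome), Scenario.setup_treatment_variables(), and the
# generate_concrete_tests keyword names.
import scipy.stats

from causal_testing.specification.scenario import Scenario
from causal_testing.specification.variable import Input, Output
from causal_testing.testing.causal_test_outcome import Positive

# One treatment variable (now singular) with a sampling distribution, one outcome.
dose = Input("dose", float, scipy.stats.uniform(0, 10))
response = Output("response", float)

scenario = Scenario(variables={dose, response})
scenario.setup_treatment_variables()  # assumed helper that creates the primed treatment copies

abstract_test = AbstractCausalTestCase(
    scenario=scenario,
    intervention_constraints={scenario.treatment_variables["dose"].z3 > dose.z3},
    treatment_variable=dose,  # previously treatment_variables={dose}
    expected_causal_effect={response: Positive()},  # exactly one outcome is enforced
    effect="total",  # new kwarg, forwarded to BaseTestCase
)

# Each concrete test now carries scalar control_value/treatment_value fields
# rather than the removed *_input_configuration dicts.
concrete_tests, runs = abstract_test.generate_concrete_tests(
    sample_size=5, target_ks_score=0.05, seed=0, hard_max=1000
)
for test in concrete_tests:
    print(test.treatment_variable.name, test.control_value, "->", test.treatment_value)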