Merge branch 'main' into fix-linalg-error

jmafoster1 · web-flow · commit 4b4aae8affa0 · 2023-09-19T13:44:37.000+01:00
diff --git a/causal_testing/json_front/json_class.py b/causal_testing/json_front/json_class.py
@@ -270,6 +270,9 @@ def _execute_test_case(
         failed = False
 
         estimation_model = self._setup_test(causal_test_case=causal_test_case, test=test)
+        if "formula" in test:
+            if not estimation_model.validate_formula(self.causal_specification.causal_dag):
+                raise ValueError("Formula covariates do not satisfy the constructive back-door criterion.")
         causal_test_result = causal_test_case.execute_test(
             estimator=estimation_model, data_collector=self.data_collector
         )
@@ -331,6 +334,7 @@ def _setup_test(self, causal_test_case: CausalTestCase, test: Mapping) -> Estima
         estimator_kwargs["alpha"] = test["alpha"] if "alpha" in test else 0.05
 
         estimation_model = test["estimator"](**estimator_kwargs)
+
         return estimation_model
 
     def _append_to_file(self, line: str, log_level: int = None):
diff --git a/causal_testing/testing/estimators.py b/causal_testing/testing/estimators.py
@@ -10,13 +10,14 @@
 import statsmodels.api as sm
 import statsmodels.formula.api as smf
 from econml.dml import CausalForestDML
-from patsy import dmatrix
+from patsy import dmatrix, ModelDesc
 
 from sklearn.ensemble import GradientBoostingRegressor
 from statsmodels.regression.linear_model import RegressionResultsWrapper
 from statsmodels.tools.sm_exceptions import PerfectSeparationError
 
 from causal_testing.specification.variable import Variable
+from causal_testing.specification.causal_dag import CausalDAG
 
 logger = logging.getLogger(__name__)
 
@@ -83,10 +84,10 @@ def compute_confidence_intervals(self) -> list[float, float]:
         """
 
 
-class LogisticRegressionEstimator(Estimator):
-    """A Logistic Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
-    combination of parameters and functions of the variables (note these functions need not be linear). It is designed
-    for estimating categorical outcomes.
+class RegressionEstimator(Estimator):
+    """An abstract class extending the Estimator functionality to add support for formulae, which are used in
+    regression based estimators.
+
     """
 
     def __init__(
@@ -100,16 +101,97 @@ def __init__(
         df: pd.DataFrame = None,
         effect_modifiers: dict[str:Any] = None,
         formula: str = None,
+        alpha: float = 0.05,
     ):
-        super().__init__(treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers)
+        super().__init__(
+            treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, alpha=alpha
+        )
 
-        self.model = None
+        if effect_modifiers is None:
+            effect_modifiers = []
 
         if formula is not None:
             self.formula = formula
+
+        else:
+            terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
+            self.formula = f"{outcome} ~ {'+'.join(terms)}"
+
+    @abstractmethod
+    def add_modelling_assumptions(self):
+        """
+        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
+        must hold if the resulting causal inference is to be considered valid.
+        """
+
+    def get_terms_from_formula(self) -> tuple[str, str, list[str]]:
+        """
+        Parse all the terms from a Patsy formula string into outcome, treatment and covariate variables.
+
+        Formulae are expected to only have a single left hand side term.
+
+        :return: a tuple containing the outcome, treatment and covariate variable names in string format
+        """
+        desc = ModelDesc.from_formula(self.formula)
+        if len(desc.lhs_termlist) > 1:
+            raise ValueError("More than 1 left hand side term provided in formula, only single term is accepted")
+        outcome = desc.lhs_termlist[0].factors[0].code
+        rhs_terms = set()
+        for term in desc.rhs_termlist:
+            if term.factors:
+                rhs_terms.add(term.factors[0].code)
+        if self.treatment not in rhs_terms:
+            raise ValueError(f"Treatment variable '{self.treatment}' not found in formula")
+        rhs_terms.remove(self.treatment)
+        covariates = rhs_terms
+        if covariates is None:
+            covariates = []
         else:
-            terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(self.effect_modifiers))
-            self.formula = f"{outcome} ~ {'+'.join(((terms)))}"
+            covariates = list(covariates)
+        return outcome, self.treatment, covariates
+
+    def validate_formula(self, causal_dag: CausalDAG):
+        """
+        Validate the provided Patsy formula string using the constructive backdoor criterion method found in the
+        CausalDAG class
+
+        :param causal_dag: A CausalDAG object for the current test scenario
+        :return: True for a formula that does not violate the criterion and False if the formula does violate the
+        criterion
+        """
+        outcome, treatment, covariates = self.get_terms_from_formula()
+        proper_backdoor_graph = causal_dag.get_proper_backdoor_graph(treatments=[treatment], outcomes=[outcome])
+        return causal_dag.constructive_backdoor_criterion(
+            proper_backdoor_graph=proper_backdoor_graph,
+            treatments=[treatment],
+            outcomes=[outcome],
+            covariates=list(covariates),
+        )
+
+
+class LogisticRegressionEstimator(RegressionEstimator):
+    """A Logistic Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
+    combination of parameters and functions of the variables (note these functions need not be linear). It is designed
+    for estimating categorical outcomes.
+    """
+
+    def __init__(
+        # pylint: disable=too-many-arguments
+        self,
+        treatment: str,
+        treatment_value: float,
+        control_value: float,
+        adjustment_set: set,
+        outcome: str,
+        df: pd.DataFrame = None,
+        effect_modifiers: dict[str:Any] = None,
+        formula: str = None,
+    ):
+        super().__init__(
+            treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, formula
+        )
+
+        self.model = None
 
     def add_modelling_assumptions(self):
         """
@@ -274,7 +356,7 @@ def estimate_unit_odds_ratio(self) -> float:
         return np.exp(model.params[self.treatment])
 
 
-class LinearRegressionEstimator(Estimator):
+class LinearRegressionEstimator(RegressionEstimator):
     """A Linear Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
     combination of parameters and functions of the variables (note these functions need not be linear).
     """
@@ -293,18 +375,18 @@ def __init__(
         alpha: float = 0.05,
     ):
         super().__init__(
-            treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, alpha=alpha
+            treatment,
+            treatment_value,
+            control_value,
+            adjustment_set,
+            outcome,
+            df,
+            effect_modifiers,
+            alpha=alpha,
+            formula=formula,
         )
 
         self.model = None
-        if effect_modifiers is None:
-            effect_modifiers = []
-
-        if formula is not None:
-            self.formula = formula
-        else:
-            terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
-            self.formula = f"{outcome} ~ {'+'.join(terms)}"
 
         for term in self.effect_modifiers:
             self.adjustment_set.add(term)
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
@@ -55,7 +55,7 @@ the given output and input and the desired effect. This information is the minim
    base_test_case = BaseTestCase(
       treatment_variable = x, # Set the treatment (input) variable to x
       outcome_variable = y, # set the outcome (output) variable to y
-      effect = Effect.direct.value) # effect type, current accepted types are direct and total
+      effect = Effect.DIRECT.value) # effect type, current accepted types are direct and total
 
    causal_test_case = CausalTestCase(
       base_test_case = base_test_case,
diff --git a/tests/json_front_tests/test_json_class.py b/tests/json_front_tests/test_json_class.py
@@ -272,19 +272,6 @@ def test_concrete_generate_params(self):
         self.assertIn("failed", temp_out[-1])
 
     def test_no_data_provided(self):
-        example_test = {
-            "tests": [
-                {
-                    "name": "test1",
-                    "mutations": {"test_input": "Increase"},
-                    "estimator": "LinearRegressionEstimator",
-                    "estimate_type": "ate",
-                    "effect_modifiers": [],
-                    "expected_effect": {"test_output": "NoEffect"},
-                    "skip": False,
-                }
-            ]
-        }
         json_class = JsonUtility("temp_out.txt", True)
         json_class.set_paths(self.json_path, self.dag_path)
 
@@ -316,6 +303,59 @@ def test_estimator_formula_type_check(self):
         with self.assertRaises(TypeError):
             self.json_class.run_json_tests(effects=effects, mutates=mutates, estimators=estimators, f_flag=False)
 
+
+    def test_constructive_back_door_not_met(self):
+        example_test = {
+            "tests": [
+                {
+                    "name": "test1",
+                    "mutations": {"X": "Increase"},
+                    "estimator": "LinearRegressionEstimator",
+                    "estimate_type": "ate",
+                    "effect_modifiers": [],
+                    "expected_effect": {"Y": "NoEffect"},
+                    "skip": False,
+                    "formula": "Y ~ X",
+                }
+            ]
+        }
+        inputs = [
+            Input("X", int),
+            Input("Z", int)
+        ]
+        outputs = [
+            Output("Y", int)
+        ]
+        variables = inputs + outputs
+        modelling_scenario = Scenario(variables)
+        modelling_scenario.setup_treatment_variables()
+        json_utility = JsonUtility("temp_out.txt", True)
+        test_data_dir_path = Path("tests/resources/data")
+        dag_path = str(test_data_dir_path / "dag_not_descendent.dot")
+        data_path = [str(test_data_dir_path / "not_descendent.csv")]
+        input_dict_list = [
+            {"name": "X", "datatype": float},
+            {"name": "Z", "datatype": float},
+        ]
+        output_dict_list = [{"name": "Y", "datatype": float}]
+        variables = CausalVariables(
+            inputs=input_dict_list, outputs=output_dict_list, metas=None
+        )
+
+        effects = {"NoEffect": NoEffect()}
+        mutates = {
+            "Increase": lambda x: json_utility.scenario.treatment_variables[x].z3
+                                  > json_utility.scenario.variables[x].z3
+        }
+        estimators = {"LinearRegressionEstimator": LinearRegressionEstimator}
+
+        scenario = Scenario(variables=variables, constraints=None)
+        json_utility.set_paths(self.json_path, dag_path, data_path)
+        json_utility.setup(scenario)
+        json_utility.test_plan = example_test
+        with self.assertRaises(ValueError):
+            json_utility.run_json_tests(effects=effects, mutates=mutates, estimators=estimators, f_flag=False)
+
     def tearDown(self) -> None:
         remove_temp_dir_if_existent()
         if os.path.exists("temp_out.txt"):
diff --git a/tests/resources/data/dag_not_descendent.dot b/tests/resources/data/dag_not_descendent.dot
@@ -0,0 +1 @@
+digraph G {X -> Y; Z -> X; Z -> Y}
diff --git a/tests/resources/data/not_descendent.csv b/tests/resources/data/not_descendent.csv
@@ -0,0 +1,2 @@
+X,Y,Z
+0,0,0
diff --git a/tests/testing_tests/test_estimators.py b/tests/testing_tests/test_estimators.py
@@ -7,6 +7,7 @@
     CausalForestEstimator,
     LogisticRegressionEstimator,
     InstrumentalVariableEstimator,
+    RegressionEstimator,
 )
 from causal_testing.specification.variable import Input
 from causal_testing.utils.validation import CausalValidator
@@ -124,15 +125,15 @@ def test_ate_adjustment(self):
         logistic_regression_estimator = LogisticRegressionEstimator(
             "length_in", 65, 55, {"large_gauge"}, "completed", df
         )
-        ate, _ = logistic_regression_estimator.estimate_ate(adjustment_config = {"large_gauge": 0})
+        ate, _ = logistic_regression_estimator.estimate_ate(adjustment_config={"large_gauge": 0})
         self.assertEqual(round(ate, 4), -0.3388)
 
     def test_ate_invalid_adjustment(self):
         df = self.scarf_df.copy()
         logistic_regression_estimator = LogisticRegressionEstimator("length_in", 65, 55, {}, "completed", df)
         with self.assertRaises(ValueError):
             ate, _ = logistic_regression_estimator.estimate_ate(
-                adjustment_config = {"large_gauge": 0}
+                adjustment_config={"large_gauge": 0}
             )
 
     def test_ate_effect_modifiers(self):
@@ -394,7 +395,7 @@ def test_program_15_no_interaction_ate_calculated(self):
         # for term_to_square in terms_to_square:
 
         ate, [ci_low, ci_high] = linear_regression_estimator.estimate_ate_calculated(
-            adjustment_config = {k: self.nhefs_df.mean()[k] for k in covariates}
+            adjustment_config={k: self.nhefs_df.mean()[k] for k in covariates}
         )
         self.assertEqual(round(ate, 1), 3.5)
         self.assertEqual([round(ci_low, 1), round(ci_high, 1)], [1.9, 5])
@@ -491,3 +492,41 @@ def test_X1_effect(self):
         test_results = lr_model.estimate_ate()
         ate = test_results[0]
         self.assertAlmostEqual(ate, 2.0)
+
+
+class TestRegressionEstimator(unittest.TestCase):
+    """Test the extended functionality of the RegressionEstimator"""
+
+    @classmethod
+    def setUpClass(cls):
+        class RegressionEstimatorTesting(RegressionEstimator):
+            def add_modelling_assumptions(self):
+                pass
+
+        cls.regression_estimator = RegressionEstimatorTesting("X", 1, 0, {"Z"}, "Y", formula="Y ~ X + Z")
+
+    def test_get_formulae(self):
+        outcome, treatment, covariates = self.regression_estimator.get_terms_from_formula()
+        self.assertEqual(outcome, "Y")
+        self.assertEqual(treatment, "X")
+        self.assertEqual(covariates, ["Z"])
+
+    def test_multiple_lhs_terms(self):
+        regression_estimator = self.regression_estimator
+        regression_estimator.formula = "Y + Z ~ X"
+        with self.assertRaises(ValueError):
+            self.regression_estimator.get_terms_from_formula()
+
+    def test_no_treatment_variable_in_formula(self):
+        regression_estimator = self.regression_estimator
+        regression_estimator.formula = "Y ~ A + Z"
+        with self.assertRaises(ValueError):
+            self.regression_estimator.get_terms_from_formula()
+
+
+    def test_no_covariate_in_formula(self):
+        regression_estimator = self.regression_estimator
+        regression_estimator.formula = "Y ~ X"
+        outcome, treatment, covariates = self.regression_estimator.get_terms_from_formula()
+        self.assertEqual(outcome, "Y")
+        self.assertEqual(treatment, "X")