Merge branch 'main' into fix-linalg-error

jmafoster1 · web-flow · commit 6ebbdc80a7c7 · 2023-09-19T15:55:07.000+01:00
diff --git a/causal_testing/json_front/json_class.py b/causal_testing/json_front/json_class.py
@@ -270,9 +270,6 @@ def _execute_test_case(
         failed = False
 
         estimation_model = self._setup_test(causal_test_case=causal_test_case, test=test)
-        if "formula" in test:
-            if not estimation_model.validate_formula(self.causal_specification.causal_dag):
-                raise ValueError("Formula covariates do not satisfy the constructive back-door criterion.")
         causal_test_result = causal_test_case.execute_test(
             estimator=estimation_model, data_collector=self.data_collector
         )
@@ -334,7 +331,6 @@ def _setup_test(self, causal_test_case: CausalTestCase, test: Mapping) -> Estima
         estimator_kwargs["alpha"] = test["alpha"] if "alpha" in test else 0.05
 
         estimation_model = test["estimator"](**estimator_kwargs)
-
         return estimation_model
 
     def _append_to_file(self, line: str, log_level: int = None):
diff --git a/causal_testing/testing/estimators.py b/causal_testing/testing/estimators.py
@@ -10,14 +10,13 @@
 import statsmodels.api as sm
 import statsmodels.formula.api as smf
 from econml.dml import CausalForestDML
-from patsy import dmatrix, ModelDesc
+from patsy import dmatrix
 
 from sklearn.ensemble import GradientBoostingRegressor
 from statsmodels.regression.linear_model import RegressionResultsWrapper
 from statsmodels.tools.sm_exceptions import PerfectSeparationError
 
 from causal_testing.specification.variable import Variable
-from causal_testing.specification.causal_dag import CausalDAG
 
 logger = logging.getLogger(__name__)
 
@@ -84,92 +83,7 @@ def compute_confidence_intervals(self) -> list[float, float]:
         """
 
 
-class RegressionEstimator(Estimator):
-    """An abstract class extending the Estimator functionality to add support for formulae, which are used in
-    regression based estimators.
-
-    """
-
-    def __init__(
-        # pylint: disable=too-many-arguments
-        self,
-        treatment: str,
-        treatment_value: float,
-        control_value: float,
-        adjustment_set: set,
-        outcome: str,
-        df: pd.DataFrame = None,
-        effect_modifiers: dict[str:Any] = None,
-        formula: str = None,
-        alpha: float = 0.05,
-    ):
-        super().__init__(
-            treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, alpha=alpha
-        )
-
-        if effect_modifiers is None:
-            effect_modifiers = []
-
-        if formula is not None:
-            self.formula = formula
-
-        else:
-            terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
-            self.formula = f"{outcome} ~ {'+'.join(terms)}"
-
-    @abstractmethod
-    def add_modelling_assumptions(self):
-        """
-        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
-        must hold if the resulting causal inference is to be considered valid.
-        """
-
-    def get_terms_from_formula(self) -> tuple[str, str, list[str]]:
-        """
-        Parse all the terms from a Patsy formula string into outcome, treatment and covariate variables.
-
-        Formulae are expected to only have a single left hand side term.
-
-        :return: a truple containing the outcome, treatment and covariate variable names in string format
-        """
-        desc = ModelDesc.from_formula(self.formula)
-        if len(desc.lhs_termlist) > 1:
-            raise ValueError("More than 1 left hand side term provided in formula, only single term is accepted")
-        outcome = desc.lhs_termlist[0].factors[0].code
-        rhs_terms = set()
-        for term in desc.rhs_termlist:
-            if term.factors:
-                rhs_terms.add(term.factors[0].code)
-        if self.treatment not in rhs_terms:
-            raise ValueError(f"Treatment variable '{self.treatment}' not found in formula")
-        rhs_terms.remove(self.treatment)
-        covariates = rhs_terms
-        if covariates is None:
-            covariates = []
-        else:
-            covariates = list(covariates)
-        return outcome, self.treatment, covariates
-
-    def validate_formula(self, causal_dag: CausalDAG):
-        """
-        Validate the provided Patsy formula string using the constructive backdoor criterion method found in the
-        CausalDAG class
-
-        :param causal_dag: A CausalDAG object containing for the current test scenario
-        :return: True for a formula that does not violate the criteria and False if the formula does violate the
-        criteria
-        """
-        outcome, treatment, covariates = self.get_terms_from_formula()
-        proper_backdoor_graph = causal_dag.get_proper_backdoor_graph(treatments=[treatment], outcomes=[outcome])
-        return causal_dag.constructive_backdoor_criterion(
-            proper_backdoor_graph=proper_backdoor_graph,
-            treatments=[treatment],
-            outcomes=[outcome],
-            covariates=list(covariates),
-        )
-
-
-class LogisticRegressionEstimator(RegressionEstimator):
+class LogisticRegressionEstimator(Estimator):
     """A Logistic Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
     combination of parameters and functions of the variables (note these functions need not be linear). It is designed
     for estimating categorical outcomes.
@@ -187,12 +101,16 @@ def __init__(
         effect_modifiers: dict[str:Any] = None,
         formula: str = None,
     ):
-        super().__init__(
-            treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, formula
-        )
+        super().__init__(treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers)
 
         self.model = None
 
+        if formula is not None:
+            self.formula = formula
+        else:
+            terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(self.effect_modifiers))
+            self.formula = f"{outcome} ~ {'+'.join(((terms)))}"
+
     def add_modelling_assumptions(self):
         """
         Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
@@ -356,7 +274,7 @@ def estimate_unit_odds_ratio(self) -> float:
         return np.exp(model.params[self.treatment])
 
 
-class LinearRegressionEstimator(RegressionEstimator):
+class LinearRegressionEstimator(Estimator):
     """A Linear Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
     combination of parameters and functions of the variables (note these functions need not be linear).
     """
@@ -375,18 +293,18 @@ def __init__(
         alpha: float = 0.05,
     ):
         super().__init__(
-            treatment,
-            treatment_value,
-            control_value,
-            adjustment_set,
-            outcome,
-            df,
-            effect_modifiers,
-            alpha=alpha,
-            formula=formula,
+            treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, alpha=alpha
         )
 
         self.model = None
+        if effect_modifiers is None:
+            effect_modifiers = []
+
+        if formula is not None:
+            self.formula = formula
+        else:
+            terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
+            self.formula = f"{outcome} ~ {'+'.join(terms)}"
 
         for term in self.effect_modifiers:
             self.adjustment_set.add(term)
diff --git a/tests/json_front_tests/test_json_class.py b/tests/json_front_tests/test_json_class.py
@@ -272,6 +272,19 @@ def test_concrete_generate_params(self):
         self.assertIn("failed", temp_out[-1])
 
     def test_no_data_provided(self):
+        example_test = {
+            "tests": [
+                {
+                    "name": "test1",
+                    "mutations": {"test_input": "Increase"},
+                    "estimator": "LinearRegressionEstimator",
+                    "estimate_type": "ate",
+                    "effect_modifiers": [],
+                    "expected_effect": {"test_output": "NoEffect"},
+                    "skip": False,
+                }
+            ]
+        }
         json_class = JsonUtility("temp_out.txt", True)
         json_class.set_paths(self.json_path, self.dag_path)
 
@@ -303,59 +316,6 @@ def test_estimator_formula_type_check(self):
         with self.assertRaises(TypeError):
             self.json_class.run_json_tests(effects=effects, mutates=mutates, estimators=estimators, f_flag=False)
 
-
-    def test_constructive_back_door_not_met(self):
-        example_test = {
-            "tests": [
-                {
-                    "name": "test1",
-                    "mutations": {"X": "Increase"},
-                    "estimator": "LinearRegressionEstimator",
-                    "estimate_type": "ate",
-                    "effect_modifiers": [],
-                    "expected_effect": {"Y": "NoEffect"},
-                    "skip": False,
-                    "formula": "Y ~ X",
-                }
-            ]
-        }
-        inputs = [
-            Input("X", int),
-            Input("Z", int)
-        ]
-        outputs = [
-            Output("Y", int)
-        ]
-        variables = inputs + outputs
-        modelling_scenario = Scenario(variables)
-        modelling_scenario.setup_treatment_variables()
-        json_utility = JsonUtility("temp_out.txt", True)
-        test_data_dir_path = Path("tests/resources/data")
-        dag_path = str(test_data_dir_path / "dag_not_descendent.dot")
-        data_path = [str(test_data_dir_path / "not_descendent.csv")]
-        input_dict_list = [
-            {"name": "X", "datatype": float},
-            {"name": "Z", "datatype": float},
-        ]
-        output_dict_list = [{"name": "Y", "datatype": float}]
-        variables = CausalVariables(
-            inputs=input_dict_list, outputs=output_dict_list, metas=None
-        )
-
-        effects = {"NoEffect": NoEffect()}
-        mutates = {
-            "Increase": lambda x: json_utility.scenario.treatment_variables[x].z3
-                                  > json_utility.scenario.variables[x].z3
-        }
-        estimators = {"LinearRegressionEstimator": LinearRegressionEstimator}
-
-        scenario = Scenario(variables=variables, constraints=None)
-        json_utility.set_paths(self.json_path, dag_path, data_path)
-        json_utility.setup(scenario)
-        json_utility.test_plan = example_test
-        with self.assertRaises(ValueError):
-            json_utility.run_json_tests(effects=effects, mutates=mutates, estimators=estimators, f_flag=False)
-
     def tearDown(self) -> None:
         remove_temp_dir_if_existent()
         if os.path.exists("temp_out.txt"):
diff --git a/tests/resources/data/dag_not_descendent.dot b/tests/resources/data/dag_not_descendent.dot
diff --git a/tests/resources/data/not_descendent.csv b/tests/resources/data/not_descendent.csv
diff --git a/tests/testing_tests/test_estimators.py b/tests/testing_tests/test_estimators.py
@@ -7,7 +7,6 @@
     CausalForestEstimator,
     LogisticRegressionEstimator,
     InstrumentalVariableEstimator,
-    RegressionEstimator,
 )
 from causal_testing.specification.variable import Input
 from causal_testing.utils.validation import CausalValidator
@@ -125,15 +124,15 @@ def test_ate_adjustment(self):
         logistic_regression_estimator = LogisticRegressionEstimator(
             "length_in", 65, 55, {"large_gauge"}, "completed", df
         )
-        ate, _ = logistic_regression_estimator.estimate_ate(adjustment_config={"large_gauge": 0})
+        ate, _ = logistic_regression_estimator.estimate_ate(adjustment_config = {"large_gauge": 0})
         self.assertEqual(round(ate, 4), -0.3388)
 
     def test_ate_invalid_adjustment(self):
         df = self.scarf_df.copy()
         logistic_regression_estimator = LogisticRegressionEstimator("length_in", 65, 55, {}, "completed", df)
         with self.assertRaises(ValueError):
             ate, _ = logistic_regression_estimator.estimate_ate(
-                adjustment_config={"large_gauge": 0}
+                adjustment_config = {"large_gauge": 0}
             )
 
     def test_ate_effect_modifiers(self):
@@ -395,7 +394,7 @@ def test_program_15_no_interaction_ate_calculated(self):
         # for term_to_square in terms_to_square:
 
         ate, [ci_low, ci_high] = linear_regression_estimator.estimate_ate_calculated(
-            adjustment_config={k: self.nhefs_df.mean()[k] for k in covariates}
+            adjustment_config = {k: self.nhefs_df.mean()[k] for k in covariates}
         )
         self.assertEqual(round(ate, 1), 3.5)
         self.assertEqual([round(ci_low, 1), round(ci_high, 1)], [1.9, 5])
@@ -492,41 +491,3 @@ def test_X1_effect(self):
         test_results = lr_model.estimate_ate()
         ate = test_results[0]
         self.assertAlmostEqual(ate, 2.0)
-
-
-class TestRegressionEstimator(unittest.TestCase):
-    """Test the extended functionality of the TestRegressionEstimator"""
-
-    @classmethod
-    def setUpClass(cls):
-        class RegressionEstimatorTesting(RegressionEstimator):
-            def add_modelling_assumptions(self):
-                pass
-
-        cls.regression_estimator = RegressionEstimatorTesting("X", 1, 0, {"Z"}, "Y", formula="Y ~ X + Z")
-
-    def test_get_formulae(self):
-        outcome, treatment, covariates = self.regression_estimator.get_terms_from_formula()
-        self.assertEqual(outcome, "Y")
-        self.assertEqual(treatment, "X")
-        self.assertEqual(covariates, ["Z"])
-
-    def test_multiple_lhs_terms(self):
-        regression_estimator = self.regression_estimator
-        regression_estimator.formula = "Y + Z ~ X"
-        with self.assertRaises(ValueError):
-            self.regression_estimator.get_terms_from_formula()
-
-    def test_no_treatment_variable_in_formula(self):
-        regression_estimator = self.regression_estimator
-        regression_estimator.formula = "Y ~ A + Z"
-        with self.assertRaises(ValueError):
-            self.regression_estimator.get_terms_from_formula()
-
-
-    def test_no_covariate_in_formula(self):
-        regression_estimator = self.regression_estimator
-        regression_estimator.formula = "Y ~ X"
-        outcome, treatment, covariates = self.regression_estimator.get_terms_from_formula()
-        self.assertEqual(outcome, "Y")
-        self.assertEqual(treatment, "X")