Skip to content

Commit 4b4aae8

Browse files
authored
Merge branch 'main' into fix-linalg-error
2 parents fb5129c + 76557f2 commit 4b4aae8

File tree

7 files changed

+204
-36
lines changed

7 files changed

+204
-36
lines changed

causal_testing/json_front/json_class.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,9 @@ def _execute_test_case(
270270
failed = False
271271

272272
estimation_model = self._setup_test(causal_test_case=causal_test_case, test=test)
273+
if "formula" in test:
274+
if not estimation_model.validate_formula(self.causal_specification.causal_dag):
275+
raise ValueError("Formula covariates do not satisfy the constructive back-door criterion.")
273276
causal_test_result = causal_test_case.execute_test(
274277
estimator=estimation_model, data_collector=self.data_collector
275278
)
@@ -331,6 +334,7 @@ def _setup_test(self, causal_test_case: CausalTestCase, test: Mapping) -> Estima
331334
estimator_kwargs["alpha"] = test["alpha"] if "alpha" in test else 0.05
332335

333336
estimation_model = test["estimator"](**estimator_kwargs)
337+
334338
return estimation_model
335339

336340
def _append_to_file(self, line: str, log_level: int = None):

causal_testing/testing/estimators.py

Lines changed: 101 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,14 @@
1010
import statsmodels.api as sm
1111
import statsmodels.formula.api as smf
1212
from econml.dml import CausalForestDML
13-
from patsy import dmatrix
13+
from patsy import dmatrix, ModelDesc
1414

1515
from sklearn.ensemble import GradientBoostingRegressor
1616
from statsmodels.regression.linear_model import RegressionResultsWrapper
1717
from statsmodels.tools.sm_exceptions import PerfectSeparationError
1818

1919
from causal_testing.specification.variable import Variable
20+
from causal_testing.specification.causal_dag import CausalDAG
2021

2122
logger = logging.getLogger(__name__)
2223

@@ -83,10 +84,10 @@ def compute_confidence_intervals(self) -> list[float, float]:
8384
"""
8485

8586

86-
class LogisticRegressionEstimator(Estimator):
87-
"""A Logistic Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
88-
combination of parameters and functions of the variables (note these functions need not be linear). It is designed
89-
for estimating categorical outcomes.
87+
class RegressionEstimator(Estimator):
88+
"""An abstract class extending the Estimator functionality to add support for formulae, which are used in
89+
regression based estimators.
90+
9091
"""
9192

9293
def __init__(
@@ -100,16 +101,97 @@ def __init__(
100101
df: pd.DataFrame = None,
101102
effect_modifiers: dict[str:Any] = None,
102103
formula: str = None,
104+
alpha: float = 0.05,
103105
):
104-
super().__init__(treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers)
106+
super().__init__(
107+
treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, alpha=alpha
108+
)
105109

106-
self.model = None
110+
if effect_modifiers is None:
111+
effect_modifiers = []
107112

108113
if formula is not None:
109114
self.formula = formula
115+
116+
else:
117+
terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
118+
self.formula = f"{outcome} ~ {'+'.join(terms)}"
119+
120+
@abstractmethod
121+
def add_modelling_assumptions(self):
122+
"""
123+
Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
124+
must hold if the resulting causal inference is to be considered valid.
125+
"""
126+
127+
def get_terms_from_formula(self) -> tuple[str, str, list[str]]:
128+
"""
129+
Parse all the terms from a Patsy formula string into outcome, treatment and covariate variables.
130+
131+
Formulae are expected to only have a single left hand side term.
132+
133+
:return: a tuple containing the outcome, treatment and covariate variable names in string format
134+
"""
135+
desc = ModelDesc.from_formula(self.formula)
136+
if len(desc.lhs_termlist) > 1:
137+
raise ValueError("More than 1 left hand side term provided in formula, only single term is accepted")
138+
outcome = desc.lhs_termlist[0].factors[0].code
139+
rhs_terms = set()
140+
for term in desc.rhs_termlist:
141+
if term.factors:
142+
rhs_terms.add(term.factors[0].code)
143+
if self.treatment not in rhs_terms:
144+
raise ValueError(f"Treatment variable '{self.treatment}' not found in formula")
145+
rhs_terms.remove(self.treatment)
146+
covariates = rhs_terms
147+
if covariates is None:
148+
covariates = []
110149
else:
111-
terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(self.effect_modifiers))
112-
self.formula = f"{outcome} ~ {'+'.join(((terms)))}"
150+
covariates = list(covariates)
151+
return outcome, self.treatment, covariates
152+
153+
def validate_formula(self, causal_dag: CausalDAG):
154+
"""
155+
Validate the provided Patsy formula string using the constructive backdoor criterion method found in the
156+
CausalDAG class
157+
158+
:param causal_dag: The CausalDAG object for the current test scenario
159+
:return: True if the formula satisfies the criterion and False if the formula violates the
160+
criterion
161+
"""
162+
outcome, treatment, covariates = self.get_terms_from_formula()
163+
proper_backdoor_graph = causal_dag.get_proper_backdoor_graph(treatments=[treatment], outcomes=[outcome])
164+
return causal_dag.constructive_backdoor_criterion(
165+
proper_backdoor_graph=proper_backdoor_graph,
166+
treatments=[treatment],
167+
outcomes=[outcome],
168+
covariates=list(covariates),
169+
)
170+
171+
172+
class LogisticRegressionEstimator(RegressionEstimator):
173+
"""A Logistic Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
174+
combination of parameters and functions of the variables (note these functions need not be linear). It is designed
175+
for estimating categorical outcomes.
176+
"""
177+
178+
def __init__(
179+
# pylint: disable=too-many-arguments
180+
self,
181+
treatment: str,
182+
treatment_value: float,
183+
control_value: float,
184+
adjustment_set: set,
185+
outcome: str,
186+
df: pd.DataFrame = None,
187+
effect_modifiers: dict[str:Any] = None,
188+
formula: str = None,
189+
):
190+
super().__init__(
191+
treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, formula
192+
)
193+
194+
self.model = None
113195

114196
def add_modelling_assumptions(self):
115197
"""
@@ -274,7 +356,7 @@ def estimate_unit_odds_ratio(self) -> float:
274356
return np.exp(model.params[self.treatment])
275357

276358

277-
class LinearRegressionEstimator(Estimator):
359+
class LinearRegressionEstimator(RegressionEstimator):
278360
"""A Linear Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
279361
combination of parameters and functions of the variables (note these functions need not be linear).
280362
"""
@@ -293,18 +375,18 @@ def __init__(
293375
alpha: float = 0.05,
294376
):
295377
super().__init__(
296-
treatment, treatment_value, control_value, adjustment_set, outcome, df, effect_modifiers, alpha=alpha
378+
treatment,
379+
treatment_value,
380+
control_value,
381+
adjustment_set,
382+
outcome,
383+
df,
384+
effect_modifiers,
385+
alpha=alpha,
386+
formula=formula,
297387
)
298388

299389
self.model = None
300-
if effect_modifiers is None:
301-
effect_modifiers = []
302-
303-
if formula is not None:
304-
self.formula = formula
305-
else:
306-
terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
307-
self.formula = f"{outcome} ~ {'+'.join(terms)}"
308390

309391
for term in self.effect_modifiers:
310392
self.adjustment_set.add(term)

docs/source/usage.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ the given output and input and the desired effect. This information is the minim
5555
base_test_case = BaseTestCase(
5656
treatment_variable = x, # Set the treatment (input) variable to x
5757
outcome_variable = y, # set the outcome (output) variable to y
58-
effect = Effect.direct.value) # effect type, current accepted types are direct and total
58+
effect = Effect.DIRECT.value) # effect type, currently accepted types are direct and total
5959
6060
causal_test_case = CausalTestCase(
6161
base_test_case = base_test_case,

tests/json_front_tests/test_json_class.py

Lines changed: 53 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -272,19 +272,6 @@ def test_concrete_generate_params(self):
272272
self.assertIn("failed", temp_out[-1])
273273

274274
def test_no_data_provided(self):
275-
example_test = {
276-
"tests": [
277-
{
278-
"name": "test1",
279-
"mutations": {"test_input": "Increase"},
280-
"estimator": "LinearRegressionEstimator",
281-
"estimate_type": "ate",
282-
"effect_modifiers": [],
283-
"expected_effect": {"test_output": "NoEffect"},
284-
"skip": False,
285-
}
286-
]
287-
}
288275
json_class = JsonUtility("temp_out.txt", True)
289276
json_class.set_paths(self.json_path, self.dag_path)
290277

@@ -316,6 +303,59 @@ def test_estimator_formula_type_check(self):
316303
with self.assertRaises(TypeError):
317304
self.json_class.run_json_tests(effects=effects, mutates=mutates, estimators=estimators, f_flag=False)
318305

306+
307+
def test_constructive_back_door_not_met(self):
308+
example_test = {
309+
"tests": [
310+
{
311+
"name": "test1",
312+
"mutations": {"X": "Increase"},
313+
"estimator": "LinearRegressionEstimator",
314+
"estimate_type": "ate",
315+
"effect_modifiers": [],
316+
"expected_effect": {"Y": "NoEffect"},
317+
"skip": False,
318+
"formula": "Y ~ X",
319+
}
320+
]
321+
}
322+
inputs = [
323+
Input("X", int),
324+
Input("Z", int)
325+
]
326+
outputs = [
327+
Output("Y", int)
328+
]
329+
variables = inputs + outputs
330+
modelling_scenario = Scenario(variables)
331+
modelling_scenario.setup_treatment_variables()
332+
json_utility = JsonUtility("temp_out.txt", True)
333+
test_data_dir_path = Path("tests/resources/data")
334+
dag_path = str(test_data_dir_path / "dag_not_descendent.dot")
335+
data_path = [str(test_data_dir_path / "not_descendent.csv")]
336+
input_dict_list = [
337+
{"name": "X", "datatype": float},
338+
{"name": "Z", "datatype": float},
339+
]
340+
output_dict_list = [{"name": "Y", "datatype": float}]
341+
variables = CausalVariables(
342+
inputs=input_dict_list, outputs=output_dict_list, metas=None
343+
)
344+
345+
effects = {"NoEffect": NoEffect()}
346+
mutates = {
347+
"Increase": lambda x: json_utility.scenario.treatment_variables[x].z3
348+
> json_utility.scenario.variables[x].z3
349+
}
350+
estimators = {"LinearRegressionEstimator": LinearRegressionEstimator}
351+
352+
scenario = Scenario(variables=variables, constraints=None)
353+
json_utility.set_paths(self.json_path, dag_path, data_path)
354+
json_utility.setup(scenario)
355+
json_utility.test_plan = example_test
356+
with self.assertRaises(ValueError):
357+
json_utility.run_json_tests(effects=effects, mutates=mutates, estimators=estimators, f_flag=False)
358+
319359
def tearDown(self) -> None:
320360
remove_temp_dir_if_existent()
321361
if os.path.exists("temp_out.txt"):
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
digraph G {X -> Y; Z -> X; Z -> Y}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
X,Y,Z
2+
0,0,0

tests/testing_tests/test_estimators.py

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
CausalForestEstimator,
88
LogisticRegressionEstimator,
99
InstrumentalVariableEstimator,
10+
RegressionEstimator,
1011
)
1112
from causal_testing.specification.variable import Input
1213
from causal_testing.utils.validation import CausalValidator
@@ -124,15 +125,15 @@ def test_ate_adjustment(self):
124125
logistic_regression_estimator = LogisticRegressionEstimator(
125126
"length_in", 65, 55, {"large_gauge"}, "completed", df
126127
)
127-
ate, _ = logistic_regression_estimator.estimate_ate(adjustment_config = {"large_gauge": 0})
128+
ate, _ = logistic_regression_estimator.estimate_ate(adjustment_config={"large_gauge": 0})
128129
self.assertEqual(round(ate, 4), -0.3388)
129130

130131
def test_ate_invalid_adjustment(self):
131132
df = self.scarf_df.copy()
132133
logistic_regression_estimator = LogisticRegressionEstimator("length_in", 65, 55, {}, "completed", df)
133134
with self.assertRaises(ValueError):
134135
ate, _ = logistic_regression_estimator.estimate_ate(
135-
adjustment_config = {"large_gauge": 0}
136+
adjustment_config={"large_gauge": 0}
136137
)
137138

138139
def test_ate_effect_modifiers(self):
@@ -394,7 +395,7 @@ def test_program_15_no_interaction_ate_calculated(self):
394395
# for term_to_square in terms_to_square:
395396

396397
ate, [ci_low, ci_high] = linear_regression_estimator.estimate_ate_calculated(
397-
adjustment_config = {k: self.nhefs_df.mean()[k] for k in covariates}
398+
adjustment_config={k: self.nhefs_df.mean()[k] for k in covariates}
398399
)
399400
self.assertEqual(round(ate, 1), 3.5)
400401
self.assertEqual([round(ci_low, 1), round(ci_high, 1)], [1.9, 5])
@@ -491,3 +492,41 @@ def test_X1_effect(self):
491492
test_results = lr_model.estimate_ate()
492493
ate = test_results[0]
493494
self.assertAlmostEqual(ate, 2.0)
495+
496+
497+
class TestRegressionEstimator(unittest.TestCase):
498+
"""Test the extended functionality of the RegressionEstimator"""
499+
500+
@classmethod
501+
def setUpClass(cls):
502+
class RegressionEstimatorTesting(RegressionEstimator):
503+
def add_modelling_assumptions(self):
504+
pass
505+
506+
cls.regression_estimator = RegressionEstimatorTesting("X", 1, 0, {"Z"}, "Y", formula="Y ~ X + Z")
507+
508+
def test_get_formulae(self):
509+
outcome, treatment, covariates = self.regression_estimator.get_terms_from_formula()
510+
self.assertEqual(outcome, "Y")
511+
self.assertEqual(treatment, "X")
512+
self.assertEqual(covariates, ["Z"])
513+
514+
def test_multiple_lhs_terms(self):
515+
regression_estimator = self.regression_estimator
516+
regression_estimator.formula = "Y + Z ~ X"
517+
with self.assertRaises(ValueError):
518+
self.regression_estimator.get_terms_from_formula()
519+
520+
def test_no_treatment_variable_in_formula(self):
521+
regression_estimator = self.regression_estimator
522+
regression_estimator.formula = "Y ~ A + Z"
523+
with self.assertRaises(ValueError):
524+
self.regression_estimator.get_terms_from_formula()
525+
526+
527+
def test_no_covariate_in_formula(self):
528+
regression_estimator = self.regression_estimator
529+
regression_estimator.formula = "Y ~ X"
530+
outcome, treatment, covariates = self.regression_estimator.get_terms_from_formula()
531+
self.assertEqual(outcome, "Y")
532+
self.assertEqual(treatment, "X")

0 commit comments

Comments
 (0)