
Commit a809a00

Nearly passing
1 parent a316203 commit a809a00

2 files changed: +87 additions, -78 deletions

causal_testing/testing/estimators.py

Lines changed: 54 additions & 55 deletions
@@ -9,6 +9,7 @@
 import statsmodels.api as sm
 import statsmodels.formula.api as smf
 from econml.dml import CausalForestDML
+from patsy import dmatrix

 from sklearn.ensemble import GradientBoostingRegressor
 from statsmodels.regression.linear_model import RegressionResultsWrapper
@@ -314,7 +315,7 @@ def __init__(
             self.formula = formula
         else:
             terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
-            self.formula = f"{outcome} ~ {'+'.join(((terms)))} + Intercept"
+            self.formula = f"{outcome} ~ {'+'.join(((terms)))}"

         for term in self.effect_modifiers:
             self.adjustment_set.add(term)
@@ -330,53 +331,53 @@ def add_modelling_assumptions(self):
             "do not need to be linear."
         )

-    def add_squared_term_to_df(self, term_to_square: str):
-        """Add a squared term to the linear regression model and df.
-
-        This enables the user to capture curvilinear relationships with a linear regression model, not just straight
-        lines, while automatically adding the modelling assumption imposed by the addition of this term.
-
-        :param term_to_square: The term (column in data and variable in DAG) which is to be squared.
-        """
-        new_term = str(term_to_square) + "^2"
-        self.df[new_term] = self.df[term_to_square] ** 2
-        self.adjustment_set.add(new_term)
-        self.modelling_assumptions += (
-            f"Relationship between {self.treatment} and {self.outcome} varies quadratically" f"with {term_to_square}."
-        )
-        self.square_terms.append(term_to_square)
-
-    def add_inverse_term_to_df(self, term_to_invert: str):
-        """Add an inverse term to the linear regression model and df.
-
-        This enables the user to capture curvilinear relationships with a linear regression model, not just straight
-        lines, while automatically adding the modelling assumption imposed by the addition of this term.
-
-        :param term_to_square: The term (column in data and variable in DAG) which is to be squared.
-        """
-        new_term = "1/" + str(term_to_invert)
-        self.df[new_term] = 1 / self.df[term_to_invert]
-        self.adjustment_set.add(new_term)
-        self.modelling_assumptions += (
-            f"Relationship between {self.treatment} and {self.outcome} varies inversely" f"with {term_to_invert}."
-        )
-        self.inverse_terms.append(term_to_invert)
-
-    def add_product_term_to_df(self, term_a: str, term_b: str):
-        """Add a product term to the linear regression model and df.
-
-        This enables the user to capture interaction between a pair of variables in the model. In other words, while
-        each covariate's contribution to the mean is assumed to be independent of the other covariates, the pair of
-        product terms term_a*term_b a are restricted to vary linearly with each other.
-
-        :param term_a: The first term of the product term.
-        :param term_b: The second term of the product term.
-        """
-        new_term = str(term_a) + "*" + str(term_b)
-        self.df[new_term] = self.df[term_a] * self.df[term_b]
-        self.adjustment_set.add(new_term)
-        self.modelling_assumptions += f"{term_a} and {term_b} vary linearly with each other."
-        self.product_terms.append((term_a, term_b))
+    # def add_squared_term_to_df(self, term_to_square: str):
+    #     """Add a squared term to the linear regression model and df.
+    #
+    #     This enables the user to capture curvilinear relationships with a linear regression model, not just straight
+    #     lines, while automatically adding the modelling assumption imposed by the addition of this term.
+    #
+    #     :param term_to_square: The term (column in data and variable in DAG) which is to be squared.
+    #     """
+    #     new_term = str(term_to_square) + "^2"
+    #     self.df[new_term] = self.df[term_to_square] ** 2
+    #     self.adjustment_set.add(new_term)
+    #     self.modelling_assumptions += (
+    #         f"Relationship between {self.treatment} and {self.outcome} varies quadratically" f"with {term_to_square}."
+    #     )
+    #     self.square_terms.append(term_to_square)
+
+    # def add_inverse_term_to_df(self, term_to_invert: str):
+    #     """Add an inverse term to the linear regression model and df.
+    #
+    #     This enables the user to capture curvilinear relationships with a linear regression model, not just straight
+    #     lines, while automatically adding the modelling assumption imposed by the addition of this term.
+    #
+    #     :param term_to_square: The term (column in data and variable in DAG) which is to be squared.
+    #     """
+    #     new_term = "1/" + str(term_to_invert)
+    #     self.df[new_term] = 1 / self.df[term_to_invert]
+    #     self.adjustment_set.add(new_term)
+    #     self.modelling_assumptions += (
+    #         f"Relationship between {self.treatment} and {self.outcome} varies inversely" f"with {term_to_invert}."
+    #     )
+    #     self.inverse_terms.append(term_to_invert)
+
+    # def add_product_term_to_df(self, term_a: str, term_b: str):
+    #     """Add a product term to the linear regression model and df.
+    #
+    #     This enables the user to capture interaction between a pair of variables in the model. In other words, while
+    #     each covariate's contribution to the mean is assumed to be independent of the other covariates, the pair of
+    #     product terms term_a*term_b a are restricted to vary linearly with each other.
+    #
+    #     :param term_a: The first term of the product term.
+    #     :param term_b: The second term of the product term.
+    #     """
+    #     new_term = str(term_a) + "*" + str(term_b)
+    #     self.df[new_term] = self.df[term_a] * self.df[term_b]
+    #     self.adjustment_set.add(new_term)
+    #     self.modelling_assumptions += f"{term_a} and {term_b} vary linearly with each other."
+    #     self.product_terms.append((term_a, term_b))

     def estimate_unit_ate(self) -> float:
         """Estimate the unit average treatment effect of the treatment on the outcome. That is, the change in outcome
@@ -430,19 +431,16 @@ def estimate_control_treatment(self, adjustment_config: dict = None) -> tuple[pd
         model = self._run_linear_regression()
         self.model = model

-        x = pd.DataFrame()
+
+
+        x = pd.DataFrame(columns=self.df.columns)
         x[self.treatment] = [self.treatment_value, self.control_value]
         x["Intercept"] = 1#self.intercept
         for k, v in adjustment_config.items():
             x[k] = v
         for k, v in self.effect_modifiers.items():
             x[k] = v
-        for t in self.square_terms:
-            x[t + "^2"] = x[t] ** 2
-        for t in self.inverse_terms:
-            x["1/" + t] = 1 / x[t]
-        for a, b in self.product_terms:
-            x[f"{a}*{b}"] = x[a] * x[b]
+        x = dmatrix(self.formula.split("~")[1], x, return_type="dataframe")
         for col in x:
             if str(x.dtypes[col]) == "object":
                 x = pd.get_dummies(x, columns=[col], drop_first=True)
@@ -534,6 +532,7 @@ def _run_linear_regression(self) -> RegressionResultsWrapper:
         )
         # model = sm.OLS(outcome_col, treatment_and_adjustments_cols).fit()
         model = smf.ols(formula=self.formula, data=self.df).fit()
+        print(model.summary())
         return model

     def _get_confidence_intervals(self, model):
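
The estimators.py change above drops the add_*_term_to_df helpers in favour of formula terms: squared, inverse, and product terms now live in the patsy formula, and dmatrix() expands the formula's right-hand side into a design matrix at prediction time instead of materialising extra dataframe columns. The following standalone sketch illustrates that pattern; the formula string and column values are illustrative examples, not taken from the commit.

import numpy as np  # np must be in scope for np.power(...) terms in the formula
import pandas as pd
from patsy import dmatrix

# Two prediction rows (treatment vs. control), mirroring the frame built in
# estimate_control_treatment.
formula = "outcomes ~ treatments + np.power(treatments, 2)"
x = pd.DataFrame({"treatments": [100, 90]})

# Everything after "~" is the right-hand side; dmatrix() adds the Intercept
# column and evaluates np.power(treatments, 2) without mutating the dataframe.
design = dmatrix(formula.split("~")[1], x, return_type="dataframe")
print(list(design.columns))  # ['Intercept', 'treatments', 'np.power(treatments, 2)']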

tests/testing_tests/test_estimators.py

Lines changed: 33 additions & 23 deletions
@@ -158,6 +158,7 @@ def test_program_11_2(self):
         model = linear_regression_estimator._run_linear_regression()
         ate, _ = linear_regression_estimator.estimate_unit_ate()

+        print(model.summary())
         self.assertEqual(round(model.params["Intercept"] + 90 * model.params["treatments"], 1), 216.9)

         # Increasing treatments from 90 to 100 should be the same as 10 times the unit ATE
@@ -166,13 +167,13 @@ def test_program_11_2(self):
     def test_program_11_3(self):
         """Test whether our linear regression implementation produces the same results as program 11.3 (p. 144)."""
         df = self.chapter_11_df.copy()
-        linear_regression_estimator = LinearRegressionEstimator("treatments", 100, 90, set(), "outcomes", df)
-        linear_regression_estimator.add_squared_term_to_df("treatments")
+        linear_regression_estimator = LinearRegressionEstimator("treatments", 100, 90, set(), "outcomes", df, formula="outcomes ~ treatments + np.power(treatments, 2)")
+        # linear_regression_estimator.add_squared_term_to_df("treatments")
         model = linear_regression_estimator._run_linear_regression()
         ate, _ = linear_regression_estimator.estimate_unit_ate()
         self.assertEqual(
             round(
-                model.params["Intercept"] + 90 * model.params["treatments"] + 90 * 90 * model.params["treatments^2"], 1
+                model.params["Intercept"] + 90 * model.params["treatments"] + 90 * 90 * model.params["np.power(treatments, 2)"], 1
             ),
             197.1,
         )
@@ -198,13 +199,19 @@ def test_program_15_1A(self):
             "smokeintensity",
             "smokeyrs",
         }
-        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df)
-        terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
-        terms_to_product = [("qsmk", "smokeintensity")]
-        for term_to_square in terms_to_square:
-            linear_regression_estimator.add_squared_term_to_df(term_to_square)
-        for term_a, term_b in terms_to_product:
-            linear_regression_estimator.add_product_term_to_df(term_a, term_b)
+        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df,
+                                                                formula="""wt82_71 ~ qsmk +
+                                                                           age + np.power(age, 2) +
+                                                                           wt71 + np.power(wt71, 2) +
+                                                                           smokeintensity + np.power(smokeintensity, 2) +
+                                                                           smokeyrs + np.power(smokeyrs, 2) +
+                                                                           (qsmk * smokeintensity)""")
+        # terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
+        # terms_to_product = [("qsmk", "smokeintensity")]
+        # for term_to_square in terms_to_square:
+        #     linear_regression_estimator.add_squared_term_to_df(term_to_square)
+        # for term_a, term_b in terms_to_product:
+        #     linear_regression_estimator.add_product_term_to_df(term_a, term_b)

         model = linear_regression_estimator._run_linear_regression()
         self.assertEqual(round(model.params["qsmk"], 1), 2.6)
@@ -230,10 +237,11 @@ def test_program_15_no_interaction(self):
             "smokeintensity",
             "smokeyrs",
         }
-        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df)
-        terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
-        for term_to_square in terms_to_square:
-            linear_regression_estimator.add_squared_term_to_df(term_to_square)
+        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df,
+                                                                formula="wt82_71 ~ qsmk + age + np.power(age, 2) + wt71 + np.power(wt71, 2) + smokeintensity + np.power(smokeintensity, 2) + smokeyrs + np.power(smokeyrs, 2)")
+        # terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
+        # for term_to_square in terms_to_square:
+        #     linear_regression_estimator.add_squared_term_to_df(term_to_square)
         ate, [ci_low, ci_high] = linear_regression_estimator.estimate_unit_ate()
         self.assertEqual(round(ate, 1), 3.5)
         self.assertEqual([round(ci_low, 1), round(ci_high, 1)], [2.6, 4.3])
@@ -258,10 +266,11 @@ def test_program_15_no_interaction_ate(self):
             "smokeintensity",
             "smokeyrs",
         }
-        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df)
-        terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
-        for term_to_square in terms_to_square:
-            linear_regression_estimator.add_squared_term_to_df(term_to_square)
+        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df,
+                                                                formula="wt82_71 ~ qsmk + age + np.power(age, 2) + wt71 + np.power(wt71, 2) + smokeintensity + np.power(smokeintensity, 2) + smokeyrs + np.power(smokeyrs, 2)")
+        # terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
+        # for term_to_square in terms_to_square:
+        #     linear_regression_estimator.add_squared_term_to_df(term_to_square)
         ate, [ci_low, ci_high] = linear_regression_estimator.estimate_ate()
         self.assertEqual(round(ate, 1), 3.5)
         self.assertEqual([round(ci_low, 1), round(ci_high, 1)], [2.6, 4.3])
@@ -286,10 +295,11 @@ def test_program_15_no_interaction_ate_calculated(self):
             "smokeintensity",
             "smokeyrs",
         }
-        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df)
-        terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
-        for term_to_square in terms_to_square:
-            linear_regression_estimator.add_squared_term_to_df(term_to_square)
+        linear_regression_estimator = LinearRegressionEstimator("qsmk", 1, 0, covariates, "wt82_71", df,
+                                                                formula="wt82_71 ~ qsmk + age + np.power(age, 2) + wt71 + np.power(wt71, 2) + smokeintensity + np.power(smokeintensity, 2) + smokeyrs + np.power(smokeyrs, 2)")
+        # terms_to_square = ["age", "wt71", "smokeintensity", "smokeyrs"]
+        # for term_to_square in terms_to_square:
+        #     linear_regression_estimator.add_squared_term_to_df(term_to_square)
         ate, [ci_low, ci_high] = linear_regression_estimator.estimate_ate_calculated(
             {k: self.nhefs_df.mean()[k] for k in covariates}
         )
@@ -377,7 +387,7 @@ def test_X1_effect(self):
         """When we fix the value of X2 to 0, the effect of X1 on Y should become ~2 (because X2 terms are cancelled)."""
         x2 = Input("X2", float)
         lr_model = LinearRegressionEstimator(
-            ("X1",), 1, 0, {"X2"}, ("Y",), effect_modifiers={x2: 0}, formula="Y ~ X1 + X2 + (X1 * X2)", df=self.df
+            "X1", 1, 0, {"X2"}, "Y", effect_modifiers={x2: 0}, formula="Y ~ X1 + X2 + (X1 * X2)", df=self.df
         )
         test_results = lr_model.estimate_ate()
         ate = test_results[0]
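
The updated tests depend on statsmodels naming fitted coefficients after the raw formula terms, which is why the assertions now index model.params with "np.power(treatments, 2)" rather than the old "treatments^2" column. A short self-contained sketch on synthetic data (not the chapter 11 or NHEFS datasets the tests use) shows where those parameter names come from.

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Synthetic quadratic relationship, for illustration only.
rng = np.random.default_rng(0)
treatments = rng.uniform(0, 100, 200)
outcomes = 3.0 * treatments - 0.02 * treatments**2 + rng.normal(0, 1, 200)
df = pd.DataFrame({"treatments": treatments, "outcomes": outcomes})

model = smf.ols("outcomes ~ treatments + np.power(treatments, 2)", data=df).fit()
print(list(model.params.index))  # ['Intercept', 'treatments', 'np.power(treatments, 2)']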
