Skip to content

Commit 05c5499

Browse files
committed
RegressionEstimator class to combine common elements of Linear and Logistic Regression Estimator classes.
1 parent e7deb78 commit 05c5499

File tree

5 files changed

+68
-151
lines changed

5 files changed

+68
-151
lines changed

causal_testing/estimation/cubic_spline_estimator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def __init__(
4646
self.formula = f"{outcome} ~ cr({'+'.join(terms)}, df={basis})"
4747

4848
def estimate_ate_calculated(self, adjustment_config: dict = None) -> pd.Series:
49-
model = self._run_linear_regression()
49+
model = self._run_regression()
5050

5151
x = {"Intercept": 1, self.treatment: self.treatment_value}
5252
if adjustment_config is not None:

causal_testing/estimation/linear_regression_estimator.py

Lines changed: 15 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,22 @@
55

66
import pandas as pd
77
import statsmodels.formula.api as smf
8-
from patsy import dmatrix # pylint: disable = no-name-in-module
9-
from patsy import ModelDesc
10-
from statsmodels.regression.linear_model import RegressionResultsWrapper
8+
from patsy import dmatrix, ModelDesc # pylint: disable = no-name-in-module
119

1210
from causal_testing.specification.variable import Variable
1311
from causal_testing.estimation.gp import GP
14-
from causal_testing.estimation.estimator import Estimator
12+
from causal_testing.estimation.regression_estimator import RegressionEstimator
1513

1614
logger = logging.getLogger(__name__)
1715

1816

19-
class LinearRegressionEstimator(Estimator):
17+
class LinearRegressionEstimator(RegressionEstimator):
2018
"""A Linear Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
2119
combination of parameters and functions of the variables (note these functions need not be linear).
2220
"""
2321

22+
regressor = smf.ols
23+
2424
def __init__(
2525
# pylint: disable=too-many-arguments
2626
self,
@@ -35,6 +35,7 @@ def __init__(
3535
alpha: float = 0.05,
3636
query: str = "",
3737
):
38+
# pylint: disable=too-many-arguments
3839
super().__init__(
3940
treatment,
4041
treatment_value,
@@ -43,20 +44,10 @@ def __init__(
4344
outcome,
4445
df,
4546
effect_modifiers,
46-
alpha=alpha,
47-
query=query,
47+
formula,
48+
alpha,
49+
query,
4850
)
49-
50-
self.model = None
51-
if effect_modifiers is None:
52-
effect_modifiers = []
53-
54-
if formula is not None:
55-
self.formula = formula
56-
else:
57-
terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
58-
self.formula = f"{outcome} ~ {'+'.join(terms)}"
59-
6051
for term in self.effect_modifiers:
6152
self.adjustment_set.add(term)
6253

@@ -118,7 +109,7 @@ def estimate_coefficient(self) -> tuple[pd.Series, list[pd.Series, pd.Series]]:
118109
119110
:return: The unit average treatment effect and the 95% Wald confidence intervals.
120111
"""
121-
model = self._run_linear_regression()
112+
model = self._run_regression()
122113
newline = "\n"
123114
patsy_md = ModelDesc.from_formula(self.treatment)
124115

@@ -147,7 +138,7 @@ def estimate_ate(self) -> tuple[pd.Series, list[pd.Series, pd.Series]]:
147138
148139
:return: The average treatment effect and the 95% Wald confidence intervals.
149140
"""
150-
model = self._run_linear_regression()
141+
model = self._run_regression()
151142

152143
# Create an empty individual for the control and treated
153144
individuals = pd.DataFrame(1, index=["control", "treated"], columns=model.params.index)
@@ -167,37 +158,6 @@ def estimate_ate(self) -> tuple[pd.Series, list[pd.Series, pd.Series]]:
167158
confidence_intervals = [pd.Series(interval) for interval in confidence_intervals]
168159
return ate, confidence_intervals
169160

170-
def estimate_control_treatment(self, adjustment_config: dict = None) -> tuple[pd.Series, pd.Series]:
171-
"""Estimate the outcomes under control and treatment.
172-
173-
:return: The estimated outcome under control and treatment in the form
174-
(control_outcome, treatment_outcome).
175-
"""
176-
if adjustment_config is None:
177-
adjustment_config = {}
178-
model = self._run_linear_regression()
179-
180-
x = pd.DataFrame(columns=self.df.columns)
181-
x[self.treatment] = [self.treatment_value, self.control_value]
182-
x["Intercept"] = 1 # self.intercept
183-
184-
print(x[self.treatment])
185-
for k, v in adjustment_config.items():
186-
x[k] = v
187-
for k, v in self.effect_modifiers.items():
188-
x[k] = v
189-
x = dmatrix(self.formula.split("~")[1], x, return_type="dataframe")
190-
for col in x:
191-
if str(x.dtypes[col]) == "object":
192-
x = pd.get_dummies(x, columns=[col], drop_first=True)
193-
x = x[model.params.index]
194-
195-
x[self.treatment] = [self.treatment_value, self.control_value]
196-
197-
y = model.get_prediction(x).summary_frame()
198-
199-
return y.iloc[1], y.iloc[0]
200-
201161
def estimate_risk_ratio(self, adjustment_config: dict = None) -> tuple[pd.Series, list[pd.Series, pd.Series]]:
202162
"""Estimate the risk_ratio effect of the treatment on the outcome. That is, the change in outcome caused
203163
by changing the treatment variable from the control value to the treatment value.
@@ -206,7 +166,8 @@ def estimate_risk_ratio(self, adjustment_config: dict = None) -> tuple[pd.Series
206166
"""
207167
if adjustment_config is None:
208168
adjustment_config = {}
209-
control_outcome, treatment_outcome = self.estimate_control_treatment(adjustment_config=adjustment_config)
169+
prediction = self._predict(adjustment_config=adjustment_config)
170+
control_outcome, treatment_outcome = prediction.iloc[1], prediction.iloc[0]
210171
ci_low = pd.Series(treatment_outcome["mean_ci_lower"] / control_outcome["mean_ci_upper"])
211172
ci_high = pd.Series(treatment_outcome["mean_ci_upper"] / control_outcome["mean_ci_lower"])
212173
return pd.Series(treatment_outcome["mean"] / control_outcome["mean"]), [ci_low, ci_high]
@@ -221,20 +182,12 @@ def estimate_ate_calculated(self, adjustment_config: dict = None) -> tuple[pd.Se
221182
"""
222183
if adjustment_config is None:
223184
adjustment_config = {}
224-
control_outcome, treatment_outcome = self.estimate_control_treatment(adjustment_config=adjustment_config)
185+
prediction = self._predict(adjustment_config=adjustment_config)
186+
control_outcome, treatment_outcome = prediction.iloc[1], prediction.iloc[0]
225187
ci_low = pd.Series(treatment_outcome["mean_ci_lower"] - control_outcome["mean_ci_upper"])
226188
ci_high = pd.Series(treatment_outcome["mean_ci_upper"] - control_outcome["mean_ci_lower"])
227189
return pd.Series(treatment_outcome["mean"] - control_outcome["mean"]), [ci_low, ci_high]
228190

229-
def _run_linear_regression(self) -> RegressionResultsWrapper:
230-
"""Run linear regression of the treatment and adjustment set against the outcome and return the model.
231-
232-
:return: The model after fitting to data.
233-
"""
234-
model = smf.ols(formula=self.formula, data=self.df).fit()
235-
self.model = model
236-
return model
237-
238191
def _get_confidence_intervals(self, model, treatment):
239192
confidence_intervals = model.conf_int(alpha=self.alpha, cols=None)
240193
ci_low, ci_high = (

causal_testing/estimation/logistic_regression_estimator.py

Lines changed: 8 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,25 @@
11
"""This module contains the LogisticRegressionEstimator class for estimating categorical outcomes."""
22

33
import logging
4-
from typing import Any
54
from math import ceil
65

76
import numpy as np
87
import pandas as pd
98
import statsmodels.formula.api as smf
10-
from patsy import dmatrix # pylint: disable = no-name-in-module
11-
from statsmodels.regression.linear_model import RegressionResultsWrapper
129
from statsmodels.tools.sm_exceptions import PerfectSeparationError
1310

14-
from causal_testing.estimation.estimator import Estimator
11+
from causal_testing.estimation.regression_estimator import RegressionEstimator
1512

1613
logger = logging.getLogger(__name__)
1714

1815

19-
class LogisticRegressionEstimator(Estimator):
16+
class LogisticRegressionEstimator(RegressionEstimator):
2017
"""A Logistic Regression Estimator is a parametric estimator which restricts the variables in the data to a linear
2118
combination of parameters and functions of the variables (note these functions need not be linear). It is designed
2219
for estimating categorical outcomes.
2320
"""
2421

25-
def __init__(
26-
# pylint: disable=too-many-arguments
27-
self,
28-
treatment: str,
29-
treatment_value: float,
30-
control_value: float,
31-
adjustment_set: set,
32-
outcome: str,
33-
df: pd.DataFrame = None,
34-
effect_modifiers: dict[str:Any] = None,
35-
formula: str = None,
36-
query: str = "",
37-
):
38-
super().__init__(
39-
treatment=treatment,
40-
treatment_value=treatment_value,
41-
control_value=control_value,
42-
adjustment_set=adjustment_set,
43-
outcome=outcome,
44-
df=df,
45-
effect_modifiers=effect_modifiers,
46-
query=query,
47-
)
48-
49-
self.model = None
50-
if effect_modifiers is None:
51-
effect_modifiers = []
52-
if formula is not None:
53-
self.formula = formula
54-
else:
55-
terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(self.effect_modifiers))
56-
self.formula = f"{outcome} ~ {'+'.join(((terms)))}"
22+
regressor = smf.logit
5723

5824
def add_modelling_assumptions(self):
5925
"""
@@ -68,43 +34,6 @@ def add_modelling_assumptions(self):
6834
self.modelling_assumptions.append("The outcome must be binary.")
6935
self.modelling_assumptions.append("Independently and identically distributed errors.")
7036

71-
def _run_logistic_regression(self, data) -> RegressionResultsWrapper:
72-
"""Run logistic regression of the treatment and adjustment set against the outcome and return the model.
73-
74-
:return: The model after fitting to data.
75-
"""
76-
model = smf.logit(formula=self.formula, data=data).fit(disp=0)
77-
self.model = model
78-
return model
79-
80-
def estimate(self, data: pd.DataFrame, adjustment_config: dict = None) -> RegressionResultsWrapper:
81-
"""add terms to the dataframe and estimate the outcome from the data
82-
:param data: A pandas dataframe containing execution data from the system-under-test.
83-
:param adjustment_config: Dictionary containing the adjustment configuration of the adjustment set
84-
"""
85-
if adjustment_config is None:
86-
adjustment_config = {}
87-
if set(self.adjustment_set) != set(adjustment_config):
88-
raise ValueError(
89-
f"Invalid adjustment configuration {adjustment_config}. Must specify values for {self.adjustment_set}"
90-
)
91-
92-
model = self._run_logistic_regression(data)
93-
94-
x = pd.DataFrame(columns=self.df.columns)
95-
x["Intercept"] = 1 # self.intercept
96-
x[self.treatment] = [self.treatment_value, self.control_value]
97-
for k, v in adjustment_config.items():
98-
x[k] = v
99-
for k, v in self.effect_modifiers.items():
100-
x[k] = v
101-
x = dmatrix(self.formula.split("~")[1], x, return_type="dataframe")
102-
for col in x:
103-
if str(x.dtypes[col]) == "object":
104-
x = pd.get_dummies(x, columns=[col], drop_first=True)
105-
# x = x[model.params.index]
106-
return model.predict(x)
107-
10837
def estimate_control_treatment(
10938
self, adjustment_config: dict = None, bootstrap_size: int = 100
11039
) -> tuple[pd.Series, pd.Series]:
@@ -115,11 +44,13 @@ def estimate_control_treatment(
11544
"""
11645
if adjustment_config is None:
11746
adjustment_config = {}
118-
y = self.estimate(self.df, adjustment_config=adjustment_config)
47+
y = self._predict(self.df, adjustment_config=adjustment_config)["predicted"]
11948

12049
try:
12150
bootstrap_samples = [
122-
self.estimate(self.df.sample(len(self.df), replace=True), adjustment_config=adjustment_config)
51+
self._predict(self.df.sample(len(self.df), replace=True), adjustment_config=adjustment_config)[
52+
"predicted"
53+
]
12354
for _ in range(bootstrap_size)
12455
]
12556
control, treatment = zip(*[(x.iloc[1], x.iloc[0]) for x in bootstrap_samples])
@@ -214,5 +145,5 @@ def estimate_unit_odds_ratio(self) -> float:
214145
215146
:return: The odds ratio. Confidence intervals are not yet supported.
216147
"""
217-
model = self._run_logistic_regression(self.df)
148+
model = self._run_regression(self.df)
218149
return np.exp(model.params[self.treatment])

causal_testing/estimation/regression_estimator.py

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,13 @@
22

33
import logging
44
from typing import Any
5-
from abc import abstractmethod, abstractmethod
5+
from abc import abstractmethod
66

77
import pandas as pd
8-
import statsmodels.formula.api as smf
9-
from patsy import dmatrix # pylint: disable = no-name-in-module
10-
from patsy import ModelDesc
118
from statsmodels.regression.linear_model import RegressionResultsWrapper
9+
from patsy import dmatrix # pylint: disable = no-name-in-module
1210

1311
from causal_testing.specification.variable import Variable
14-
from causal_testing.estimation.gp import GP
1512
from causal_testing.estimation.estimator import Estimator
1613

1714
logger = logging.getLogger(__name__)
@@ -50,17 +47,23 @@ def __init__(
5047
self.model = None
5148
if effect_modifiers is None:
5249
effect_modifiers = []
50+
if adjustment_set is None:
51+
adjustment_set = []
5352
if formula is not None:
5453
self.formula = formula
5554
else:
5655
terms = [treatment] + sorted(list(adjustment_set)) + sorted(list(effect_modifiers))
5756
self.formula = f"{outcome} ~ {'+'.join(terms)}"
58-
for term in self.effect_modifiers:
59-
self.adjustment_set.add(term)
6057

6158
@property
6259
@abstractmethod
63-
def regression(self):
60+
def regressor(self):
61+
"""
62+
The regressor to use, e.g. ols or logit.
63+
This should be a property accessible with self.regressor.
64+
Define as `regressor = ...`` outside of __init__, not as `self.regressor = ...`, otherwise
65+
you'll get an "cannot instantiate with abstract method" error.
66+
"""
6467
raise NotImplementedError("Subclasses must implement the 'model' property.")
6568

6669
def add_modelling_assumptions(self):
@@ -81,6 +84,37 @@ def _run_regression(self, data=None) -> RegressionResultsWrapper:
8184
"""
8285
if data is None:
8386
data = self.df
84-
model = self.regression(formula=self.formula, data=data).fit(disp=0)
87+
model = self.regressor(formula=self.formula, data=data).fit(disp=0)
8588
self.model = model
8689
return model
90+
91+
def _predict(self, data=None, adjustment_config: dict = None) -> tuple[pd.Series, pd.Series]:
92+
"""Estimate the outcomes under control and treatment.
93+
94+
:param data: The data to use, defaults to `self.df`. Controllable for boostrap sampling.
95+
:param: adjustment_config: The values of the adjustment variables to use.
96+
97+
:return: The estimated outcome under control and treatment, with confidence intervals in the form of a
98+
dataframe with columns "predicted", "se", "ci_lower", and "ci_upper".
99+
"""
100+
if adjustment_config is None:
101+
adjustment_config = {}
102+
103+
model = self._run_regression(data)
104+
105+
x = pd.DataFrame(columns=self.df.columns)
106+
x["Intercept"] = 1 # self.intercept
107+
x[self.treatment] = [self.treatment_value, self.control_value]
108+
109+
for k, v in adjustment_config.items():
110+
x[k] = v
111+
for k, v in self.effect_modifiers.items():
112+
x[k] = v
113+
x = dmatrix(self.formula.split("~")[1], x, return_type="dataframe")
114+
for col in x:
115+
if str(x.dtypes[col]) == "object":
116+
x = pd.get_dummies(x, columns=[col], drop_first=True)
117+
118+
# This has to be here in case the treatment variable is in an I(...) block in the self.formula
119+
x[self.treatment] = [self.treatment_value, self.control_value]
120+
return model.get_prediction(x).summary_frame()

tests/estimation_tests/test_cubic_spline_estimator.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def test_program_11_3_cublic_spline(self):
2727

2828
cublic_spline_estimator = CubicSplineRegressionEstimator("treatments", 1, 0, set(), "outcomes", 3, df)
2929

30-
model = cublic_spline_estimator._run_linear_regression()
30+
ate_1 = cublic_spline_estimator.estimate_ate_calculated()
3131

3232
self.assertEqual(
3333
round(
@@ -37,7 +37,6 @@ def test_program_11_3_cublic_spline(self):
3737
195.6,
3838
)
3939

40-
ate_1 = cublic_spline_estimator.estimate_ate_calculated()
4140
cublic_spline_estimator.treatment_value = 2
4241
ate_2 = cublic_spline_estimator.estimate_ate_calculated()
4342

0 commit comments

Comments
 (0)