Skip to content

Commit 9105d31

Browse files
committed
Bootstrap confidence intervals for logistic regression
1 parent 479119d commit 9105d31

File tree

2 files changed

+47
-26
lines changed

2 files changed

+47
-26
lines changed

causal_testing/generation/abstract_causal_test_case.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ def generate_concrete_tests(
182182
runs = pd.DataFrame()
183183
ks_stats = []
184184

185+
pre_break = False
185186
for i in range(hard_max):
186187
concrete_tests_, runs_ = self._generate_concrete_tests(sample_size, rct, seed + i)
187188
concrete_tests += concrete_tests_
@@ -216,11 +217,13 @@ def generate_concrete_tests(
216217
treatment_values = [test.treatment_input_configuration[self.treatment_variable] for test in concrete_tests]
217218

218219
if issubclass(self.treatment_variable.datatype, Enum) and set(zip(control_values, treatment_values)).issubset(itertools.product(self.treatment_variable.datatype, self.treatment_variable.datatype)):
220+
pre_break = True
219221
break
220222
elif target_ks_score and all((stat <= target_ks_score for stat in ks_stats.values())):
223+
pre_break = True
221224
break
222225

223-
if target_ks_score is not None and not all((stat <= target_ks_score for stat in ks_stats.values())):
226+
if target_ks_score is not None and not pre_break:
224227
logger.error(
225228
"Hard max of %s reached but could not achieve target ks_score of %s. Got %s.",
226229
hard_max,

causal_testing/testing/estimators.py

Lines changed: 43 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -121,13 +121,13 @@ def add_modelling_assumptions(self):
121121
self.modelling_assumptions += "The outcome must be binary."
122122
self.modelling_assumptions += "Independently and identically distributed errors."
123123

124-
def _run_logistic_regression(self) -> RegressionResultsWrapper:
124+
def _run_logistic_regression(self, data) -> RegressionResultsWrapper:
125125
"""Run logistic regression of the treatment and adjustment set against the outcome and return the model.
126126
127127
:return: The model after fitting to data.
128128
"""
129129
# 1. Reduce dataframe to contain only the necessary columns
130-
reduced_df = self.df.copy()
130+
reduced_df = data.copy()
131131
necessary_cols = list(self.treatment) + list(self.adjustment_set) + list(self.outcome)
132132
missing_rows = reduced_df[necessary_cols].isnull().any(axis=1)
133133
reduced_df = reduced_df[~missing_rows]
@@ -149,13 +149,8 @@ def _run_logistic_regression(self) -> RegressionResultsWrapper:
149149
model = regression.fit()
150150
return model
151151

152-
def estimate_control_treatment(self) -> tuple[pd.Series, pd.Series]:
153-
"""Estimate the outcomes under control and treatment.
154-
155-
:return: The estimated control and treatment values and their confidence
156-
intervals in the form ((ci_low, control, ci_high), (ci_low, treatment, ci_high)).
157-
"""
158-
model = self._run_logistic_regression()
152+
def estimate(self, data):
153+
model = self._run_logistic_regression(data)
159154
self.model = model
160155

161156
x = pd.DataFrame()
@@ -175,32 +170,47 @@ def estimate_control_treatment(self) -> tuple[pd.Series, pd.Series]:
175170
x = pd.get_dummies(x, columns=[col], drop_first=True)
176171
x = x[model.params.index]
177172

178-
y = model.predict(x)
173+
return model.predict(x)
174+
175+
def estimate_control_treatment(self, bootstrap_size=100) -> tuple[pd.Series, pd.Series]:
176+
"""Estimate the outcomes under control and treatment.
177+
178+
:return: The estimated control and treatment values and their confidence
179+
intervals in the form ((ci_low, control, ci_high), (ci_low, treatment, ci_high)).
180+
"""
181+
182+
y = self.estimate(self.df)
183+
184+
bootstrap_samples = [self.estimate(self.df.sample(len(self.df), replace=True)) for _ in range(bootstrap_size)]
185+
control, treatment = zip(*[(x.iloc[1], x.iloc[0]) for x in bootstrap_samples])
186+
179187

180188
# Delta method confidence intervals from
181189
# https://stackoverflow.com/questions/47414842/confidence-interval-of-probability-prediction-from-logistic-regression-statsmode
182-
cov = model.cov_params()
183-
gradient = (y * (1 - y) * x.T).T # matrix of gradients for each observation
184-
std_errors = np.array([np.sqrt(np.dot(np.dot(g, cov), g)) for g in gradient.to_numpy()])
185-
c = 1.96 # multiplier for confidence interval
186-
upper = np.maximum(0, np.minimum(1, y + std_errors * c))
187-
lower = np.maximum(0, np.minimum(1, y - std_errors * c))
190+
# cov = model.cov_params()
191+
# gradient = (y * (1 - y) * x.T).T # matrix of gradients for each observation
192+
# std_errors = np.array([np.sqrt(np.dot(np.dot(g, cov), g)) for g in gradient.to_numpy()])
193+
# c = 1.96 # multiplier for confidence interval
194+
# upper = np.maximum(0, np.minimum(1, y + std_errors * c))
195+
# lower = np.maximum(0, np.minimum(1, y - std_errors * c))
188196

189-
return (lower.iloc[1], y.iloc[1], upper.iloc[1]), (lower.iloc[0], y.iloc[0], upper.iloc[0])
197+
return (y.iloc[1], np.array(control)), (y.iloc[0], np.array(treatment))
190198

191-
def estimate_ate(self) -> float:
199+
def estimate_ate(self, bootstrap_size=100) -> float:
192200
"""Estimate the ate effect of the treatment on the outcome. That is, the change in outcome caused
193201
by changing the treatment variable from the control value to the treatment value. Here, we actually
194202
calculate the expected outcomes under control and treatment and take one away from the other. This
195203
allows for custom terms to be put in such as squares, inverses, products, etc.
196204
197205
:return: The estimated average treatment effect and 95% confidence intervals
198206
"""
199-
(cci_low, control_outcome, cci_high), (tci_low, treatment_outcome, tci_high) = self.estimate_control_treatment()
207+
(control_outcome, control_bootstraps), (treatment_outcome, treatment_bootstraps) = self.estimate_control_treatment()
200208

201-
ci_low = tci_low - cci_high
202-
ci_high = tci_high - cci_low
203209
estimate = treatment_outcome - control_outcome
210+
bootstraps = sorted(list(treatment_bootstraps - control_bootstraps))
211+
bound = int((bootstrap_size * 0.05)/2)
212+
ci_low = bootstraps[bound]
213+
ci_high = bootstraps[bootstrap_size - bound]
204214

205215
logger.info(
206216
f"Changing {self.treatment[0]} from {self.control_values} to {self.treatment_values} gives an estimated ATE of {ci_low} < {estimate} < {ci_high}"
@@ -217,12 +227,20 @@ def estimate_risk_ratio(self) -> float:
217227
218228
:return: The estimated risk ratio and 95% confidence intervals.
219229
"""
220-
(cci_low, control_outcome, cci_high), (tci_low, treatment_outcome, tci_high) = self.estimate_control_treatment()
230+
(control_outcome, control_bootstraps), (treatment_outcome, treatment_bootstraps) = self.estimate_control_treatment()
231+
232+
estimate = treatment_outcome / control_outcome
233+
bootstraps = sorted(list(treatment_bootstraps / control_bootstraps))
234+
bound = int((bootstrap_size * 0.05)/2)
235+
ci_low = bootstraps[bound]
236+
ci_high = bootstraps[bootstrap_size - bound]
221237

222-
ci_low = tci_low / cci_high
223-
ci_high = tci_high / cci_low
238+
logger.info(
239+
f"Changing {self.treatment[0]} from {self.control_values} to {self.treatment_values} gives an estimated risk ratio of {ci_low} < {estimate} < {ci_high}"
240+
)
241+
assert ci_low < estimate < ci_high, f"Expecting {ci_low} < {estimate} < {ci_high}"
224242

225-
return treatment_outcome / control_outcome, (ci_low, ci_high)
243+
return estimate, (ci_low, ci_high)
226244

227245
def estimate_unit_odds_ratio(self) -> float:
228246
"""Estimate the odds ratio of increasing the treatment by one. In logistic regression, this corresponds to the

0 commit comments

Comments
 (0)