@@ -121,13 +121,13 @@ def add_modelling_assumptions(self):
121
121
self .modelling_assumptions += "The outcome must be binary."
122
122
self .modelling_assumptions += "Independently and identically distributed errors."
123
123
124
- def _run_logistic_regression (self ) -> RegressionResultsWrapper :
124
+ def _run_logistic_regression (self , data ) -> RegressionResultsWrapper :
125
125
"""Run logistic regression of the treatment and adjustment set against the outcome and return the model.
126
126
127
127
:return: The model after fitting to data.
128
128
"""
129
129
# 1. Reduce dataframe to contain only the necessary columns
130
- reduced_df = self . df .copy ()
130
+ reduced_df = data .copy ()
131
131
necessary_cols = list (self .treatment ) + list (self .adjustment_set ) + list (self .outcome )
132
132
missing_rows = reduced_df [necessary_cols ].isnull ().any (axis = 1 )
133
133
reduced_df = reduced_df [~ missing_rows ]
@@ -149,13 +149,8 @@ def _run_logistic_regression(self) -> RegressionResultsWrapper:
149
149
model = regression .fit ()
150
150
return model
151
151
152
- def estimate_control_treatment (self ) -> tuple [pd .Series , pd .Series ]:
153
- """Estimate the outcomes under control and treatment.
154
-
155
- :return: The estimated control and treatment values and their confidence
156
- intervals in the form ((ci_low, control, ci_high), (ci_low, treatment, ci_high)).
157
- """
158
- model = self ._run_logistic_regression ()
152
+ def estimate (self , data ):
153
+ model = self ._run_logistic_regression (data )
159
154
self .model = model
160
155
161
156
x = pd .DataFrame ()
@@ -175,32 +170,47 @@ def estimate_control_treatment(self) -> tuple[pd.Series, pd.Series]:
175
170
x = pd .get_dummies (x , columns = [col ], drop_first = True )
176
171
x = x [model .params .index ]
177
172
178
- y = model .predict (x )
173
+ return model .predict (x )
174
+
175
def estimate_control_treatment(
    self, bootstrap_size=100
) -> tuple[tuple[float, np.ndarray], tuple[float, np.ndarray]]:
    """Estimate the outcomes under control and treatment, plus bootstrap replicates of each.

    The point estimates come from ``self.estimate(self.df)``. In addition, ``self.df`` is resampled
    with replacement (same number of rows) ``bootstrap_size`` times and ``self.estimate`` is re-run on
    every resample, so that callers can derive empirical confidence intervals from the replicates.

    Position 1 of each prediction is taken as the control outcome and position 0 as the treatment
    outcome.

    :param bootstrap_size: Number of bootstrap resamples to draw. Must be at least 1.
    :return: ``((control, control_bootstraps), (treatment, treatment_bootstraps))`` where the
             bootstrap entries are numpy arrays of length ``bootstrap_size``.
    :raises ValueError: If ``bootstrap_size`` is less than 1.
    """
    if bootstrap_size < 1:
        raise ValueError(f"bootstrap_size must be at least 1, got {bootstrap_size}")

    n_rows = len(self.df)
    bootstrap_samples = [
        self.estimate(self.df.sample(n_rows, replace=True)) for _ in range(bootstrap_size)
    ]
    control = np.array([sample.iloc[1] for sample in bootstrap_samples])
    treatment = np.array([sample.iloc[0] for sample in bootstrap_samples])

    # Fit on the full data LAST: `estimate` stores the model it fits on `self.model`, so running the
    # bootstrap refits afterwards would leave `self.model` pointing at a model fitted to a random
    # resample instead of the real data.
    y = self.estimate(self.df)

    return (y.iloc[1], control), (y.iloc[0], treatment)
190
198
191
- def estimate_ate (self ) -> float :
199
+ def estimate_ate (self , bootstrap_size = 100 ) -> float :
192
200
"""Estimate the ate effect of the treatment on the outcome. That is, the change in outcome caused
193
201
by changing the treatment variable from the control value to the treatment value. Here, we actually
194
202
calculate the expected outcomes under control and treatment and take one away from the other. This
195
203
allows for custom terms to be put in such as squares, inverses, products, etc.
196
204
197
205
:return: The estimated average treatment effect and 95% confidence intervals
198
206
"""
199
- (cci_low , control_outcome , cci_high ), (tci_low , treatment_outcome , tci_high ) = self .estimate_control_treatment ()
207
+ (control_outcome , control_bootstraps ), (treatment_outcome , treatment_bootstraps ) = self .estimate_control_treatment ()
200
208
201
- ci_low = tci_low - cci_high
202
- ci_high = tci_high - cci_low
203
209
estimate = treatment_outcome - control_outcome
210
+ bootstraps = sorted (list (treatment_bootstraps - control_bootstraps ))
211
+ bound = int ((bootstrap_size * 0.05 )/ 2 )
212
+ ci_low = bootstraps [bound ]
213
+ ci_high = bootstraps [bootstrap_size - bound ]
204
214
205
215
logger .info (
206
216
f"Changing { self .treatment [0 ]} from { self .control_values } to { self .treatment_values } gives an estimated ATE of { ci_low } < { estimate } < { ci_high } "
@@ -217,12 +227,20 @@ def estimate_risk_ratio(self) -> float:
217
227
218
228
:return: The estimated risk ratio and 95% confidence intervals.
219
229
"""
220
- (cci_low , control_outcome , cci_high ), (tci_low , treatment_outcome , tci_high ) = self .estimate_control_treatment ()
230
+ (control_outcome , control_bootstraps ), (treatment_outcome , treatment_bootstraps ) = self .estimate_control_treatment ()
231
+
232
+ estimate = treatment_outcome / control_outcome
233
+ bootstraps = sorted (list (treatment_bootstraps / control_bootstraps ))
234
+ bound = int ((bootstrap_size * 0.05 )/ 2 )
235
+ ci_low = bootstraps [bound ]
236
+ ci_high = bootstraps [bootstrap_size - bound ]
221
237
222
- ci_low = tci_low / cci_high
223
- ci_high = tci_high / cci_low
238
+ logger .info (
239
+ f"Changing { self .treatment [0 ]} from { self .control_values } to { self .treatment_values } gives an estimated risk ratio of { ci_low } < { estimate } < { ci_high } "
240
+ )
241
+ assert ci_low < estimate < ci_high , f"Expecting { ci_low } < { estimate } < { ci_high } "
224
242
225
- return treatment_outcome / control_outcome , (ci_low , ci_high )
243
+ return estimate , (ci_low , ci_high )
226
244
227
245
def estimate_unit_odds_ratio (self ) -> float :
228
246
"""Estimate the odds ratio of increasing the treatment by one. In logistic regression, this corresponds to the
0 commit comments