@@ -135,20 +135,20 @@ def _run_logistic_regression(self, data) -> RegressionResultsWrapper:
135
135
"""
136
136
# 1. Reduce dataframe to contain only the necessary columns
137
137
reduced_df = data .copy ()
138
- necessary_cols = list ( self .treatment ) + list (self .adjustment_set ) + list ( self .outcome )
138
+ necessary_cols = [ self .treatment ] + list (self .adjustment_set ) + [ self .outcome ]
139
139
missing_rows = reduced_df [necessary_cols ].isnull ().any (axis = 1 )
140
140
reduced_df = reduced_df [~ missing_rows ]
141
- reduced_df = reduced_df .sort_values (list ( self .treatment ) )
141
+ reduced_df = reduced_df .sort_values ([ self .treatment ] )
142
142
logger .debug (reduced_df [necessary_cols ])
143
143
144
144
# 2. Add intercept
145
145
reduced_df ["Intercept" ] = self .intercept
146
146
147
147
# 3. Estimate the unit difference in outcome caused by unit difference in treatment
148
- cols = list ( self .treatment )
148
+ cols = [ self .treatment ]
149
149
cols += [x for x in self .adjustment_set if x not in cols ]
150
150
treatment_and_adjustments_cols = reduced_df [cols + ["Intercept" ]]
151
- outcome_col = reduced_df [list ( self .outcome ) ]
151
+ outcome_col = reduced_df [[ self .outcome ] ]
152
152
for col in treatment_and_adjustments_cols :
153
153
if str (treatment_and_adjustments_cols .dtypes [col ]) == "object" :
154
154
treatment_and_adjustments_cols = pd .get_dummies (
@@ -167,7 +167,7 @@ def estimate(self, data: pd.DataFrame) -> RegressionResultsWrapper:
167
167
self .model = model
168
168
169
169
x = pd .DataFrame ()
170
- x [self .treatment [ 0 ] ] = [self .treatment_value , self .control_value ]
170
+ x [self .treatment ] = [self .treatment_value , self .control_value ]
171
171
x ["Intercept" ] = self .intercept
172
172
for k , v in self .effect_modifiers .items ():
173
173
x [k ] = v
@@ -240,7 +240,7 @@ def estimate_ate(self, bootstrap_size=100) -> float:
240
240
ci_high = bootstraps [bootstrap_size - bound ]
241
241
242
242
logger .info (
243
- f"Changing { self .treatment [ 0 ] } from { self .control_value } to { self .treatment_value } gives an estimated "
243
+ f"Changing { self .treatment } from { self .control_value } to { self .treatment_value } gives an estimated "
244
244
f"ATE of { ci_low } < { estimate } < { ci_high } "
245
245
)
246
246
assert ci_low < estimate < ci_high , f"Expecting { ci_low } < { estimate } < { ci_high } "
@@ -270,7 +270,7 @@ def estimate_risk_ratio(self, bootstrap_size=100) -> float:
270
270
ci_high = bootstraps [bootstrap_size - bound ]
271
271
272
272
logger .info (
273
- f"Changing { self .treatment [ 0 ] } from { self .control_value } to { self .treatment_value } gives an estimated "
273
+ f"Changing { self .treatment } from { self .control_value } to { self .treatment_value } gives an estimated "
274
274
f"risk ratio of { ci_low } < { estimate } < { ci_high } "
275
275
)
276
276
assert ci_low < estimate < ci_high , f"Expecting { ci_low } < { estimate } < { ci_high } "
@@ -284,7 +284,7 @@ def estimate_unit_odds_ratio(self) -> float:
284
284
:return: The odds ratio. Confidence intervals are not yet supported.
285
285
"""
286
286
model = self ._run_logistic_regression (self .df )
287
- return np .exp (model .params [self .treatment [ 0 ] ])
287
+ return np .exp (model .params [self .treatment ])
288
288
289
289
290
290
class LinearRegressionEstimator (Estimator ):
@@ -385,7 +385,7 @@ def estimate_unit_ate(self) -> float:
385
385
:return: The unit average treatment effect and the 95% Wald confidence intervals.
386
386
"""
387
387
model = self ._run_linear_regression ()
388
- unit_effect = model .params [list ( self .treatment ) ].values [0 ] # Unit effect is the coefficient of the treatment
388
+ unit_effect = model .params [[ self .treatment ] ].values [0 ] # Unit effect is the coefficient of the treatment
389
389
[ci_low , ci_high ] = self ._get_confidence_intervals (model )
390
390
391
391
return unit_effect * self .treatment_value - unit_effect * self .control_value , [ci_low , ci_high ]
@@ -409,8 +409,8 @@ def estimate_ate(self) -> tuple[float, list[float, float], float]:
409
409
410
410
# It is ABSOLUTELY CRITICAL that these go last, otherwise we can't index
411
411
# the effect with "ate = t_test_results.effect[0]"
412
- individuals .loc ["control" , list ( self .treatment ) ] = self .control_value
413
- individuals .loc ["treated" , list ( self .treatment ) ] = self .treatment_value
412
+ individuals .loc ["control" , [ self .treatment ] ] = self .control_value
413
+ individuals .loc ["treated" , [ self .treatment ] ] = self .treatment_value
414
414
415
415
# Perform a t-test to compare the predicted outcome of the control and treated individual (ATE)
416
416
t_test_results = model .t_test (individuals .loc ["treated" ] - individuals .loc ["control" ])
@@ -431,7 +431,7 @@ def estimate_control_treatment(self, adjustment_config: dict = None) -> tuple[pd
431
431
self .model = model
432
432
433
433
x = pd .DataFrame ()
434
- x [self .treatment [ 0 ] ] = [self .treatment_value , self .control_value ]
434
+ x [self .treatment ] = [self .treatment_value , self .control_value ]
435
435
x ["Intercept" ] = 1 #self.intercept
436
436
for k , v in adjustment_config .items ():
437
437
x [k ] = v
@@ -487,7 +487,7 @@ def estimate_cates(self) -> tuple[float, list[float, float]]:
487
487
self .effect_modifiers
488
488
), f"Must have at least one effect modifier to compute CATE - { self .effect_modifiers } ."
489
489
x = pd .DataFrame ()
490
- x [self .treatment [ 0 ] ] = [self .treatment_value , self .control_value ]
490
+ x [self .treatment ] = [self .treatment_value , self .control_value ]
491
491
x ["Intercept" ] = 1 #self.intercept
492
492
for k , v in self .effect_modifiers .items ():
493
493
self .adjustment_set .add (k )
@@ -513,20 +513,20 @@ def _run_linear_regression(self) -> RegressionResultsWrapper:
513
513
"""
514
514
# 1. Reduce dataframe to contain only the necessary columns
515
515
reduced_df = self .df .copy ()
516
- necessary_cols = list ( self .treatment ) + list (self .adjustment_set ) + list ( self .outcome )
516
+ necessary_cols = [ self .treatment ] + list (self .adjustment_set ) + [ self .outcome ]
517
517
missing_rows = reduced_df [necessary_cols ].isnull ().any (axis = 1 )
518
518
reduced_df = reduced_df [~ missing_rows ]
519
- reduced_df = reduced_df .sort_values (list ( self .treatment ) )
519
+ reduced_df = reduced_df .sort_values ([ self .treatment ] )
520
520
logger .debug (reduced_df [necessary_cols ])
521
521
522
522
# 2. Add intercept
523
523
reduced_df ["Intercept" ] = 1 #self.intercept
524
524
525
525
# 3. Estimate the unit difference in outcome caused by unit difference in treatment
526
- cols = list ( self .treatment )
526
+ cols = [ self .treatment ]
527
527
cols += [x for x in self .adjustment_set if x not in cols ]
528
528
treatment_and_adjustments_cols = reduced_df [cols + ["Intercept" ]]
529
- outcome_col = reduced_df [list ( self .outcome ) ]
529
+ outcome_col = reduced_df [[ self .outcome ] ]
530
530
for col in treatment_and_adjustments_cols :
531
531
if str (treatment_and_adjustments_cols .dtypes [col ]) == "object" :
532
532
treatment_and_adjustments_cols = pd .get_dummies (
@@ -539,12 +539,66 @@ def _run_linear_regression(self) -> RegressionResultsWrapper:
539
539
def _get_confidence_intervals (self , model ):
540
540
confidence_intervals = model .conf_int (alpha = 0.05 , cols = None )
541
541
ci_low , ci_high = (
542
- confidence_intervals [0 ][list ( self .treatment ) ],
543
- confidence_intervals [1 ][list ( self .treatment ) ],
542
+ confidence_intervals [0 ][[ self .treatment ] ],
543
+ confidence_intervals [1 ][[ self .treatment ] ],
544
544
)
545
545
return [ci_low .values [0 ], ci_high .values [0 ]]
546
546
547
547
548
class InstrumentalVariableEstimator(Estimator):
    """
    Carry out estimation using instrumental variable adjustment rather than conventional adjustment. This means we do
    not need to observe all confounders in order to adjust for them. A key assumption here is linearity.
    """

    def __init__(
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        instrument: str,
        df: pd.DataFrame = None,
        intercept: int = 1,
        effect_modifiers: dict = None,  # Not used (yet?). Needed for compatibility
    ):
        """
        :param treatment: Name of the treatment variable.
        :param treatment_value: Value of the treatment for the treated individual.
        :param control_value: Value of the treatment for the control individual.
        :param adjustment_set: Set of adjustment variables (passed through to the base class).
        :param outcome: Name of the outcome variable.
        :param instrument: Name of the instrumental variable.
        :param df: Data to estimate from.
        :param intercept: Intercept term for the linear model.
        :param effect_modifiers: Unused; accepted only for signature compatibility with sibling estimators.
        """
        super().__init__(treatment, treatment_value, control_value, adjustment_set, outcome, df, None)
        self.intercept = intercept
        self.model = None
        self.instrument = instrument

    def add_modelling_assumptions(self):
        """
        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
        must hold if the resulting causal inference is to be considered valid.
        """
        self.modelling_assumptions += """The instrument and the treatment, and the treatment and the outcome must be
        related linearly in the form Y = aX + b."""
        self.modelling_assumptions += """The three IV conditions must hold
        (i) Instrument is associated with treatment
        (ii) Instrument does not affect outcome except through its potential effect on treatment
        (iii) Instrument and outcome do not share causes
        """

    def estimate_coefficient(self):
        """
        Estimate the linear regression coefficient of the treatment on the outcome via the Wald (ratio) estimator.

        :return: The estimated unit effect of the treatment on the outcome.
        """
        # Include an intercept in both stage regressions. The stated modelling assumption is
        # Y = aX + b (linear WITH an intercept); regressing on the instrument column alone forces
        # the fitted line through the origin, which biases both stage coefficients (and hence the
        # ratio) whenever the variables are not mean-centred.
        instrument_with_const = sm.add_constant(self.df[self.instrument])

        # 1. Reduced-form (total) effect of instrument I on outcome: Y = ab*I + c1
        ab = sm.OLS(self.df[self.outcome], instrument_with_const).fit().params[self.instrument]

        # 2. First-stage effect of instrument I on treatment: X = a*I + c2
        a = sm.OLS(self.df[self.treatment], instrument_with_const).fit().params[self.instrument]

        # 3. Wald estimator: the coefficient of X on Y is the ratio of the two stage coefficients
        return ab / a

    def estimate_ate(self):
        """
        Estimate the average treatment effect of moving the treatment from control_value to treatment_value.

        :return: A tuple of (ATE, (None, None)); confidence intervals are not yet supported.
        """
        return (self.treatment_value - self.control_value) * self.estimate_coefficient(), (None, None)
548
602
class CausalForestEstimator (Estimator ):
549
603
"""A causal random forest estimator is a non-parametric estimator which recursively partitions the covariate space
550
604
to learn a low-dimensional representation of treatment effect heterogeneity. This form of estimator is best suited
@@ -566,7 +620,7 @@ def estimate_ate(self) -> float:
566
620
"""
567
621
# Remove any NA containing rows
568
622
reduced_df = self .df .copy ()
569
- necessary_cols = list ( self .treatment ) + list (self .adjustment_set ) + list ( self .outcome )
623
+ necessary_cols = [ self .treatment ] + list (self .adjustment_set ) + [ self .outcome ]
570
624
missing_rows = reduced_df [necessary_cols ].isnull ().any (axis = 1 )
571
625
reduced_df = reduced_df [~ missing_rows ]
572
626
@@ -577,8 +631,8 @@ def estimate_ate(self) -> float:
577
631
else :
578
632
effect_modifier_df = reduced_df [list (self .adjustment_set )]
579
633
confounders_df = reduced_df [list (self .adjustment_set )]
580
- treatment_df = np .ravel (reduced_df [list ( self .treatment ) ])
581
- outcome_df = np .ravel (reduced_df [list ( self .outcome ) ])
634
+ treatment_df = np .ravel (reduced_df [[ self .treatment ] ])
635
+ outcome_df = np .ravel (reduced_df [[ self .outcome ] ])
582
636
583
637
# Fit the model to the data using a gradient boosting regressor for both the treatment and outcome model
584
638
model = CausalForestDML (
@@ -606,7 +660,7 @@ def estimate_cates(self) -> pd.DataFrame:
606
660
607
661
# Remove any NA containing rows
608
662
reduced_df = self .df .copy ()
609
- necessary_cols = list ( self .treatment ) + list (self .adjustment_set ) + list ( self .outcome )
663
+ necessary_cols = [ self .treatment ] + list (self .adjustment_set ) + [ self .outcome ]
610
664
missing_rows = reduced_df [necessary_cols ].isnull ().any (axis = 1 )
611
665
reduced_df = reduced_df [~ missing_rows ]
612
666
@@ -620,8 +674,8 @@ def estimate_cates(self) -> pd.DataFrame:
620
674
confounders_df = reduced_df [list (self .adjustment_set )]
621
675
else :
622
676
confounders_df = None
623
- treatment_df = reduced_df [list ( self .treatment ) ]
624
- outcome_df = reduced_df [list ( self .outcome ) ]
677
+ treatment_df = reduced_df [[ self .treatment ] ]
678
+ outcome_df = reduced_df [[ self .outcome ] ]
625
679
626
680
# Fit a model to the data
627
681
model = CausalForestDML (model_y = GradientBoostingRegressor (), model_t = GradientBoostingRegressor ())
0 commit comments