@@ -36,11 +36,11 @@ class Estimator(ABC):
36
36
37
37
def __init__ (
38
38
self ,
39
- treatment : tuple ,
39
+ treatment : str ,
40
40
treatment_value : float ,
41
41
control_value : float ,
42
42
adjustment_set : set ,
43
- outcome : tuple ,
43
+ outcome : str ,
44
44
df : pd .DataFrame = None ,
45
45
effect_modifiers : dict [Variable :Any ] = None ,
46
46
):
@@ -93,11 +93,11 @@ class LogisticRegressionEstimator(Estimator):
93
93
94
94
def __init__ (
95
95
self ,
96
- treatment : tuple ,
96
+ treatment : str ,
97
97
treatment_value : float ,
98
98
control_value : float ,
99
99
adjustment_set : set ,
100
- outcome : tuple ,
100
+ outcome : str ,
101
101
df : pd .DataFrame = None ,
102
102
effect_modifiers : dict [Variable :Any ] = None ,
103
103
intercept : int = 1 ,
@@ -133,20 +133,20 @@ def _run_logistic_regression(self, data) -> RegressionResultsWrapper:
133
133
"""
134
134
# 1. Reduce dataframe to contain only the necessary columns
135
135
reduced_df = data .copy ()
136
- necessary_cols = list ( self .treatment ) + list (self .adjustment_set ) + list ( self .outcome )
136
+ necessary_cols = [ self .treatment ] + list (self .adjustment_set ) + [ self .outcome ]
137
137
missing_rows = reduced_df [necessary_cols ].isnull ().any (axis = 1 )
138
138
reduced_df = reduced_df [~ missing_rows ]
139
- reduced_df = reduced_df .sort_values (list ( self .treatment ) )
139
+ reduced_df = reduced_df .sort_values ([ self .treatment ] )
140
140
logger .debug (reduced_df [necessary_cols ])
141
141
142
142
# 2. Add intercept
143
143
reduced_df ["Intercept" ] = self .intercept
144
144
145
145
# 3. Estimate the unit difference in outcome caused by unit difference in treatment
146
- cols = list ( self .treatment )
146
+ cols = [ self .treatment ]
147
147
cols += [x for x in self .adjustment_set if x not in cols ]
148
148
treatment_and_adjustments_cols = reduced_df [cols + ["Intercept" ]]
149
- outcome_col = reduced_df [list ( self .outcome ) ]
149
+ outcome_col = reduced_df [[ self .outcome ] ]
150
150
for col in treatment_and_adjustments_cols :
151
151
if str (treatment_and_adjustments_cols .dtypes [col ]) == "object" :
152
152
treatment_and_adjustments_cols = pd .get_dummies (
@@ -165,7 +165,7 @@ def estimate(self, data: pd.DataFrame) -> RegressionResultsWrapper:
165
165
self .model = model
166
166
167
167
x = pd .DataFrame ()
168
- x [self .treatment [ 0 ] ] = [self .treatment_value , self .control_value ]
168
+ x [self .treatment ] = [self .treatment_value , self .control_value ]
169
169
x ["Intercept" ] = self .intercept
170
170
for k , v in self .effect_modifiers .items ():
171
171
x [k ] = v
@@ -238,7 +238,7 @@ def estimate_ate(self, bootstrap_size=100) -> float:
238
238
ci_high = bootstraps [bootstrap_size - bound ]
239
239
240
240
logger .info (
241
- f"Changing { self .treatment [ 0 ] } from { self .control_value } to { self .treatment_value } gives an estimated "
241
+ f"Changing { self .treatment } from { self .control_value } to { self .treatment_value } gives an estimated "
242
242
f"ATE of { ci_low } < { estimate } < { ci_high } "
243
243
)
244
244
assert ci_low < estimate < ci_high , f"Expecting { ci_low } < { estimate } < { ci_high } "
@@ -268,7 +268,7 @@ def estimate_risk_ratio(self, bootstrap_size=100) -> float:
268
268
ci_high = bootstraps [bootstrap_size - bound ]
269
269
270
270
logger .info (
271
- f"Changing { self .treatment [ 0 ] } from { self .control_value } to { self .treatment_value } gives an estimated "
271
+ f"Changing { self .treatment } from { self .control_value } to { self .treatment_value } gives an estimated "
272
272
f"risk ratio of { ci_low } < { estimate } < { ci_high } "
273
273
)
274
274
assert ci_low < estimate < ci_high , f"Expecting { ci_low } < { estimate } < { ci_high } "
@@ -282,7 +282,7 @@ def estimate_unit_odds_ratio(self) -> float:
282
282
:return: The odds ratio. Confidence intervals are not yet supported.
283
283
"""
284
284
model = self ._run_logistic_regression (self .df )
285
- return np .exp (model .params [self .treatment [ 0 ] ])
285
+ return np .exp (model .params [self .treatment ])
286
286
287
287
288
288
class LinearRegressionEstimator (Estimator ):
@@ -292,11 +292,11 @@ class LinearRegressionEstimator(Estimator):
292
292
293
293
def __init__ (
294
294
self ,
295
- treatment : tuple ,
295
+ treatment : str ,
296
296
treatment_value : float ,
297
297
control_value : float ,
298
298
adjustment_set : set ,
299
- outcome : tuple ,
299
+ outcome : str ,
300
300
df : pd .DataFrame = None ,
301
301
effect_modifiers : dict [Variable :Any ] = None ,
302
302
product_terms : list [tuple [Variable , Variable ]] = None ,
@@ -382,7 +382,7 @@ def estimate_unit_ate(self) -> float:
382
382
:return: The unit average treatment effect and the 95% Wald confidence intervals.
383
383
"""
384
384
model = self ._run_linear_regression ()
385
- unit_effect = model .params [list ( self .treatment ) ].values [0 ] # Unit effect is the coefficient of the treatment
385
+ unit_effect = model .params [[ self .treatment ] ].values [0 ] # Unit effect is the coefficient of the treatment
386
386
[ci_low , ci_high ] = self ._get_confidence_intervals (model )
387
387
388
388
return unit_effect * self .treatment_value - unit_effect * self .control_value , [ci_low , ci_high ]
@@ -406,8 +406,8 @@ def estimate_ate(self) -> tuple[float, list[float, float], float]:
406
406
407
407
# It is ABSOLUTELY CRITICAL that these go last, otherwise we can't index
408
408
# the effect with "ate = t_test_results.effect[0]"
409
- individuals .loc ["control" , list ( self .treatment ) ] = self .control_value
410
- individuals .loc ["treated" , list ( self .treatment ) ] = self .treatment_value
409
+ individuals .loc ["control" , [ self .treatment ] ] = self .control_value
410
+ individuals .loc ["treated" , [ self .treatment ] ] = self .treatment_value
411
411
412
412
# Perform a t-test to compare the predicted outcome of the control and treated individual (ATE)
413
413
t_test_results = model .t_test (individuals .loc ["treated" ] - individuals .loc ["control" ])
@@ -428,7 +428,7 @@ def estimate_control_treatment(self, adjustment_config: dict = None) -> tuple[pd
428
428
self .model = model
429
429
430
430
x = pd .DataFrame ()
431
- x [self .treatment [ 0 ] ] = [self .treatment_value , self .control_value ]
431
+ x [self .treatment ] = [self .treatment_value , self .control_value ]
432
432
x ["Intercept" ] = self .intercept
433
433
for k , v in adjustment_config .items ():
434
434
x [k ] = v
@@ -484,7 +484,7 @@ def estimate_cates(self) -> tuple[float, list[float, float]]:
484
484
self .effect_modifiers
485
485
), f"Must have at least one effect modifier to compute CATE - { self .effect_modifiers } ."
486
486
x = pd .DataFrame ()
487
- x [self .treatment [ 0 ] ] = [self .treatment_value , self .control_value ]
487
+ x [self .treatment ] = [self .treatment_value , self .control_value ]
488
488
x ["Intercept" ] = self .intercept
489
489
for k , v in self .effect_modifiers .items ():
490
490
self .adjustment_set .add (k )
@@ -510,20 +510,20 @@ def _run_linear_regression(self) -> RegressionResultsWrapper:
510
510
"""
511
511
# 1. Reduce dataframe to contain only the necessary columns
512
512
reduced_df = self .df .copy ()
513
- necessary_cols = list ( self .treatment ) + list (self .adjustment_set ) + list ( self .outcome )
513
+ necessary_cols = [ self .treatment ] + list (self .adjustment_set ) + [ self .outcome ]
514
514
missing_rows = reduced_df [necessary_cols ].isnull ().any (axis = 1 )
515
515
reduced_df = reduced_df [~ missing_rows ]
516
- reduced_df = reduced_df .sort_values (list ( self .treatment ) )
516
+ reduced_df = reduced_df .sort_values ([ self .treatment ] )
517
517
logger .debug (reduced_df [necessary_cols ])
518
518
519
519
# 2. Add intercept
520
520
reduced_df ["Intercept" ] = self .intercept
521
521
522
522
# 3. Estimate the unit difference in outcome caused by unit difference in treatment
523
- cols = list ( self .treatment )
523
+ cols = [ self .treatment ]
524
524
cols += [x for x in self .adjustment_set if x not in cols ]
525
525
treatment_and_adjustments_cols = reduced_df [cols + ["Intercept" ]]
526
- outcome_col = reduced_df [list ( self .outcome ) ]
526
+ outcome_col = reduced_df [[ self .outcome ] ]
527
527
for col in treatment_and_adjustments_cols :
528
528
if str (treatment_and_adjustments_cols .dtypes [col ]) == "object" :
529
529
treatment_and_adjustments_cols = pd .get_dummies (
@@ -536,12 +536,65 @@ def _run_linear_regression(self) -> RegressionResultsWrapper:
536
536
def _get_confidence_intervals(self, model):
    """Return the 95% Wald confidence interval bounds for the treatment coefficient.

    :param model: A fitted statsmodels regression results object.
    :return: A two-element list [lower_bound, upper_bound] for the treatment's coefficient.
    """
    # conf_int gives a frame whose column 0 holds lower bounds and column 1 upper bounds,
    # indexed by parameter name.
    bounds = model.conf_int(alpha=0.05, cols=None)
    lower = bounds[0][[self.treatment]]
    upper = bounds[1][[self.treatment]]
    return [lower.values[0], upper.values[0]]
543
543
544
544
545
class InstrumentalVariableEstimator(Estimator):
    """
    Carry out estimation using instrumental variable adjustment rather than conventional adjustment. This means we do
    not need to observe all confounders in order to adjust for them. A key assumption here is linearity.
    """

    def __init__(
        self,
        treatment: str,
        treatment_value: float,
        control_value: float,
        adjustment_set: set,
        outcome: str,
        instrument: str,
        df: pd.DataFrame = None,
        intercept: int = 1,
        effect_modifiers: dict = None,  # Not used (yet?). Needed for compatibility
    ):
        super().__init__(treatment, treatment_value, control_value, adjustment_set, outcome, df, None)
        # Constant term used by the auxiliary regressions.
        self.intercept = intercept
        # Placeholder for a fitted model; kept for interface parity with the other estimators.
        self.model = None
        # Name of the instrumental variable column in df.
        self.instrument = instrument

    def add_modelling_assumptions(self):
        """
        Add modelling assumptions to the estimator. This is a list of strings which list the modelling assumptions that
        must hold if the resulting causal inference is to be considered valid.
        """
        self.modelling_assumptions += """The instrument and the treatment, and the treatment and the outcome must be
        related linearly in the form Y = aX + b."""
        self.modelling_assumptions += """The three IV conditions must hold
        (i) Instrument is associated with treatment
        (ii) Instrument does not affect outcome except through its potential effect on treatment
        (iii) Instrument and outcome do not share causes
        """

    def estimate_coefficient(self):
        """
        Estimate the linear regression coefficient of the treatment on the outcome using the Wald (ratio) estimator.

        :return: The coefficient b in Y = bX + c, obtained as (effect of I on Y) / (effect of I on X).
        """
        # Both auxiliary models include an intercept (Y = ab*I + c1, X = a*I + c2), so a constant column must be
        # added explicitly — sm.OLS fits through the origin otherwise, which would bias the estimate unless the
        # data happened to be mean-centred.
        # Estimate the total effect of instrument I on outcome Y.
        ab = (
            sm.OLS(self.df[self.outcome], sm.add_constant(self.df[[self.instrument]]))
            .fit()
            .params[self.instrument]
        )

        # Estimate the direct effect of instrument I on treatment X.
        a = (
            sm.OLS(self.df[self.treatment], sm.add_constant(self.df[[self.instrument]]))
            .fit()
            .params[self.instrument]
        )

        # The coefficient of X on Y follows by cancelling: (ab) / a = b.
        return ab / a

    def estimate_ate(self):
        """
        Estimate the average treatment effect of moving the treatment from control_value to treatment_value.

        :return: A tuple of the ATE and (None, None) — confidence intervals are not yet supported here.
        """
        return (self.treatment_value - self.control_value) * self.estimate_coefficient(), (None, None)
596
+
597
+
545
598
class CausalForestEstimator (Estimator ):
546
599
"""A causal random forest estimator is a non-parametric estimator which recursively partitions the covariate space
547
600
to learn a low-dimensional representation of treatment effect heterogeneity. This form of estimator is best suited
@@ -563,7 +616,7 @@ def estimate_ate(self) -> float:
563
616
"""
564
617
# Remove any NA containing rows
565
618
reduced_df = self .df .copy ()
566
- necessary_cols = list ( self .treatment ) + list (self .adjustment_set ) + list ( self .outcome )
619
+ necessary_cols = [ self .treatment ] + list (self .adjustment_set ) + [ self .outcome ]
567
620
missing_rows = reduced_df [necessary_cols ].isnull ().any (axis = 1 )
568
621
reduced_df = reduced_df [~ missing_rows ]
569
622
@@ -574,8 +627,8 @@ def estimate_ate(self) -> float:
574
627
else :
575
628
effect_modifier_df = reduced_df [list (self .adjustment_set )]
576
629
confounders_df = reduced_df [list (self .adjustment_set )]
577
- treatment_df = np .ravel (reduced_df [list ( self .treatment ) ])
578
- outcome_df = np .ravel (reduced_df [list ( self .outcome ) ])
630
+ treatment_df = np .ravel (reduced_df [[ self .treatment ] ])
631
+ outcome_df = np .ravel (reduced_df [[ self .outcome ] ])
579
632
580
633
# Fit the model to the data using a gradient boosting regressor for both the treatment and outcome model
581
634
model = CausalForestDML (
@@ -603,7 +656,7 @@ def estimate_cates(self) -> pd.DataFrame:
603
656
604
657
# Remove any NA containing rows
605
658
reduced_df = self .df .copy ()
606
- necessary_cols = list ( self .treatment ) + list (self .adjustment_set ) + list ( self .outcome )
659
+ necessary_cols = [ self .treatment ] + list (self .adjustment_set ) + [ self .outcome ]
607
660
missing_rows = reduced_df [necessary_cols ].isnull ().any (axis = 1 )
608
661
reduced_df = reduced_df [~ missing_rows ]
609
662
@@ -617,8 +670,8 @@ def estimate_cates(self) -> pd.DataFrame:
617
670
confounders_df = reduced_df [list (self .adjustment_set )]
618
671
else :
619
672
confounders_df = None
620
- treatment_df = reduced_df [list ( self .treatment ) ]
621
- outcome_df = reduced_df [list ( self .outcome ) ]
673
+ treatment_df = reduced_df [[ self .treatment ] ]
674
+ outcome_df = reduced_df [[ self .outcome ] ]
622
675
623
676
# Fit a model to the data
624
677
model = CausalForestDML (model_y = GradientBoostingRegressor (), model_t = GradientBoostingRegressor ())
0 commit comments