
Commit 06a0d25

adding classes for propensity models
Signed-off-by: Nathaniel <[email protected]>
1 parent 3026ab4 commit 06a0d25

File tree: 6 files changed, +2734 -0 lines changed

causalpy/data/datasets.py

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@
     "anova1": {"filename": "ancova_generated.csv"},
     "geolift1": {"filename": "geolift1.csv"},
     "risk": {"filename": "AJR2001.csv"},
+    "nhefs": {"filename": "nhefs.csv"},
 }
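
With the "nhefs" entry registered, the new dataset should load through the existing helper. A minimal sketch (the head() call is illustrative):

import causalpy as cp

nhefs = cp.load_data("nhefs")  # resolves to causalpy/data/nhefs.csv via the new entry
nhefs.head()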

causalpy/data/nhefs.csv

Lines changed: 1567 additions & 0 deletions
Large diffs are not rendered by default.

causalpy/data_validation.py

Lines changed: 8 additions & 0 deletions
@@ -133,3 +133,11 @@ def _input_validation(self):
             the assumption of a simple IV experiment.
             The coefficients should be interpreted appropriately."""
         )
+
+
+class PropensityDataValidator:
+    """Mixin class for validating the input data and model formula for propensity weighting experiments."""
+
+    def _input_validation(self):
+        """Validate the input data and model formula for correctness."""
+        pass  # no checks implemented yet in this commit
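
The validator added here is a stub: its _input_validation passes without checking anything. For orientation, a hedged sketch of the kind of check such a mixin could grow into, assuming (as the experiment class in this commit does) that the consuming class exposes self.data and self.formula; the binary-treatment check and the exception choices are illustrative, not part of the commit:

# Hypothetical sketch only, not part of this commit
class PropensityDataValidatorSketch:
    """Illustrative validation for propensity weighting experiments."""

    def _input_validation(self):
        """Validate the input data and model formula for correctness."""
        # The treatment is the left-hand side of the propensity formula, e.g. "t ~ 1 + x"
        treatment = self.formula.split("~")[0].strip()
        if treatment not in self.data.columns:
            raise KeyError(f"Treatment variable '{treatment}' not found in the data")
        # Propensity models assume a binary treatment indicator
        if not set(self.data[treatment].unique()) <= {0, 1, True, False}:
            raise ValueError(f"Treatment variable '{treatment}' must be binary")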

causalpy/pymc_experiments.py

Lines changed: 205 additions & 0 deletions
@@ -30,6 +30,7 @@
     RegressionKinkDataValidator,
     PrePostNEGDDataValidator,
     IVDataValidator,
+    PropensityDataValidator,
 )
 from causalpy.plot_utils import plot_xY
 from causalpy.utils import round_num
@@ -1477,3 +1478,207 @@ def get_naive_OLS_fit(self):
         beta_params.insert(0, ols_reg.intercept_[0])
         self.ols_beta_params = dict(zip(self._x_design_info.column_names, beta_params))
         self.ols_reg = ols_reg
+
+
+class InversePropensityWeighting(ExperimentalDesign, PropensityDataValidator):
+    """
+    A class to analyse inverse propensity weighting experiments.
+
+    :param data:
+        A pandas dataframe
+    :param formula:
+        A statistical model formula for the propensity model
+    :param outcome_variable:
+        A string denoting the outcome variable in the data to be reweighted
+    :param weighting_scheme:
+        A string denoting which weighting scheme to use among: 'raw', 'robust',
+        'overlap', 'doubly robust'
+    :param model:
+        A PyMC model
+
+    Example
+    --------
+    >>> import causalpy as cp
+    >>> df = cp.load_data("rd")
+    >>> seed = 42
+    >>> result = cp.pymc_experiments.InversePropensityWeighting(
+    ...     df,
+    ...     formula="treated ~ 1 + x",
+    ...     outcome_variable="y",
+    ...     weighting_scheme="robust",
+    ...     model=cp.pymc_models.PropensityScore(
+    ...         sample_kwargs={
+    ...             "draws": 100,
+    ...             "target_accept": 0.95,
+    ...             "random_seed": seed,
+    ...             "progressbar": False,
+    ...         },
+    ...     ),
+    ... )
+    """
+
+    def __init__(
+        self,
+        data: pd.DataFrame,
+        formula: str,
+        outcome_variable: str,
+        weighting_scheme: str,
+        model=None,
+        **kwargs,
+    ):
+        super().__init__(model=model, **kwargs)
+        self.expt_type = "Inverse Propensity Score Weighting"
+        self.data = data
+        self.formula = formula
+        self.outcome_variable = outcome_variable
+        self.weighting_scheme = weighting_scheme
+        self._input_validation()
+
+        t, X = dmatrices(formula, self.data)
+        self._t_design_info = t.design_info
+        self._x_design_info = X.design_info
+        self.labels = X.design_info.column_names
+        self.t, self.X = np.asarray(t), np.asarray(X)
+        self.y = self.data[self.outcome_variable]
+
+        COORDS = {"obs_ind": list(range(self.X.shape[0])), "coeffs": self.labels}
+        self.coords = COORDS
+        self.idata = self.model.fit(X=self.X, t=self.t, coords=COORDS)
+
+    def make_robust_adjustments(self, ps):
+        """Weight the outcomes by the stabilised inverse propensity scores:
+        p(T=1)/ps for the treated and (1 - p(T=1))/(1 - ps) for the untreated."""
+        X = pd.DataFrame(self.X, columns=self.labels)
+        X["ps"] = ps
+        X[self.outcome_variable] = self.y
+        t = self.t.flatten()
+        p_of_t = np.mean(t)
+        X["i_ps"] = np.where(t == 1, p_of_t / X["ps"], (1 - p_of_t) / (1 - X["ps"]))
+        n_ntrt = X[t == 0].shape[0]
+        n_trt = X[t == 1].shape[0]
+        outcome_trt = X[t == 1][self.outcome_variable]
+        outcome_ntrt = X[t == 0][self.outcome_variable]
+        i_propensity0 = X[t == 0]["i_ps"]
+        i_propensity1 = X[t == 1]["i_ps"]
+        weighted_outcome1 = outcome_trt * i_propensity1
+        weighted_outcome0 = outcome_ntrt * i_propensity0
+        return weighted_outcome0, weighted_outcome1, n_ntrt, n_trt
+
+    def make_raw_adjustments(self, ps):
+        """Weight the outcomes by the raw inverse propensity scores 1/ps."""
+        X = pd.DataFrame(self.X, columns=self.labels)
+        X["ps"] = ps
+        X[self.outcome_variable] = self.y
+        t = self.t.flatten()
+        X["ps"] = np.where(t, X["ps"], 1 - X["ps"])
+        X["i_ps"] = 1 / X["ps"]
+        n_ntrt = n_trt = len(X)
+        outcome_trt = X[t == 1][self.outcome_variable]
+        outcome_ntrt = X[t == 0][self.outcome_variable]
+        i_propensity0 = X[t == 0]["i_ps"]
+        i_propensity1 = X[t == 1]["i_ps"]
+        weighted_outcome1 = outcome_trt * i_propensity1
+        weighted_outcome0 = outcome_ntrt * i_propensity0
+        return weighted_outcome0, weighted_outcome1, n_ntrt, n_trt
+
+    def make_overlap_adjustments(self, ps):
+        """Weight the outcomes by the overlap weights: 1 - ps for the treated
+        and ps for the untreated."""
+        X = pd.DataFrame(self.X, columns=self.labels)
+        X["ps"] = ps
+        X[self.outcome_variable] = self.y
+        t = self.t.flatten()
+        X["i_ps"] = np.where(t, (1 - X["ps"]) * t, X["ps"] * (1 - t))
+        n_ntrt = (1 - t[t == 0]) * X[t == 0]["i_ps"]
+        n_trt = t[t == 1] * X[t == 1]["i_ps"]
+        outcome_trt = X[t == 1][self.outcome_variable]
+        outcome_ntrt = X[t == 0][self.outcome_variable]
+        i_propensity0 = X[t == 0]["i_ps"]
+        i_propensity1 = X[t == 1]["i_ps"]
+        weighted_outcome1 = t[t == 1] * outcome_trt * i_propensity1
+        weighted_outcome0 = (1 - t[t == 0]) * outcome_ntrt * i_propensity0
+        return weighted_outcome0, weighted_outcome1, n_ntrt, n_trt
+
+    def make_doubly_robust_adjustment(self, ps):
+        """Combine the propensity scores with regression models of the outcome
+        fitted separately to the treated and untreated groups."""
+        X = pd.DataFrame(self.X, columns=self.labels)
+        X["ps"] = ps
+        t = self.t.flatten()
+        m0 = sk_lin_reg().fit(X[t == 0].astype(float), self.y[t == 0])
+        m1 = sk_lin_reg().fit(X[t == 1].astype(float), self.y[t == 1])
+        m0_pred = m0.predict(X)
+        m1_pred = m1.predict(X)
+        # Compromise between the outcome and treatment assignment models
+        weighted_outcome0 = (1 - t) * (self.y - m0_pred) / (1 - X["ps"]) + m0_pred
+        weighted_outcome1 = t * (self.y - m1_pred) / X["ps"] + m1_pred
+        return weighted_outcome0, weighted_outcome1, None, None
+
+    def get_ate(self, i, idata, method="doubly_robust"):
+        """Compute the ATE implied by the i-th posterior draw of the propensity
+        scores, post-processing the sampled posterior one draw at a time."""
+        ps = idata["posterior"]["p"].stack(z=("chain", "draw"))[:, i].values
+        if method == "robust":
+            weighted_outcome_ntrt, weighted_outcome_trt, n_ntrt, n_trt = self.make_robust_adjustments(ps)
+            ntrt = weighted_outcome_ntrt.sum() / n_ntrt
+            trt = weighted_outcome_trt.sum() / n_trt
+        elif method == "raw":
+            weighted_outcome_ntrt, weighted_outcome_trt, n_ntrt, n_trt = self.make_raw_adjustments(ps)
+            ntrt = weighted_outcome_ntrt.sum() / n_ntrt
+            trt = weighted_outcome_trt.sum() / n_trt
+        elif method == "overlap":
+            weighted_outcome_ntrt, weighted_outcome_trt, n_ntrt, n_trt = self.make_overlap_adjustments(ps)
+            ntrt = np.sum(weighted_outcome_ntrt) / np.sum(n_ntrt)
+            trt = np.sum(weighted_outcome_trt) / np.sum(n_trt)
+        else:
+            weighted_outcome_ntrt, weighted_outcome_trt, n_ntrt, n_trt = self.make_doubly_robust_adjustment(ps)
+            trt = np.mean(weighted_outcome_trt)
+            ntrt = np.mean(weighted_outcome_ntrt)
+        ate = trt - ntrt
+        return [ate, trt, ntrt]
+
+    def plot_ATE(self, idata=None, method=None, prop_draws=100, ate_draws=300):
+        """Plot posterior draws of the propensity score distribution alongside
+        the implied outcome and ATE distributions."""
+        if idata is None:
+            idata = self.idata
+        if method is None:
+            method = self.weighting_scheme
+
+        def plot_weights(bins, top0, top1, ax):
+            ax.axhline(0, c="gray", linewidth=1)
+            bars0 = ax.bar(bins[:-1] + 0.025, top0, width=0.04, facecolor="red", alpha=0.3)
+            bars1 = ax.bar(bins[:-1] + 0.025, -top1, width=0.04, facecolor="blue", alpha=0.3)
+
+            for bars in (bars0, bars1):
+                for bar in bars:
+                    bar.set_edgecolor("black")
+
+        def make_hists(idata, i, axs):
+            p_i = az.extract(idata)["p"][:, i].values
+            bins = np.arange(0.025, 0.99, 0.005)
+            top0, _ = np.histogram(p_i[self.t.flatten() == 0], bins=bins)
+            top1, _ = np.histogram(p_i[self.t.flatten() == 1], bins=bins)
+            plot_weights(bins, top0, top1, axs[0])
+
+        mosaic = """AAAAAA
+                    BBBBCC"""
+        fig, axs = plt.subplot_mosaic(mosaic, figsize=(20, 13))
+        axs = [axs[k] for k in axs.keys()]
+        axs[0].axvline(0.1, linestyle="--", label="Low Extreme Propensity Scores", color="black")
+        axs[0].axvline(0.9, linestyle="--", label="High Extreme Propensity Scores", color="black")
+        axs[0].set_title("Draws from the Posterior \n Propensity Scores Distribution", fontsize=20)
+
+        for i in range(prop_draws):
+            make_hists(idata, i, axs)
+        ate_df = pd.DataFrame(
+            [self.get_ate(i, idata, method=method) for i in range(ate_draws)],
+            columns=["ATE", "Y(1)", "Y(0)"],
+        )
+        axs[1].hist(ate_df["Y(1)"], label="E(Y(1))", ec="black", bins=10, alpha=0.8, color="blue")
+        axs[1].hist(ate_df["Y(0)"], label="E(Y(0))", ec="black", bins=10, alpha=0.8, color="red")
+        axs[1].legend()
+        axs[1].set_title(f"The Outcomes \n under the {method} re-weighting scheme", fontsize=20)
+        axs[2].hist(ate_df["ATE"], label="ATE", ec="black", bins=10, color="slateblue", alpha=0.6)
+        axs[2].axvline(ate_df["ATE"].mean(), label="E(ATE)")
+        axs[2].legend()
+        axs[2].set_title("Average Treatment Effect", fontsize=20)
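
Putting the new pieces together, a minimal usage sketch of the experiment class (dataset and formula follow the class docstring above; draw counts are illustrative):

import causalpy as cp

df = cp.load_data("rd")
result = cp.pymc_experiments.InversePropensityWeighting(
    df,
    formula="treated ~ 1 + x",
    outcome_variable="y",
    weighting_scheme="robust",
    model=cp.pymc_models.PropensityScore(sample_kwargs={"random_seed": 42}),
)
# Post-process a single posterior draw of the propensity scores into an ATE estimate
ate, trt, ntrt = result.get_ate(0, result.idata, method="robust")
# Diagnostic plot: posterior propensity score histograms plus the implied ATE distribution
result.plot_ATE(method="doubly_robust", prop_draws=50, ate_draws=200)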

causalpy/pymc_models.py

Lines changed: 64 additions & 0 deletions
@@ -361,3 +361,67 @@ def fit(self, X, Z, y, t, coords, priors):
             pm.sample_posterior_predictive(self.idata, progressbar=False)
         )
         return self.idata
+
+
+class PropensityScore(ModelBuilder):
+    """
+    Custom PyMC model for inverse propensity score models
+
+    .. note:
+        Generally, the `.fit()` method should be used rather than
+        calling `.build_model()` directly.
+
+    Defines the PyMC model
+
+    .. math::
+        \\beta &\sim \mathrm{Normal}(0, 1)
+
+        \mu &= X \\beta
+
+        p &= \mathrm{logit}^{-1}(\mu)
+
+        t &\sim \mathrm{Bernoulli}(p)
+
+    Example
+    --------
+    >>> import causalpy as cp
+    >>> import numpy as np
+    >>> from causalpy.pymc_models import PropensityScore
+    >>> rd = cp.load_data("rd")
+    >>> X = rd[["x"]]
+    >>> t = np.asarray(rd["treated"]).reshape((rd.shape[0], 1))
+    >>> ps = PropensityScore(sample_kwargs={"progressbar": False})
+    >>> ps.fit(X, t, coords={
+    ...     'coeffs': ['x'],
+    ...     'obs_ind': np.arange(rd.shape[0])
+    ... },
+    ... )
+    Inference...
+    """  # noqa: W605
+
+    def build_model(self, X, t, coords):
+        "Defines the PyMC propensity model"
+        with self:
+            self.add_coords(coords)
+            X_data = pm.MutableData("X", X, dims=["obs_ind", "coeffs"])
+            t_data = pm.MutableData("t", t.flatten(), dims="obs_ind")
+            b = pm.Normal("b", mu=0, sigma=1, dims="coeffs")
+            mu = pm.math.dot(X_data, b)
+            p = pm.Deterministic("p", pm.math.invlogit(mu))
+            pm.Bernoulli("t_pred", p=p, observed=t_data, dims="obs_ind")
+
+    def fit(self, X, t, coords):
+        """Draw samples from the posterior, prior predictive, and posterior
+        predictive distributions.
+        """
+        self.build_model(X, t, coords)
+        with self:
+            self.idata = pm.sample(**self.sample_kwargs)
+            self.idata.extend(pm.sample_prior_predictive())
+            self.idata.extend(
+                pm.sample_posterior_predictive(self.idata, progressbar=False)
+            )
+        return self.idata
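
To see what the sampled propensity scores buy us, the raw inverse-probability reweighting that make_raw_adjustments implements can be written directly in NumPy. A self-contained sketch on simulated data (all names and the simulated ATE of 2 are illustrative):

import numpy as np

rng = np.random.default_rng(42)
n = 1_000
x = rng.normal(size=n)
e = 1 / (1 + np.exp(-x))            # true propensity of treatment given x
t = rng.binomial(1, e)              # confounded treatment assignment
y = 2 * t + x + rng.normal(size=n)  # outcome with a true ATE of 2

ps = e  # stand-in for one posterior draw of "p" from the PropensityScore model
# Weight each unit by the inverse probability of the treatment it actually received
ey1 = np.sum(t * y / ps) / n              # estimate of E[Y(1)]
ey0 = np.sum((1 - t) * y / (1 - ps)) / n  # estimate of E[Y(0)]
print(ey1 - ey0)  # close to the true ATE of 2, unlike the naive difference in means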
