From 7d66513c6eeafdfb9b054e80329a943edcf82e74 Mon Sep 17 00:00:00 2001 From: marcoBmota8 Date: Fri, 14 Jun 2024 21:06:15 -0500 Subject: [PATCH] Implemented possibility to choose Horvitz-Thompson or Hajek IPW estimator --- causallib/estimation/base_weight.py | 16 +++++++++++----- causallib/estimation/ipw.py | 13 +++++++++++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/causallib/estimation/base_weight.py b/causallib/estimation/base_weight.py index 72c602ad..668253ca 100644 --- a/causallib/estimation/base_weight.py +++ b/causallib/estimation/base_weight.py @@ -103,7 +103,7 @@ def compute_weight_matrix(self, X, a, use_stabilized=None, **kwargs): raise NotImplementedError @staticmethod - def _compute_stratified_weighted_aggregate(y, sample_weight=None, + def _compute_stratified_weighted_aggregate(y, sample_weight=None, normalize=True, stratify_by=None, treatment_values=None): """ Calculates aggregation of `y` weighted by `sample_weights` stratified by `stratify_by` variable. @@ -111,13 +111,14 @@ def _compute_stratified_weighted_aggregate(y, sample_weight=None, Args: y (pd.Series): The variable to aggregate (num_subjects,). sample_weight (pd.Series|None): Individual (sample) weights calculated. - Used to achieved unbiased average outcome. - If not provided, gives equal weights to every example. + Used to achieved unbiased average outcome. + If not provided, gives equal weights to every example. + normalize (bool): Whether to normalize the weights to sum to 1 within each strata. stratify_by (pd.Series|None): Categorical variable to stratify according to (num_subjects,). Namely, aggregate within subgroups sharing the same values. If not provided, the aggregation is on the entire treatment_values (Any): Subset of values to stratify on from `stratify_by`. - If not supplied, all available stratification values are used. + If not supplied, all available stratification values are used. Returns: pd.Series[Any, float]: Series which index are treatment values, and the values are numbers - the @@ -133,11 +134,16 @@ def _compute_stratified_weighted_aggregate(y, sample_weight=None, res = {} for treatment_value in treatment_values: subgroup_mask = stratify_by == treatment_value - aggregated_value = np.average(y[subgroup_mask], weights=sample_weight[subgroup_mask]) + if normalize: + aggregated_value = np.average(y[subgroup_mask], weights=sample_weight[subgroup_mask]) + else: + aggregated_value = np.sum(y[subgroup_mask] * sample_weight[subgroup_mask])/len(y) + res[treatment_value] = aggregated_value res = pd.Series(res) return res + def evaluate_balancing(self, X, a, y, w): pass # TODO: implement: (1) table one with smd (2) gather lots of metric (ks, kl, smd) (3) plot CDF of each feature diff --git a/causallib/estimation/ipw.py b/causallib/estimation/ipw.py index fda4bcab..c634fd67 100644 --- a/causallib/estimation/ipw.py +++ b/causallib/estimation/ipw.py @@ -201,7 +201,7 @@ def compute_propensity_matrix(self, X, a=None, clip_min=None, clip_max=None): return probabilities - def estimate_population_outcome(self, X, a, y, w=None, treatment_values=None): + def estimate_population_outcome(self, X, a, y, ipw_estimator, w=None, treatment_values=None): """ Calculates weighted population outcome for each subgroup stratified by treatment assignment. @@ -209,6 +209,9 @@ def estimate_population_outcome(self, X, a, y, w=None, treatment_values=None): X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features). a (pd.Series): Treatment assignment of size (num_subjects,). y (pd.Series): Observed outcome of size (num_subjects,). + ipw_estimator (str): The type of estimator to use for the inverse probability weighting. + Horvitz-Thompson estimator is unbiased but high variance, while Hajek estimator + is biased but lower variance. w (pd.Series | None): Individual (sample) weights calculated. Used to achieved unbiased average outcome. If not provided, will be calculated on the data. treatment_values (Any): Desired treatment value/s to stratify upon. @@ -221,8 +224,14 @@ def estimate_population_outcome(self, X, a, y, w=None, treatment_values=None): """ if w is None: w = self.compute_weights(X, a) - res = self._compute_stratified_weighted_aggregate(y, sample_weight=w, stratify_by=a, + if ipw_estimator == 'Horvitz-Thompson': + res = self._compute_stratified_weighted_aggregate(y, sample_weight=w, stratify_by=a, normalize=False, treatment_values=treatment_values) + elif ipw_estimator == 'Hajek': + res = self._compute_stratified_weighted_aggregate(y, sample_weight=w, normalize=True, stratify_by=a, treatment_values=treatment_values) + else: + raise ValueError(f"Unknown ipw_estimator: {ipw_estimator}, not implemented in the current package version.") + return res @staticmethod